# Standardization

This notebook loads the raw compound libraries from local SDF files and standardizes them using the ChEMBL Structure Pipeline:
https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00456-1
https://github.com/chembl/ChEMBL_Structure_Pipeline



In [11]:
import os
from zipfile import ZipFile
from io import BytesIO

import pandas as pd
import numpy as np

from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit import Chem

from rdkit import Chem, SimDivFilters

from chembl_structure_pipeline import standardizer


from IPython.display import display

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.error')


# Fetch data


In [12]:
def read_sdf(sdf_file_path):
    """Read an SDF file into a DataFrame of canonical SMILES and RDKit molecules.

    Entries that the supplier yields as ``None`` are skipped.

    Args:
        sdf_file_path: Path handed to ``Chem.SDMolSupplier``.

    Returns:
        pandas.DataFrame with the columns ``'SMILES'`` (canonical SMILES of
        each molecule) and ``'mol'`` (the molecule object itself).
    """
    records = [
        {
            'SMILES': Chem.MolToSmiles(entry, canonical=True),
            'mol': entry,
        }
        for entry in Chem.SDMolSupplier(sdf_file_path)
        if entry is not None
    ]
    # Passing the column list keeps the schema stable even when no record was read.
    return pd.DataFrame(records, columns=['SMILES', 'mol'])

In [13]:
# Load the five raw libraries; the targets and the paths are listed in the same order.
(
    chemdiv_df,
    life_chemicals_df,
    enamine_rna_df,
    enamine_protein_df,
    robin_df,
) = (
    read_sdf(raw_sdf_path)
    for raw_sdf_path in (
        '../../../data/raw_data/raw_data2/chemdiv.sdf',
        '../../../data/raw_data/raw_data2/life_chemicals.sdf',
        '../../../data/raw_data/raw_data2/enamine_rna.sdf',
        '../../../data/raw_data/raw_data2/enamine_protein.sdf',
        '../../../data/raw_data/raw_data2/robin.sdf',
    )
)


In [14]:
# Collect the five library frames so they can be handled in a single loop.
datasets = [chemdiv_df, life_chemicals_df, robin_df, enamine_rna_df, enamine_protein_df]

# for all dataset rename the column from 0 to mol
# NOTE(review): read_sdf already builds its frames with the columns
# ['SMILES', 'mol'], so there is no column named 0 and this rename looks like
# a no-op — confirm whether it is a leftover from an earlier loader.
for df in datasets:
    df.rename(columns={0: 'mol'}, inplace=True)

## Calculate 2D coordinates for the ROBIN library, because its structures are stored in 3D

In [15]:
def generate_2d_coords(mol):
    """Return a copy of ``mol`` on which 2D coordinates have been computed.

    The input molecule is left untouched; the work happens on a copy made
    with ``Chem.Mol``.

    Args:
        mol: RDKit molecule to copy.

    Returns:
        The copied molecule after ``AllChem.Compute2DCoords`` was run on it.
    """
    flattened = Chem.Mol(mol)
    AllChem.Compute2DCoords(flattened)
    return flattened

# Define the standardize_mol function
def standardize_mol(mol):
    """Run a molecule through ``standardizer.standardize_mol`` and return the result.

    Args:
        mol: Molecule object to standardize.

    Returns:
        Whatever ``standardizer.standardize_mol`` returns for ``mol``.
    """
    return standardizer.standardize_mol(mol)

In [16]:
# The ROBIN library is stored with 3D coordinates: replace every molecule
# with a copy that carries a freshly computed 2D layout.
robin_df['mol'] = robin_df['mol'].map(generate_2d_coords)

In [None]:
# Standardize all datasets
# standardize_mol is handed to apply directly; each 'mol' column is replaced
# by its standardized counterpart.
chemdiv_df['mol'] = chemdiv_df['mol'].apply(standardize_mol)
life_chemicals_df['mol'] = life_chemicals_df['mol'].apply(standardize_mol)
enamine_rna_df['mol'] = enamine_rna_df['mol'].apply(standardize_mol)
enamine_protein_df['mol'] = enamine_protein_df['mol'].apply(standardize_mol)
robin_df['mol'] = robin_df['mol'].apply(standardize_mol)

In [18]:
# Tag every row with the library it came from. These labels are the keys used
# further down to split the combined frame back into per-library frames.
chemdiv_df['source'] = 'chemdiv'
# Note: the enamine_rna library is labelled plain 'enamine'.
enamine_rna_df['source'] = 'enamine'
enamine_protein_df['source'] = 'enamine_protein'
life_chemicals_df['source'] = 'life_chemicals'
robin_df['source'] = 'robin'


In [19]:
# create SMILES column for each dataset using rdKit
# The 'SMILES' column already exists (read_sdf filled it from the raw
# structures); here it is overwritten with canonical SMILES generated from the
# current, standardized 'mol' column.
datasets = [chemdiv_df, enamine_rna_df, enamine_protein_df, life_chemicals_df, robin_df]

for dataset in datasets:
    dataset['SMILES'] = dataset['mol'].apply(Chem.MolToSmiles, canonical=True)


In [20]:
# rna_df: every library except enamine_protein_df, stacked with a fresh 0..n-1 index.
rna_df = pd.concat([chemdiv_df, enamine_rna_df, life_chemicals_df, robin_df], ignore_index=True)
# all_df: all five libraries stacked with a fresh 0..n-1 index.
all_df = pd.concat([chemdiv_df, enamine_rna_df, enamine_protein_df, life_chemicals_df, robin_df], ignore_index=True)

In [21]:
# (rows, columns) of each library frame, followed by the two concatenated frames.
chemdiv_df.shape, enamine_rna_df.shape, enamine_protein_df.shape, life_chemicals_df.shape, robin_df.shape, rna_df.shape, all_df.shape

((20000, 3),
 (15520, 3),
 (460160, 3),
 (5544, 3),
 (2003, 3),
 (43067, 3),
 (503227, 3))

In [22]:
# Size of rna_df (all libraries except enamine_protein) next to the enamine_protein library.
rna_df.shape, enamine_protein_df.shape

((43067, 3), (460160, 3))

In [None]:
import pandas as pd
from rdkit import Chem
from chembl_structure_pipeline import standardizer

def standardize_smiles(smiles):
    """Standardize a SMILES string with the ChEMBL Structure Pipeline.

    Args:
        smiles: SMILES string to parse and standardize.

    Returns:
        Canonical SMILES of the standardized molecule, or ``None`` when the
        parsed molecule is falsy (RDKit could not parse the input).
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return None
    return Chem.MolToSmiles(standardizer.standardize_mol(parsed), canonical=True)

def correct_smiles(smiles):
    """Return the '.'-separated fragment with the most atoms, discarding salts.

    The fragments are ranked by RDKit atom count, not by string length. On a
    tie the first fragment wins.

    Args:
        smiles: SMILES string, possibly made of several '.'-separated parts.

    Returns:
        ``smiles`` unchanged when it has a single part; otherwise the part
        with the highest atom count. A part that RDKit cannot parse is ranked
        below every parsable part instead of raising ``AttributeError``; if no
        part parses, the first part is returned so the caller's own validity
        check can reject it.
    """
    parts = smiles.split('.')
    if len(parts) == 1:
        return smiles

    def atom_count(fragment):
        fragment_mol = Chem.MolFromSmiles(fragment)
        # MolFromSmiles returns None for text it cannot parse; -1 keeps such
        # fragments behind even an empty (0-atom) molecule.
        return fragment_mol.GetNumAtoms() if fragment_mol is not None else -1

    return max(parts, key=atom_count)

# First clean-up pass over the combined frame: standardize every SMILES, pick
# its largest fragment, and report molecules seen as disconnected.
# Assuming all_df is your DataFrame containing molecules data
# Remove the 'mol' column
# (it is rebuilt from the SMILES strings at the end of this cell)
all_df.drop(columns=['mol'], inplace=True)

# Initialize a list to store molecules with disconnected components
disconnected_mols = []

# Process each row in all_df
for idx, row in all_df.iterrows():
    smiles = row['SMILES']
    source = row['source']

    # Standardize and correct SMILES
    standardized_smiles = standardize_smiles(smiles)
    if standardized_smiles is None:
        print(f"Invalid SMILES string at index {idx}: {smiles}")
        continue

    # correct_smiles returns either the unchanged dot-free input or one of the
    # '.'-separated parts, so corrected_smiles never contains a '.'.
    corrected_smiles = correct_smiles(standardized_smiles)
    mol_from_smiles = Chem.MolFromSmiles(corrected_smiles)

    if mol_from_smiles is None or mol_from_smiles.GetNumAtoms() == 0:
        print(f"Invalid SMILES string after correction at index {idx}: {smiles}")
        continue

    # Despite its name, num_components holds the fragment tuple returned by
    # GetMolFrags; its length is the number of fragments.
    # NOTE(review): the fragments are counted on the already-reduced molecule,
    # which is a single dot-free fragment, so this branch does not appear to be
    # reachable: nothing is recorded and all_df is never updated here. The
    # later '.'-based check in this notebook still finds salts, which is
    # consistent with that. Confirm whether the count was meant to be taken on
    # standardized_smiles instead.
    num_components = Chem.GetMolFrags(mol_from_smiles, asMols=False, sanitizeFrags=False)
    if len(num_components) > 1:
        print(f"Disconnected components in molecule at index {idx}: {smiles}, from source {source}")
        disconnected_mols.append({'Index': idx, 'SMILES': corrected_smiles, 'Source': source})
        all_df.at[idx, 'SMILES'] = corrected_smiles  # Update SMILES with corrected version

# Recompute 'mol' column from corrected SMILES
all_df['mol'] = all_df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))

# Convert the list to a DataFrame and save to JSON
disconnected_mols_df = pd.DataFrame(disconnected_mols)
disconnected_mols_df.to_json('disconnected_mols_V2.json')

# Optionally, print or return the DataFrame of disconnected molecules
disconnected_mols_df

In [13]:
# Select the rows of all_df whose SMILES contains a '.', i.e. entries that
# still consist of more than one component (salts, solvates, counter-ions).
# The pattern is a raw string: '\.' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
dissconnected_molecules = all_df[all_df['SMILES'].str.contains(r'\.')]

# save the dissconnected molecules to json
# NOTE(review): the frame still carries the 'mol' column of RDKit objects —
# confirm the JSON export serializes it the way you expect.
dissconnected_molecules.to_json('mols_with_salt_V2.json')


In [14]:
# Display the rows whose SMILES still contains a '.'.
dissconnected_molecules

Unnamed: 0,SMILES,source,mol
20005,COc1ccc(-n2c(-c3ccccc3)c[n+]3c2CCc2ccccc2-3)cc...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a0e50cf0>
20023,Br.c1ccc(N=c2scc(-c3cccs3)n2-c2ccccc2)cc1,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a0e514d0>
20025,COc1ccc(-c2n(-c3ccccn3)c3ccccc3[n+]2-c2ccccn2)...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a0e515b0>
20030,Br.COc1ccc(-c2csc(Nc3ccccc3)[n+]2CC(O)c2ccc([N...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a0e517e0>
20040,Br.Cc1ccc(N=c2scc(-c3ccc(C)cc3)n2-c2ccc(C)cc2)cc1,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a0e51c40>
...,...,...,...
500913,CC(C)O.COc1ccc(CCN2COc3ccc4c(c3C2)O/C(=C\c2ccc...,life_chemicals,<rdkit.Chem.rdchem.Mol object at 0x7f05ad3531b0>
500998,Cl.O=C(O)c1cccc(Nc2ncnc3c2oc2ccccc23)c1,life_chemicals,<rdkit.Chem.rdchem.Mol object at 0x7f05ad339770>
501000,Cl.O=C(O)c1ccc(Nc2ncnc3c2oc2ccccc23)cc1,life_chemicals,<rdkit.Chem.rdchem.Mol object at 0x7f05ad339850>
501077,CN(C)CCN(C(=O)Cc1cccc2ccccc12)c1nc2ccc(F)cc2s1.Cl,life_chemicals,<rdkit.Chem.rdchem.Mol object at 0x7f05ad33ba00>


In [None]:
from chembl_structure_pipeline import standardizer
from rdkit import Chem

def get_parent_mol(smiles):
    """Strip salts from a SMILES string and return the parent molecule's SMILES.

    Args:
        smiles: SMILES string to parse.

    Returns:
        SMILES of the first element of the tuple returned by
        ``standardizer.get_parent_mol``, or ``None`` when either the parsed
        molecule or the parent molecule is falsy.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return None

    # The pipeline hands back a 2-tuple; only the molecule is needed here.
    parent, _second = standardizer.get_parent_mol(parsed)
    if not parent:
        return None
    return Chem.MolToSmiles(parent)

# Assuming dissconnected_molecules is a DataFrame containing molecules
# Run every '.'-containing row through get_parent_mol and write the resulting
# parent SMILES back into all_df at the same index label.
removed_salts = []
for idx, row in dissconnected_molecules.iterrows():
    smiles = row['SMILES']
    source = row['source']
    parent_smiles = get_parent_mol(smiles)
    # None means the SMILES did not parse or no parent molecule came back;
    # the row is reported and left unchanged in all_df.
    if parent_smiles is None:
        print(f"Invalid SMILES string at index {idx}: {smiles}")
        continue
    removed_salts.append({'Index': idx, 'SMILES': parent_smiles, 'Source': source})
    # Assuming you want to update all_df too
    # Only the 'SMILES' column is written; the 'mol' column of all_df is not refreshed here.
    all_df.at[idx, 'SMILES'] = parent_smiles  # Update SMILES with corrected version

# Process the removed_salts list as needed
# Keep a record of every replaced entry as JSON.
removed_salts_df = pd.DataFrame(removed_salts)
removed_salts_df.to_json('removed_salts_df_V2.json')



In [16]:
# Rows whose SMILES still contains a '.' after the parent-molecule step.
# .copy() keeps later edits of salt_check from touching all_df.
# Raw string: '\.' in a normal literal is an invalid escape sequence.
salt_check = all_df[all_df['SMILES'].str.contains(r'\.')].copy()
salt_check


Unnamed: 0,SMILES,source,mol
20025,COc1ccc(-c2n(-c3ccccn3)c3ccccc3[n+]2-c2ccccn2)...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a0e515b0>
20440,[O-][Cl+3]([O-])([O-])[O-].c1ccc(-c2[se]c3nccc...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a0eb0cf0>
20867,OC1CSc2n(-c3ccccc3)c(-c3ccccc3)c(-c3ccccc3)[n+...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a0e24970>
24830,[O-][Cl+3]([O-])([O-])[O-].c1ccc(-c2ccc(-c3n(-...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05ae071e70>
28659,COc1ccc(-c2cc(=Nc3cccc(C)c3C)c3cc(C)ccc3o2)cc1...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a3667840>
28794,CN(C)c1ccc(/C=C/c2cc(-c3ccccc3)c3ccccc3[o+]2)c...,enamine,<rdkit.Chem.rdchem.Mol object at 0x7f05a35fb3e0>


In [17]:
# from salt_check, remove the [O-][Cl+3]([O-])([O-])[O-] from the SMILES
# The patterns are literal SMILES text, so regex=False is passed explicitly.
# On pandas versions where str.replace defaults to regex=True, the brackets
# would be read as character classes and '.' as "any character": the
# counter-ion would not be removed and unrelated text could be.
salt_check['SMILES'] = salt_check['SMILES'].str.replace(r'[O-][Cl+3]([O-])([O-])[O-].', '', regex=False)
salt_check['SMILES'] = salt_check['SMILES'].str.replace(r'.[O-][Cl+3]([O-])([O-])[O-]', '', regex=False)
salt_check['SMILES'] = salt_check['SMILES'].str.replace(r'.[O-][Cl+3]([O-])([O-])O', '', regex=False)


In [18]:
# Print each remaining SMILES on its own line, in full.
for x in salt_check['SMILES']:
    print(x)

COc1ccc(-c2n(-c3ccccn3)c3ccccc3[n+]2-c2ccccn2)cc1
c1ccc(-c2[se]c3nccc[n+]3c2-c2ccccc2)cc1
OC1CSc2n(-c3ccccc3)c(-c3ccccc3)c(-c3ccccc3)[n+]2C1
c1ccc(-c2ccc(-c3n(-c4ccccn4)c4ccccc4[n+]3-c3ccccn3)cc2)cc1
COc1ccc(-c2cc(=Nc3cccc(C)c3C)c3cc(C)ccc3o2)cc1
CN(C)c1ccc(/C=C/c2cc(-c3ccccc3)c3ccccc3[o+]2)cc1


In [19]:
# Remove the perchlorate counter-ion from every SMILES in all_df.
# The patterns are literal SMILES text, so regex=False is passed explicitly.
# On pandas versions where str.replace defaults to regex=True, the brackets
# would be read as character classes and '.' as "any character": the
# counter-ion would not be removed and unrelated text could be.
all_df['SMILES']  = all_df['SMILES'].str.replace(r'[O-][Cl+3]([O-])([O-])[O-].', '', regex=False)
all_df['SMILES']  = all_df['SMILES'].str.replace(r'.[O-][Cl+3]([O-])([O-])[O-]', '', regex=False)
all_df['SMILES']  = all_df['SMILES'].str.replace(r'.[O-][Cl+3]([O-])([O-])O', '', regex=False)

In [20]:
# Sanity check: list any SMILES that still contains a '.' (expected: none).
# Raw string: '\.' in a normal literal is an invalid escape sequence.
all_df[all_df['SMILES'].str.contains(r'\.')]

Unnamed: 0,SMILES,source,mol


In [None]:
# Final safety-net pass: re-standardize every SMILES and, for any entry that
# still consists of several components, keep only its largest fragment.

# Initialize a list to store molecules with disconnected components
disconnected_mols = []

# Process each row in all_df
for idx, row in all_df.iterrows():
    smiles = row['SMILES']
    source = row['source']

    # Standardize and correct SMILES
    standardized_smiles = standardize_smiles(smiles)
    if standardized_smiles is None:
        print(f"Invalid SMILES string at index {idx}: {smiles}")
        continue

    corrected_smiles = correct_smiles(standardized_smiles)
    mol_from_smiles = Chem.MolFromSmiles(corrected_smiles)

    if mol_from_smiles is None or mol_from_smiles.GetNumAtoms() == 0:
        print(f"Invalid SMILES string after correction at index {idx}: {smiles}")
        continue

    # Count the components on the standardized SMILES, i.e. BEFORE the
    # reduction to the largest fragment. Counting on corrected_smiles (as was
    # done previously) always yields one component, because correct_smiles
    # returns a single dot-free fragment, so the branch below could never run.
    num_components = standardized_smiles.count('.') + 1
    if num_components > 1:
        print(f"Disconnected components in molecule at index {idx}: {smiles}, from source {source}")
        disconnected_mols.append({'Index': idx, 'SMILES': corrected_smiles, 'Source': source})
        all_df.at[idx, 'SMILES'] = corrected_smiles  # Update SMILES with corrected version

# Recompute 'mol' column from corrected SMILES
all_df['mol'] = all_df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))

# Convert the list to a DataFrame and save to JSON
disconnected_mols_2_df = pd.DataFrame(disconnected_mols)
disconnected_mols_2_df.to_json('disconnected_mols_2_df_V2.json')

# Optionally, print or return the DataFrame of disconnected molecules
disconnected_mols_2_df

In [22]:
# Throw away the current molecule objects ...
del all_df['mol']

# ... re-parse them from the final SMILES strings ...
all_df['mol'] = all_df['SMILES'].apply(Chem.MolFromSmiles)

# ... and retire the SMILES column.
del all_df['SMILES']



In [23]:
# compute SMILES from mol
# Canonical SMILES are regenerated from the re-parsed molecules, so the
# 'SMILES' column is derived from the 'mol' column stored next to it.
all_df['SMILES'] = all_df['mol'].apply(Chem.MolToSmiles, canonical=True)

In [24]:
# Final check: list any regenerated SMILES that contains a '.' (expected: none).
# Raw string: '\.' in a normal literal is an invalid escape sequence.
all_df[all_df['SMILES'].str.contains(r'\.')]

Unnamed: 0,source,mol,SMILES


In [25]:
# Split all_df back into one frame per library, keyed on the 'source' label.
chemdiv_df = all_df.loc[all_df['source'] == 'chemdiv']
enamine_rna_df = all_df.loc[all_df['source'] == 'enamine']
enamine_protein_df = all_df.loc[all_df['source'] == 'enamine_protein']
life_chemicals_df = all_df.loc[all_df['source'] == 'life_chemicals']
robin_df = all_df.loc[all_df['source'] == 'robin']

# Report how many molecules each library holds, one line per library.
print(
    f'Chemdiv: {len(chemdiv_df)}',
    f'Enamine: {len(enamine_rna_df)}',
    f'Enamine Protein: {len(enamine_protein_df)}',
    f'Life Chemicals: {len(life_chemicals_df)}',
    f'Robin: {len(robin_df)}',
    sep='\n',
)


Chemdiv: 20000
Enamine: 15520
Enamine Protein: 460160
Life Chemicals: 5544
Robin: 2003


In [26]:
# create a folder for standardized data
# exist_ok=True makes the call safe to re-run and removes the gap between
# checking for the folder and creating it.
os.makedirs('../data/standardized_data', exist_ok=True)

In [27]:
# save standardized data as pickle files
# One pickle per library; each frame is the matching 'source' slice of all_df.
# NOTE(review): the raw SDFs are read from '../../../data/...' while these
# outputs go under '../data/...' — confirm the two roots are meant to differ.
chemdiv_df.to_pickle('../data/standardized_data/chemdiv_df.pkl')
life_chemicals_df.to_pickle('../data/standardized_data/life_chemicals_df.pkl')
enamine_rna_df.to_pickle('../data/standardized_data/enamine_rna_df.pkl')
enamine_protein_df.to_pickle('../data/standardized_data/enamine_protein_df.pkl')
robin_df.to_pickle('../data/standardized_data/robin_df.pkl')


In [28]:
# create folder for sdf files in standardized_data folder
# exist_ok=True makes the call safe to re-run and removes the gap between
# checking for the folder and creating it; missing parents are created too.
os.makedirs('../data/standardized_data/sdf_files', exist_ok=True)

In [29]:
# Export each standardized library as an SDF file, passing every column of
# the frame as a property.
PandasTools.WriteSDF(
    chemdiv_df,
    '../data/standardized_data/sdf_files/chemdiv_df.sdf',
    molColName='mol',
    properties=chemdiv_df.columns.tolist(),
)
PandasTools.WriteSDF(
    life_chemicals_df,
    '../data/standardized_data/sdf_files/life_chemicals_df.sdf',
    molColName='mol',
    properties=life_chemicals_df.columns.tolist(),
)
PandasTools.WriteSDF(
    enamine_rna_df,
    '../data/standardized_data/sdf_files/enamine_rna_df.sdf',
    molColName='mol',
    properties=enamine_rna_df.columns.tolist(),
)
PandasTools.WriteSDF(
    enamine_protein_df,
    '../data/standardized_data/sdf_files/enamine_protein_df.sdf',
    molColName='mol',
    properties=enamine_protein_df.columns.tolist(),
)
PandasTools.WriteSDF(
    robin_df,
    '../data/standardized_data/sdf_files/robin_df.sdf',
    molColName='mol',
    properties=robin_df.columns.tolist(),
)
