# Standardization + Data downloading

This notebook downloads data and standardize them using ChEMBL Structure Pipeline:
https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00456-1
https://github.com/chembl/ChEMBL_Structure_Pipeline



In [1]:
# Import necessary libraries
import os
from zipfile import ZipFile
from io import BytesIO
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit import Chem
from chembl_structure_pipeline import standardizer
from rdkit.Chem import Descriptors
from IPython.display import display
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.error')


[19:32:07] Initializing Normalizer


In [2]:
# Define functions
def process_mol(mol):
    """Converts 3D coordinates to 2D and standardizes molecule."""
    mol = Chem.Mol(mol)  
    AllChem.Compute2DCoords(mol)
    mol = standardizer.standardize_mol(mol)
    return mol

def process_smiles(smiles):
    """Converts SMILES to 2D molecule and standardizes."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        mol = process_mol(mol)
    return mol

In [3]:
# Load SDF files and convert to DataFrames
rna_binders_sdf = Chem.SDMolSupplier('data_dataset2/ROBIN_RNA_Binders_3D.sdf')
rna_binders_df = pd.DataFrame([{'smiles': Chem.MolToSmiles(mol)} for mol in rna_binders_sdf if mol is not None])

rna_non_binders_sdf = Chem.SDMolSupplier('data_dataset2/SMM_RNA_Non_Binder_3D.sdf')
rna_non_binders_df = pd.DataFrame([{'smiles': Chem.MolToSmiles(mol)} for mol in rna_non_binders_sdf if mol is not None])

# %%
# Load CSV files
protein_binders_df = pd.read_csv('data_dataset2/drugs_protbin_2952_orig.csv')
protein_non_binders_df = pd.read_csv('data_dataset2/zinc_dark_matter.csv')

In [4]:
protein_binders_df.shape

(2952, 38)

In [4]:
#from protein_binders_df remove rows with smiles shorter than 20 characters to remove the metallic compounds
protein_binders_df = protein_binders_df[protein_binders_df['smiles'].str.len() > 20]
protein_binders_df

Unnamed: 0,pdid,name,smiles,inchi,inchikey,probe,experimental probe,calculated probe,available,approved drug,...,hbd,rb,rc,arc,logp,tpsa,fcsp3,ncc,lrs,qed
0,PD000042,ETOFYLLINE,Cn1c(=O)c2c(ncn2CCO)n(C)c1=O,InChI=1S/C9H12N4O3/c1-11-7-6(8(15)12(2)9(11)16...,NWPRCRWQMGIBOT-UHFFFAOYSA-N,,,,1.0,1,...,1.0,2.0,2.0,2.0,-1.57,82.05,0.44,0.0,6.0,0.67
2,PD000047,PODOFILOX,COc1cc([C@@H]2c3cc4c(cc3[C@H](O)[C@H]3COC(=O)[...,InChI=1S/C22H22O8/c1-25-16-4-10(5-17(26-2)21(1...,YJGVMLPVUAXIQN-XVVDYKMHSA-N,1.0,,1.0,1.0,1,...,1.0,4.0,5.0,2.0,2.41,92.68,0.41,4.0,6.0,0.76
3,PD000048,METOLAZONE,Cc1ccccc1N1C(=O)c2cc(S(N)(=O)=O)c(Cl)cc2NC1C,InChI=1S/C16H16ClN3O3S/c1-9-5-3-4-6-14(9)20-10...,AQCHWTWZEMGIFD-UHFFFAOYSA-N,,,,1.0,1,...,2.0,2.0,3.0,2.0,2.71,92.50,0.19,1.0,6.0,0.86
4,PD000049,METYROSINE,C[C@](N)(Cc1ccc(O)cc1)C(=O)O,"InChI=1S/C10H13NO3/c1-10(11,9(13)14)6-7-2-4-8(...",NHTGHBARYWONDQ-JTQLQIEISA-N,,,,1.0,1,...,3.0,3.0,1.0,1.0,0.74,83.55,0.30,1.0,6.0,0.66
5,PD000054,PINACIDIL,CC(N/C(=N/C#N)Nc1ccncc1)C(C)(C)C,"InChI=1S/C13H19N5/c1-10(13(2,3)4)17-12(16-9-14...",IVVNZDGDKPTYHK-UHFFFAOYSA-N,,,,1.0,1,...,2.0,2.0,1.0,1.0,2.35,73.10,0.46,1.0,6.0,0.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2947,PD173641,eltrombopag,Cc1ccc(-n2[nH]c(C)c(N=Nc3cccc(-c4cccc(C(=O)O)c...,InChI=1S/C25H22N4O4/c1-14-10-11-19(12-15(14)2)...,SVOQIEJWJCQGDQ-CYYJNZCTSA-N,,,,1.0,1,...,3.0,5.0,4.0,4.0,5.58,120.04,0.12,0.0,6.0,0.35
2948,PD173677,Intedanib,COC(=O)c1ccc2c(C(=Nc3ccc(N(C)C(=O)CN4CCN(C)CC4...,InChI=1S/C31H33N5O4/c1-34-15-17-36(18-16-34)20...,CPMDPSXJELVGJG-UUDCSCGESA-N,,,,1.0,1,...,2.0,7.0,5.0,4.0,4.04,101.47,0.26,0.0,6.0,0.27
2949,PD173704,Polyestradiol Phosphate,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,InChI=1S/C18H25O5P/c1-18-9-8-14-13-5-3-12(19)1...,BBWXLCKRYRQQPL-ZBRFXRBCSA-N,,,,,1,...,3.0,2.0,4.0,1.0,3.73,86.99,0.67,5.0,6.0,0.71
2950,PD173712,SKI-758,COc1cc(N=c2c(C#N)c[nH]c3cc(-c4coc(CN5CCN(C)CC5...,InChI=1S/C28H27Cl2N5O3/c1-34-4-6-35(7-5-34)15-...,ANUHLKPVOXDYSK-UHFFFAOYSA-N,,,,,1,...,1.0,6.0,5.0,4.0,5.60,90.02,0.29,0.0,6.0,0.33


In [5]:
rna_binders_df.columns, rna_non_binders_df.columns, protein_binders_df.columns, protein_non_binders_df.columns

(Index(['smiles'], dtype='object'),
 Index(['smiles'], dtype='object'),
 Index(['pdid', 'name', 'smiles', 'inchi', 'inchikey', 'probe',
        'experimental probe', 'calculated probe', 'available', 'approved drug',
        'P&D approved', 'PROTAC', 'covalent binder', 'biased GPCR ligand',
        'inorganic', 'structural alert', 'PAINS Family A', 'PAINS Family B',
        'PAINS Family C', 'Aggregator', 'Obsolete', 'Nuisance', 'no. targets',
        'Drug Status', 'cas', 'synonyms', 'mw', 'hba', 'hbd', 'rb', 'rc', 'arc',
        'logp', 'tpsa', 'fcsp3', 'ncc', 'lrs', 'qed'],
       dtype='object'),
 Index(['0', '1', '2', '3', '4', '5'], dtype='object'))

In [6]:
print(f'RNA binders from ROBIN:                {rna_binders_df.shape[0]}',
      f'RNA non-binders from ROBIN:            {rna_non_binders_df.shape[0]}',
      f'Protein binders Drugs % Probes:        {protein_binders_df.shape[0]}',
      f'Zinc dark matter(Protein Non-Binders): {protein_non_binders_df.shape[0]}',
      sep='\n')

RNA binders from ROBIN:                2003
RNA non-binders from ROBIN:            22489
Protein binders Drugs % Probes:        2725
Zinc dark matter(Protein Non-Binders): 25000


In [7]:
# createa a library with statistics before processing
statistics_before = {
    'rna_binders': rna_binders_df.shape[0],
    'rna_non_binders': rna_non_binders_df.shape[0],
    'protein_binders': protein_binders_df.shape[0],
    'protein_non_binders': protein_non_binders_df.shape[0]
}

In [8]:
# Rename columns and keep only SMILES column
rna_binders_df = rna_binders_df.rename(columns={'SMILES': 'smiles'})[['smiles']]
rna_non_binders_df = rna_non_binders_df.rename(columns={'SMILES': 'smiles'})[['smiles']]
protein_binders_df = protein_binders_df.rename(columns={'0': 'smiles'})[['smiles']]
protein_non_binders_df = protein_non_binders_df.rename(columns={'0': 'smiles'})[['smiles']]

In [9]:
# Convert SMILES to 2D molecules and standardize
datasets = [rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df]

for dataset in datasets:
    dataset['mol'] = dataset['smiles'].apply(process_smiles)
    dataset.drop(columns=['smiles'], inplace=True)

print('Normalization complete')

[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Removed positive charge.
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Removed positive charge.
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Running Uncharger
[13:27:40] Running Normalizer
[13:27:40] Ru

Normalization complete


[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharger
[13:29:32] Running Normalizer
[13:29:32] Running Uncharg

In [10]:
# Assign a source label to each dataset
rna_binders_df['source'] = 'robin_b'
rna_non_binders_df['source'] = 'robin_n'
protein_binders_df['source'] = 'probes_drugs'
protein_non_binders_df['source'] = 'zinc_dark_m'


In [11]:
# Generate a cannonical SMILES column for each dataset
for dataset in datasets:
    dataset['smiles'] = dataset['mol'].apply(lambda mol: Chem.MolToSmiles(mol, canonical=True))


In [12]:
# Concatenate all provided datasets into one DataFrame
all_df = pd.concat([rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df], ignore_index=True)


In [13]:
dissconnected_molecules = all_df[all_df['smiles'].str.contains('\.')]
dissconnected_molecules['source'].value_counts()

source
probes_drugs    36
robin_n          3
Name: count, dtype: int64

In [14]:
all_df.shape

(52217, 3)

In [15]:
# remove the disconnected molecules from all_df
all_df = all_df[~all_df['smiles'].str.contains('\.')]
all_df.shape

(52178, 3)

In [16]:
# create a folder for standardized data
if not os.path.exists('data_dataset2/standardized_data'):
    os.makedirs('data_dataset2/standardized_data')

In [17]:
# sort the dataframe by source to datasets
rna_binders_df = all_df[all_df['source'] == 'robin_b']
rna_non_binders_df = all_df[all_df['source'] == 'robin_n']
protein_binders_df = all_df[all_df['source'] == 'probes_drugs']
protein_non_binders_df = all_df[all_df['source'] == 'zinc_dark_m']

In [18]:
# save standardized data as pickle files
rna_binders_df.to_pickle('data_dataset2/standardized_data/rna_binders.pkl')
rna_non_binders_df.to_pickle('data_dataset2/standardized_data/rna_non_binders.pkl')
protein_binders_df.to_pickle('data_dataset2/standardized_data/protein_binders.pkl')
protein_non_binders_df.to_pickle('data_dataset2/standardized_data/protein_non_binders.pkl')


In [19]:
# in each dataset show nuber of duplicates
print(f'RNA Binders (ROBIN): {rna_binders_df.duplicated("smiles").sum()}')
print(f'RNA Non-Binders (ROBIN): {rna_non_binders_df.duplicated("smiles").sum()}')
print(f'Protein Binders (PD): {protein_binders_df.duplicated("smiles").sum()}')
print(f'Protein Non-Binders (Dark Matter ZINC): {protein_non_binders_df.duplicated("smiles").sum()}')


RNA Binders (ROBIN): 0
RNA Non-Binders (ROBIN): 78
Protein Binders (PD): 0
Protein Non-Binders (Dark Matter ZINC): 0


In [20]:
all_df = pd.concat([rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df], ignore_index=True)

In [21]:
duplicates = all_df[all_df.duplicated('smiles')]
duplicates['source'].value_counts()

source
probes_drugs    410
robin_n          93
zinc_dark_m      55
Name: count, dtype: int64

In [22]:
# remove duplicates
all_df = all_df.drop_duplicates('smiles', keep=False)

In [23]:
# check for dissconnected molecules and duplicates
print('Dissconnected molecules:', all_df[all_df['smiles'].str.contains('\.')])
print(f'Duplicates: {all_df[all_df.duplicated("smiles")].shape[0]}')

Dissconnected molecules: Empty DataFrame
Columns: [mol, source, smiles]
Index: []
Duplicates: 0


In [24]:
# sort the dataframe by source to datasets
rna_binders_df = all_df[all_df['source'] == 'robin_b']
rna_non_binders_df = all_df[all_df['source'] == 'robin_n']
protein_binders_df = all_df[all_df['source'] == 'probes_drugs']
protein_non_binders_df = all_df[all_df['source'] == 'zinc_dark_m']


In [25]:
# Statistics of the number of molecules in each dataset
print(f'RNA Binders (ROBIN): {len(rna_binders_df)}')
print(f'RNA Non-Binders (ROBIN): {len(rna_non_binders_df)}')
print(f'Protein Binders (PD): {len(protein_binders_df)}')
print(f'Protein Non-Binders (Dark Matter ZINC): {len(protein_non_binders_df)}')

RNA Binders (ROBIN): 1961
RNA Non-Binders (ROBIN): 21883
Protein Binders (PD): 2276
Protein Non-Binders (Dark Matter ZINC): 24945


In [26]:
# in each dataset show nuber of duplicates
print(f'RNA Binders (ROBIN): {rna_binders_df.duplicated("smiles").sum()}')
print(f'RNA Non-Binders (ROBIN): {rna_non_binders_df.duplicated("smiles").sum()}')
print(f'Protein Binders (PD): {protein_binders_df.duplicated("smiles").sum()}')
print(f'Protein Non-Binders (Dark Matter ZINC): {protein_non_binders_df.duplicated("smiles").sum()}')


RNA Binders (ROBIN): 0
RNA Non-Binders (ROBIN): 0
Protein Binders (PD): 0
Protein Non-Binders (Dark Matter ZINC): 0


In [27]:

    
# create a folder for deduplicated data
if not os.path.exists('data_dataset2/standardized_data/deduplicated_data'):
    os.makedirs('data_dataset2/standardized_data/deduplicated_data')

In [28]:
# save standardized data as pickle files
rna_binders_df.to_pickle('data_dataset2/standardized_data/deduplicated_data/rna_binders_dedup.pkl')
rna_non_binders_df.to_pickle('data_dataset2/standardized_data/deduplicated_data/rna_non_binders_dedup.pkl')
protein_binders_df.to_pickle('data_dataset2/standardized_data/deduplicated_data/protein_binders_dedup.pkl')
protein_non_binders_df.to_pickle('data_dataset2/standardized_data/deduplicated_data/protein_non_binders_dedup.pkl')


In [29]:
# create statistics after processing and show difference in table 
statistics_after = {
    'rna_binders': len(rna_binders_df),
    'rna_non_binders': len(rna_non_binders_df),
    'protein_binders': len(protein_binders_df),
    'protein_non_binders': len(protein_non_binders_df)
}

# make a table from the statistics
statistics_df = pd.DataFrame([statistics_before, statistics_after]).T
statistics_df.columns = ['Before', 'After']
statistics_df['Difference'] = statistics_df['Before'] - statistics_df['After']
statistics_df

Unnamed: 0,Before,After,Difference
rna_binders,2003,1961,42
rna_non_binders,22489,21883,606
protein_binders,2725,2276,449
protein_non_binders,25000,24945,55


In [30]:
# # create folder for sdf files in standardized_data folder
# if not os.path.exists('data_dataset2/standardized_data/sdf_files'):
#     os.makedirs('data_dataset2/standardized_data/sdf_files')

In [31]:
# # save standardized data as sdf files using rdkit
# Chem.PandasTools.WriteSDF(rna_binders_df, 'data_dataset2/standardized_data/sdf_files/rna_binders.sdf', molColName='mol')
# Chem.PandasTools.WriteSDF(rna_non_binders_df, 'data_dataset2/standardized_data/sdf_files/rna_non_binders.sdf', molColName='mol')
# Chem.PandasTools.WriteSDF(protein_binders_df, 'data_dataset2/standardized_data/sdf_files/protein_binders.sdf', molColName='mol')
# Chem.PandasTools.WriteSDF(protein_non_binders_df, 'data_dataset2/standardized_data/sdf_files/protein_non_binders.sdf', molColName='mol')
