In [1]:
from functions.molecule_processor import MoleculeProcessor
from functions.diversity_picker import DiversityPicker
import pandas as pd


[14:10:27] Initializing Normalizer


Paths for raw data


In [25]:
robin_path = ['data_mvi/raw_data/robin.sdf']
life_chemicals_path = ['data_mvi/raw_data/life_chemicals.sdf']
chemdiv_path = ['data_mvi/raw_data/chemdiv.sdf']
enamine_rna_path = ['data_mvi/raw_data/enamine_rna.sdf']
enamine_protein_path = ['data_mvi/raw_data/enamine_protein.sdf']


In [28]:
from rdkit import Chem

def count_molecules(file_paths):
    molecule_counts = {}
    for path in file_paths:
        supplier = Chem.SDMolSupplier(path)
        count = sum(1 for _ in supplier if _ is not None)
        molecule_counts[path] = count
    return molecule_counts

# Combine all paths into a single list
all_paths = (robin_path + life_chemicals_path + chemdiv_path +
             enamine_rna_path + enamine_protein_path)

# Get molecule counts
molecule_counts = count_molecules(all_paths)

# Print the results
for path, count in molecule_counts.items():
    print(f"{path}: {count} molecules")



data_mvi/raw_data/robin.sdf: 2003 molecules
data_mvi/raw_data/life_chemicals.sdf: 5544 molecules
data_mvi/raw_data/chemdiv.sdf: 20000 molecules
data_mvi/raw_data/enamine_rna.sdf: 15520 molecules
data_mvi/raw_data/enamine_protein.sdf: 460160 molecules


In [32]:
# Print the results
for path, count in molecule_counts.items():
    print(f"{path}: {count} molecules")

print('')
print(f'All molecules: {sum(molecule_counts.values())}')

data_mvi/raw_data/robin.sdf: 2003 molecules
data_mvi/raw_data/life_chemicals.sdf: 5544 molecules
data_mvi/raw_data/chemdiv.sdf: 20000 molecules
data_mvi/raw_data/enamine_rna.sdf: 15520 molecules
data_mvi/raw_data/enamine_protein.sdf: 460160 molecules

All molecules: 503227


Normalize the data

In [3]:
# processor_1 = MoleculeProcessor(enamine_protein_path, output_format='sdf', 
#                               deduplicated_filename='data_mvi/normalized_data/enamine_protein_standardized', 
#                               duplicates_filename='data_mvi/duplicates/enamine_prot_duplicates')
# processor_1.run()


In [4]:
# processor_2 = MoleculeProcessor(robin_path, output_format='sdf',
#                                 deduplicated_filename='data_mvi/normalized_data/robin_standardized', 
#                                 duplicates_filename='data_mvi/duplicates/robin_duplicates',)
# processor_2.run()


In [5]:
# processor_3 = MoleculeProcessor(life_chemicals_path, output_format='sdf',
#                                 deduplicated_filename='data_mvi/normalized_data/life_chemicals_standardized', 
#                                 duplicates_filename='data_mvi/duplicates/life_chemicals_duplicates',)
# processor_3.run()

In [6]:
# processor_4 = MoleculeProcessor(chemdiv_path, output_format='sdf',
#                                 deduplicated_filename='data_mvi/normalized_data/chemdiv_standardized', 
#                                 duplicates_filename='data_mvi/duplicates/chemdiv_duplicates',)
# processor_4.run()

In [7]:
# processor_5 = MoleculeProcessor(enamine_rna_path, output_format='sdf',
#                                 deduplicated_filename='data_mvi/normalized_data/enamine_rna_standardized', 
#                                 duplicates_filename='data_mvi/duplicates/enamine_rna_duplicates',)
# processor_5.run()

Diversity picking from normalized protein

In [8]:
# import pandas as pd
# from rdkit import Chem
# from rdkit.Chem import AllChem
# from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
# from rdkit import DataStructs

# def load_sdf_to_dataframe(sdf_path):
#     suppl = Chem.SDMolSupplier(sdf_path)
#     rows = []
#     for mol in suppl:
#         if mol is not None:
#             smiles = Chem.MolToSmiles(mol)
#             # Assuming 'rna' is a molecule property stored in the SDF
#             rna = mol.GetProp('rna') if mol.HasProp('rna') else 'NA'
#             rows.append({'mol': mol, 'SMILES': smiles, 'rna': rna})
#     return pd.DataFrame(rows)


# def add_fingerprints_to_dataframe(df, radius=3, nBits=2048):
#     # Compute fingerprints and add them as a new column
#     df['fingerprints'] = df['mol'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, radius, nBits))


# def diversity_picking(df, colname, num_to_pick):
#     fps = list(df[colname])
#     mmp = MaxMinPicker()
    
#     # Generate a distance matrix function based on Tanimoto similarity
#     def distij(i, j, fps=fps):
#         return 1 - DataStructs.TanimotoSimilarity(fps[i], fps[j])
    
#     # Now use the distance function and the fps length for picking
#     picks = mmp.LazyPick(distij, len(fps), num_to_pick, [] )  # No seed, starting with an empty list of picked indices
    
#     picked_df = pd.DataFrame({
#         'mol': df['mol'].iloc[picks],
#         'smiles': df['SMILES'].iloc[picks],
#         colname: df[colname].iloc[picks],
#         'rna': df['rna'].iloc[picks]
#     })

#     return picked_df


# # Path to your SDF file
# sdf_path = 'data_mvi/normalized_data/enamine_protein_standardized.sdf'
# df = load_sdf_to_dataframe(sdf_path)

# add_fingerprints_to_dataframe(df)



# # Perform diversity picking
# picked_df = diversity_picking(df, 'fingerprints', 50000)


# # save picked_df as SDF
# w = Chem.SDWriter('data_mvi/normalized_data/enamine_protein_standardized_diversity_picked.sdf')
# for i, row in picked_df.iterrows():
#     w.write(row['mol'])
# w.close()

In [9]:
# # Usage
# sdf_path =  'data_mvi/normalized_data/enamine_protein_standardized.sdf'
# num_to_pick = 50000  # Adjust as needed
# output_sdf_path = 'data_mvi/normalized_data/enamine_protein_standardized_diversity_picked.sdf'

# picker = DiversityPicker(sdf_path, num_to_pick)
# picker.run()
# picker.save_to_sdf(output_sdf_path)

Deduplicate

In [10]:
robin_sdf_path = 'data_mvi/normalized_data/robin_standardized.sdf'
life_chemicals_sdf_pth = 'data_mvi/normalized_data/life_chemicals_standardized.sdf'
chemdiv_sdf_pth = 'data_mvi/normalized_data/chemdiv_standardized.sdf'
enamine_rna_sdf_pth = 'data_mvi/normalized_data/enamine_rna_standardized.sdf'

protein_sdf_pth = 'data_mvi/normalized_data/enamine_protein_standardized_diversity_picked.sdf'


In [33]:
from rdkit import Chem

def count_molecules(file_paths):
    molecule_counts = {}
    for path in file_paths:
        supplier = Chem.SDMolSupplier(path)
        count = sum(1 for _ in supplier if _ is not None)
        molecule_counts[path] = count
    return molecule_counts

# Combine all new paths into a single list
new_paths = [robin_sdf_path, life_chemicals_sdf_pth, chemdiv_sdf_pth,
             enamine_rna_sdf_pth, protein_sdf_pth]

# Get molecule counts for new paths
new_molecule_counts = count_molecules(new_paths)

# Print the results for new paths
for path, count in new_molecule_counts.items():
    print(f"{path}: {count} molecules")

print('')
print(f'All molecules together after normalization and diversity picking: {sum(new_molecule_counts.values())}')



data_mvi/normalized_data/robin_standardized.sdf: 2003 molecules
data_mvi/normalized_data/life_chemicals_standardized.sdf: 5544 molecules
data_mvi/normalized_data/chemdiv_standardized.sdf: 20000 molecules
data_mvi/normalized_data/enamine_rna_standardized.sdf: 15520 molecules
data_mvi/normalized_data/enamine_protein_standardized_diversity_picked.sdf: 50000 molecules

All molecules together after normalization and diversity picking: 93067


In [11]:
# Now call the static method with the correct parameter
robin_df = DiversityPicker.load_sdf_to_dataframe(robin_sdf_path)
life_chemicals_df = DiversityPicker.load_sdf_to_dataframe(life_chemicals_sdf_pth)
chemdiv_df = DiversityPicker.load_sdf_to_dataframe(chemdiv_sdf_pth)
enamine_df = DiversityPicker.load_sdf_to_dataframe(enamine_rna_sdf_pth)



In [12]:
robin_df.shape, life_chemicals_df.shape, chemdiv_df.shape, enamine_df.shape

((2003, 3), (5544, 3), (20000, 3), (15520, 3))

In [13]:
# add column source to each dataframe
robin_df['source'] = 'robin'
life_chemicals_df['source'] = 'life_chemicals'
chemdiv_df['source'] = 'chemdiv'
enamine_df['source'] = 'enamine'


In [14]:
enamine_df[enamine_df['SMILES'].str.contains('\.')]
# delete the rows with '.' in the SMILES column
enamine_df = enamine_df[~enamine_df['SMILES'].str.contains('\.')]

In [15]:
rna_len = len(robin_df) + len(life_chemicals_df) + len(chemdiv_df) + len(enamine_df)
rna_len

43061

In [16]:
protein_df = DiversityPicker.load_sdf_to_dataframe(protein_sdf_pth).head(43061)
# Use the DataFrame as needed
protein_df['source'] = 'protein'


In [17]:
all_df = pd.concat([robin_df, life_chemicals_df, chemdiv_df, enamine_df, protein_df])


In [18]:
all_df.shape, all_df.columns


((86122, 4), Index(['mol', 'SMILES', 'rna', 'source'], dtype='object'))

In [19]:
all_dedup_df = MoleculeProcessor.deduplicate_and_final_check(all_df)


Final deduplicated dataset contains 85081 unique molecules.
Duplicates removed: 1041


In [20]:
all_dedup_df

[(<rdkit.Chem.rdchem.Mol at 0x7f8ed3190c10>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed3190740>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed3190d60>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed3190dd0>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed3190e40>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed3190eb0>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed3190f20>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed3190f90>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a3040>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a30b0>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a3120>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a3190>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a3200>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a3270>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a32e0>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a3350>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a33c0>, 'robin'),
 (<rdkit.Chem.rdchem.Mol at 0x7f8ed31a3430>, 'robin'),
 (<rdkit.C

In [21]:
from rdkit import Chem

# Convert the list of tuples into a DataFrame
df = pd.DataFrame(all_dedup_df, columns=['Molecule', 'Source'])

# Optionally, convert molecule objects to SMILES strings or another representation
df['SMILES'] = df['Molecule'].apply(lambda x: Chem.MolToSmiles(x))


In [22]:

# save the df as pickle
import pickle
with open('data_mvi/normalized_data/all_dedup_df.pkl', 'wb') as f:
    pickle.dump(df, f)

In [23]:
# look for duplicates using smiles in df
duplicates = df[df.duplicated(subset='SMILES', keep=False)]
duplicates

Unnamed: 0,Molecule,Source,SMILES


Create graphs


In [34]:
# show count of each in column Source from df
print(f'Count of each source in the column Source: \n{df["Source"].value_counts()}')

Count of each source in the column Source: 
Source
protein           42692
chemdiv           19960
enamine           15033
life_chemicals     5403
robin              1993
Name: count, dtype: int64


In [8]:
from dgl.data.utils import save_graphs, load_graphs
import dgl


In [25]:
# Load data and prepare for training
reloaded_df = pd.read_csv("data_mvi/combined_df.csv")
graphs, labels_dict = dgl.load_graphs("data_mvi/graphs.bin")

labels = reloaded_df['binds_to_rna'].values

In [26]:
# number of 0 and 1 in labels
print(f'Number of 0 in labels: {len(labels) - sum(labels)}')
print(f'Number of 1 in labels: {sum(labels)}')

Number of 0 in labels: 36931
Number of 1 in labels: 36937


In [13]:
labels.shape, len(graphs),

((73868,), 73868)

In [33]:
# load pkl files to df 
chemdiv_rna_df = pd.read_pickle('data_mvi/data_for_ml/chemdiv_rna_df_ml.pkl')