In [1]:
from functions.molecule_processor import MoleculeProcessor
from functions.diversity_picker import DiversityPicker
import pandas as pd
from rdkit import Chem
import pandas as pd

import dgl
from dgl.data.utils import save_graphs, load_graphs
import torch
from rdkit import Chem
from dgllife.utils import CanonicalAtomFeaturizer, CanonicalBondFeaturizer
from dgllife.utils import mol_to_bigraph

[17:40:11] Initializing Normalizer


In [2]:
# Load one pickled DataFrame per RNA source library (names follow the file names).
# NOTE(review): pd.read_pickle can execute arbitrary code -- only load trusted files.
(
    chemdiv_rna_df,
    enamine_rna_df,
    life_chemicals_rna_df,
    robin_rna_df,
) = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'data_mvi/data_for_ml/chemdiv_rna_df_ml.pkl',
        'data_mvi/data_for_ml/enamine_rna_df_ml.pkl',
        'data_mvi/data_for_ml/life_chemicals_rna_df_ml.pkl',
        'data_mvi/data_for_ml/robin_rna_df_ml.pkl',
    )
)

# The protein table is loaded from its own pickle.
enamine_protein = pd.read_pickle('data_mvi/data_for_ml/enamine_protein_df_ml.pkl')

# Stack the four RNA tables row-wise; the input index labels are kept as they are.
rna_df = pd.concat(
    [
        chemdiv_rna_df,
        enamine_rna_df,
        life_chemicals_rna_df,
        robin_rna_df,
    ]
)
# Display both shapes side by side as the cell output.
rna_df.shape, enamine_protein.shape


((38710, 6), (38710, 6))

In [3]:
# Combine the RNA table and the protein table into one DataFrame
# (all rows of rna_df first, then all rows of enamine_protein).
# pd.concat keeps the input index labels, so labels may repeat at this point.
all_molecules = pd.concat([rna_df, enamine_protein])
# Show the column names of the combined table.
all_molecules.columns

Index(['mol', 'smiles', 'source', 'ecfp6', 'bit_info_map', 'rna'], dtype='object')

In [4]:
# (rows, columns) of the combined table.
all_molecules.shape

(77420, 6)

In [5]:
# Shuffle the rows and renumber them 0..n-1.
# A fixed seed makes the row order (and therefore the graph order written later)
# identical on every Restart & Run All; change SHUFFLE_SEED to get another order.
# No separate reset_index is needed beforehand: sample() picks rows by position,
# and the reset_index(drop=True) below discards the old labels anyway.
SHUFFLE_SEED = 42
all_molecules = (
    all_molecules
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
all_molecules.head(5)

Unnamed: 0,mol,smiles,source,ecfp6,bit_info_map,rna
0,<rdkit.Chem.rdchem.Mol object at 0x7fa1f68cbae0>,CCOc1ccc(CC(=O)Nc2ccc3c(c2)CCC(=O)N3Cc2ccccc2)cc1,chemdiv,0000000000000000000000000000000010000000010000...,"{32: [(25, 3)], 41: [(8, 1)], 69: [(2, 2)], 80...",1
1,<rdkit.Chem.rdchem.Mol object at 0x7fa1d2a15090>,Cc1sc(=O)n(CC(=O)N2CCC(c3ccccc3)C2(C)C)c1C,enamine_protein,0000000000000000000000000000000000000000000000...,"{63: [(3, 1)], 80: [(6, 0)], 132: [(12, 3)], 1...",0
2,<rdkit.Chem.rdchem.Mol object at 0x7fa1d1e23770>,COc1ccc(-c2cc(C(F)(F)F)c(C#N)c(SCCC(=O)O)n2)cc1,enamine_protein,0000000000000000000000000000000010000000000000...,"{32: [(16, 1)], 46: [(18, 2)], 80: [(18, 0), (...",0
3,<rdkit.Chem.rdchem.Mol object at 0x7fa1f5f15ef0>,Cn1nccc1C(=O)N1CCC[C@H]1c1nnc(-c2ccc(F)cc2)[nH]1,chemdiv,0001000000000000000000000000000000000000000100...,"{3: [(16, 2)], 43: [(2, 1)], 90: [(20, 2)], 18...",1
4,<rdkit.Chem.rdchem.Mol object at 0x7fa1d1d92ea0>,O=S(=O)(Nc1nc2ccc(Br)cn2n1)c1cccc(F)c1,enamine_protein,0000100000000000000000000000000000000000000000...,"{4: [(1, 2)], 73: [(12, 3)], 80: [(3, 2)], 177...",0


In [6]:
# Validate every row of all_molecules and remove the rows that fail, so that only
# usable, single-fragment molecules reach the graph-construction step.
#
# A row is rejected when:
#   * its SMILES string cannot be parsed by RDKit,
#   * its SMILES string contains "." (the SMILES separator for disconnected parts),
#   * its stored Mol object is missing or has no atoms,
#   * its stored Mol object consists of more than one fragment.
#
# Rows rejected for being disconnected are additionally collected and written to
# 'disconnected_mols_df.json' for inspection.

# Rows (as Series) whose structure has disconnected components
disconnected_mols = []
# Index labels of every rejected row; they are dropped in one go after the scan
# instead of mutating the DataFrame while it is being iterated.
rejected_indices = []

for idx, row in all_molecules.iterrows():
    smiles = row['smiles']
    mol = row['mol']
    source = row['source']

    # Check if the SMILES string can be converted to a valid RDKit Mol object
    if Chem.MolFromSmiles(smiles) is None:
        print(f"Invalid SMILES string at index {idx}: {smiles}")
        rejected_indices.append(idx)
        continue

    # Check that the SMILES string does not have "." in it
    if '.' in smiles:
        print(f"Disconnected components in SMILES string at index {idx}: {smiles}, from source {source}")
        disconnected_mols.append(row)
        rejected_indices.append(idx)
        continue

    # Check that the stored Mol object exists and contains at least one atom
    if mol is None or mol.GetNumAtoms() == 0:
        print(f"Invalid Mol object at index {idx}: {smiles}")
        rejected_indices.append(idx)
        continue

    # GetMolFrags returns one entry per connected fragment of the Mol object
    fragments = Chem.GetMolFrags(mol, asMols=False, sanitizeFrags=False)
    if len(fragments) > 1:
        print(f"Disconnected components in molecule at index {idx}: {smiles}, from source {source}")
        disconnected_mols.append(row)
        rejected_indices.append(idx)

# Remove all rejected rows at once (labels are unique after the earlier reset_index).
all_molecules = all_molecules.drop(index=rejected_indices)
if rejected_indices:
    print(f"Removed {len(rejected_indices)} rows that failed validation")

# Convert the list to a DataFrame
disconnected_mols_df = pd.DataFrame(disconnected_mols)
# Save disconnected_mols_df to a JSON file. The 'mol' column holds RDKit Mol objects,
# which are not JSON types; default_handler=str writes their string form instead.
disconnected_mols_df.to_json('disconnected_mols_df.json', default_handler=str)

if disconnected_mols_df.shape[0] < 1:
    print("No disconnected molecules found!")

No disconnected molecules found!


In [7]:


def create_graphs_from_dataframe(df):
    """Build one featurized graph per molecule in ``df``.

    Each SMILES string is parsed with RDKit and converted with ``mol_to_bigraph``
    using the canonical atom and bond featurizers, with self loops added and
    hydrogens left implicit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'smiles'`` column (SMILES strings) and an ``'rna'``
        column (one label per molecule).

    Returns
    -------
    graphs : list
        The graphs produced by ``mol_to_bigraph``, in row order, for every row
        that could be converted.
    graph_labels : dict
        ``{'labels': tensor}`` where the tensor has one row per returned graph
        (a trailing axis of size 1 is added with ``unsqueeze(-1)``).

    Notes
    -----
    Rows whose SMILES cannot be parsed, or for which ``mol_to_bigraph`` yields
    ``None``, are reported with ``print`` and skipped. Graphs and labels are
    skipped together, so they always stay aligned with each other.
    """
    # The featurizers do not depend on the row, so build them once and reuse them.
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer(self_loop=True)

    graphs = []
    labels = []

    for idx, smiles, label in zip(df.index, df['smiles'], df['rna']):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            print(f"Skipping row {idx}: SMILES could not be parsed: {smiles}")
            continue

        # Convert RDKit molecule to a DGLGraph with features
        graph = mol_to_bigraph(
            mol,
            node_featurizer=atom_featurizer,
            edge_featurizer=bond_featurizer,
            explicit_hydrogens=False,
            add_self_loop=True
        )
        if graph is None:
            # NOTE(review): some dgllife versions appear to return None instead of
            # raising when conversion fails -- confirm for the installed version.
            print(f"Skipping row {idx}: no graph could be built for: {smiles}")
            continue

        graphs.append(graph)
        labels.append(label)

    # Convert list of labels into a torch tensor and wrap in a dictionary
    label_tensor = torch.tensor(labels).unsqueeze(-1)
    graph_labels = {'labels': label_tensor}

    return graphs, graph_labels




In [8]:
# Build one graph per row of all_molecules; labels come from its 'rna' column
graphs, graph_labels = create_graphs_from_dataframe(all_molecules)

# Save the graphs and labels to a binary file
save_graphs("data_mvi/graphs.bin", graphs, graph_labels)