In [1]:
#imports

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import Chem
import networkx as nx
from karateclub import Graph2Vec
import pandas as pd

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdChemReactions
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [2]:
def mol_to_nx(mol):
    G = nx.Graph()

    for atom in mol.GetAtoms():
        G.add_node(atom.GetIdx(),
                   atomic_num=atom.GetAtomicNum(),
                   is_aromatic=atom.GetIsAromatic(),
                   atom_symbol=atom.GetSymbol())
        
    for bond in mol.GetBonds():
        G.add_edge(bond.GetBeginAtomIdx(),
                   bond.GetEndAtomIdx(),
                   bond_type=bond.GetBondType())
        
    return G

In [3]:
# Path to the CSV file
csv_path = "/Users/oliviacullen/Downloads/chembl30_kor_inhibitors.csv"

# Import CSV as DataFrame
df = pd.read_csv(csv_path)

## Creating Graph Embeddings ##

In [4]:
print(">>> read the data file ... ")
df = pd.read_csv(csv_path)
print(">>> data shape = ", df.shape)
print(">>> data columns = ", df.columns, "\n")
print(df)
print()

print(">>> create mol from smiles ... ")
df['mol'] = df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x)) 

print(">>> create nx from mol ... ")
df['graph'] = df['mol'].apply(lambda x: mol_to_nx(x))

print(">>> create graph embedding ... ")
model = Graph2Vec()
model.fit(df['graph'])
df_graph2vec = model.get_embedding()

df_graph2vec = pd.DataFrame(df_graph2vec)
print(">>> df_graph2vec shape = ", df_graph2vec.shape)
print(df_graph2vec)
print()

>>> read the data file ... 
>>> data shape =  (2056, 3)
>>> data columns =  Index(['Name', 'SMILES', 'KOR Inhibitor'], dtype='object') 

              Name                                             SMILES  \
0         CHEMBL10  C[S+]([O-])c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc...   
1       CHEMBL1006                                  NCCCNCCSP(=O)(O)O   
2       CHEMBL1009                     N[C@@H](Cc1ccc(O)c(O)c1)C(=O)O   
3        CHEMBL101              CCCCC1C(=O)N(c2ccccc2)N(c2ccccc2)C1=O   
4     CHEMBL101168                                     Nn1nnc2ccccc21   
...            ...                                                ...   
2051   CHEMBL98350              O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12   
2052     CHEMBL989  CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...   
2053     CHEMBL990             CN(Cc1ccc(C(C)(C)C)cc1)Cc1cccc2ccccc12   
2054     CHEMBL991          Cc1cn([C@H]2C=C[C@@H](CO)O2)c(=O)[nH]c1=O   
2055     CHEMBL998       CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cc

In [5]:
# Assuming df and df_graph2vec are your dataframes

# Create an empty list to store the appended rows
appended_rows = []

# Iterate over the rows of both dataframes
for index, row in df.iterrows():
    # Get the corresponding row from df2 as a list
    row_list = list(df_graph2vec.iloc[index])
    
    # Append the row list to the current row of df1
    appended_row = row.append(pd.Series(row_list))
    
    # Append the combined row to the list
    appended_rows.append(appended_row)

# Create a new dataframe from the appended rows list
df_combined = pd.DataFrame(appended_rows)

# Reset the index of the combined dataframe if needed
df_combined.reset_index(drop=True, inplace=True)

## Creating a dataframe with all the LB properties ##

In [6]:
def calculate_num_amide_bonds(molecule):
    # Define the amide bond SMARTS pattern
    amide_bond_smarts = Chem.MolFromSmarts("[*;$(C(=O)N);*]")
    
    # Use the SMARTS pattern to match amide bonds in the molecule
    amide_bonds = molecule.GetSubstructMatches(amide_bond_smarts)
    
    # Return the number of amide bonds
    return len(amide_bonds)

def calculate_num_heterocycles(molecule):
    # Get the number of SSSR in the molecule
    num_sssr = Chem.GetSSSR(molecule)
    
    # Count the number of rings that contain heteroatoms
    num_heterocycles = 0
    for ring in molecule.GetRingInfo().AtomRings():
        has_heteroatom = any(molecule.GetAtomWithIdx(atom_idx).GetAtomicNum() != 6 for atom_idx in ring)
        if has_heteroatom:
            num_heterocycles += 1
    
    # Return the number of heterocycles
    return num_heterocycles

def calculate_num_spiroatoms(molecule):
    num_spiroatoms = 0
    
    # Iterate over the atoms in the molecule
    for atom in molecule.GetAtoms():
        if atom.IsInRing() and atom.GetTotalDegree() >= 3:
            ring_count = 0
            for bond in atom.GetBonds():
                if bond.GetOtherAtom(atom).IsInRing():
                    ring_count += 1
            if ring_count >= 2:
                num_spiroatoms += 1
    
    # Return the number of spiroatoms
    return num_spiroatoms

In [7]:
# Create a new dataframe to store the descriptors
descriptor_df = pd.DataFrame()

# Iterate over the SMILES column in your original dataframe
for index, row in df.iterrows():
    smiles = row['SMILES']
    
    # Convert the SMILES to a molecule object
    molecule = Chem.MolFromSmiles(smiles)
    
    # Calculate the descriptors
    descriptors = {}
    descriptors['exactmw'] = Descriptors.ExactMolWt(molecule)
    descriptors['amw'] = Descriptors.MolWt(molecule)
    descriptors['lipinskiHBA'] = Descriptors.NumHAcceptors(molecule)
    descriptors['lipinskiHBD'] = Descriptors.NumHDonors(molecule)
    descriptors['NumRotatableBonds'] = Descriptors.NumRotatableBonds(molecule)
    descriptors['NumHBD'] = Descriptors.NumHDonors(molecule)
    descriptors['NumHBA'] = Descriptors.NumHAcceptors(molecule)
    descriptors['NumHeavyAtoms'] = Descriptors.HeavyAtomCount(molecule)
    descriptors['NumHeteroatoms'] = Descriptors.NumHeteroatoms(molecule)
    descriptors['NumAmideBonds'] = calculate_num_amide_bonds(molecule)
    descriptors['FractionCSP3'] = Descriptors.FractionCSP3(molecule)
    descriptors['NumRings'] = Descriptors.RingCount(molecule)
    descriptors['NumAromaticRings'] = Descriptors.NumAromaticRings(molecule)
    descriptors['NumAliphaticRings'] = Descriptors.NumAliphaticRings(molecule)
    descriptors['NumSaturatedRings'] = Descriptors.NumSaturatedRings(molecule)
    descriptors['NumHeterocycles'] = calculate_num_heterocycles(molecule)
    descriptors['NumAromaticHeterocycles'] = Descriptors.NumAromaticHeterocycles(molecule)
    descriptors['NumSaturatedHeterocycles'] = Descriptors.NumSaturatedHeterocycles(molecule)
    descriptors['NumAliphaticHeterocycles'] = Descriptors.NumAliphaticHeterocycles(molecule)
    descriptors['NumSpiroAtoms'] = calculate_num_spiroatoms(molecule)
    descriptors['NumBridgeheadAtoms'] = rdMolDescriptors.CalcNumBridgeheadAtoms(molecule)
    descriptors['NumAtomStereoCenters'] = len(tuple(EnumerateStereoisomers(molecule)))
    descriptors['labuteASA'] = Descriptors.LabuteASA(molecule)
    descriptors['tpsa'] = Descriptors.TPSA(molecule)
    descriptors['CrippenClogP'] = Descriptors.MolLogP(molecule)
    descriptors['CrippenMR'] = Descriptors.MolMR(molecule)
    descriptors['chi0v'] = Descriptors.Chi0v(molecule)
    descriptors['chi1v'] = Descriptors.Chi1v(molecule)
    descriptors['chi2v'] = Descriptors.Chi2v(molecule)
    descriptors['chi3v'] = Descriptors.Chi3v(molecule)
    descriptors['chi4v'] = Descriptors.Chi4v(molecule)
    descriptors['chi0n'] = Descriptors.Chi0n(molecule)
    descriptors['chi1n'] = Descriptors.Chi1n(molecule)
    descriptors['chi2n'] = Descriptors.Chi2n(molecule)
    descriptors['chi3n'] = Descriptors.Chi3n(molecule)
    descriptors['chi4n'] = Descriptors.Chi4n(molecule)
    descriptors['hallKierAlpha'] = Descriptors.HallKierAlpha(molecule)
    descriptors['kappa1'] = Descriptors.Kappa1(molecule)
    descriptors['kappa2'] = Descriptors.Kappa2(molecule)
    descriptors['kappa3'] = Descriptors.Kappa3(molecule)
    
    # Append the descriptors to the new dataframe
    descriptor_df = descriptor_df.append(descriptors, ignore_index=True)

# Merge the original dataframe with the descriptor dataframe
result_df = pd.concat([df, descriptor_df], axis=1)

In [8]:
list_of_lists = descriptor_df.values.tolist()

In [9]:
#appending descriptor properties as a 1D array to ligand dataframe
df['lb_prop'] = list_of_lists

In [10]:
# droping unnecessary columns
columns_to_drop = ['mol', 'graph',]  # List of column names to drop

# Create a new DataFrame without the specified columns
df_new = df.drop(columns=columns_to_drop)

In [11]:
df_new.to_csv('KOH_inhibitor_molecules.csv', index=False)

In [12]:
#creating molecular representations and their labels for model
lb_props = df_new['lb_prop'].tolist()
labels1 = df_new['KOR Inhibitor'].tolist()

In [13]:
#turning the lists into tensors
labels = torch.tensor(labels1, dtype=torch.float32).unsqueeze(1)
ligands = torch.tensor(lb_props, dtype=torch.float32)

In [14]:
# Define the neural network model
class LigandClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super(LigandClassifier, self).__init__()
        self.fc_layers = nn.ModuleList()
        prev_size = input_size
        for size in hidden_sizes:
            self.fc_layers.append(nn.Linear(prev_size, size))
            self.fc_layers.append(nn.ReLU())
            prev_size = size
        self.fc_layers.append(nn.Linear(prev_size, output_size))
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        for layer in self.fc_layers:
            x = layer(x)
        out = self.sigmoid(x)
        return out

# Define a custom dataset
class LigandDataset(Dataset):
    def __init__(self, ligands, labels):
        self.ligands = torch.tensor(ligands, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.ligands)

    def __getitem__(self, idx):
        return self.ligands[idx], self.labels[idx]

ligand_dataset = LigandDataset(ligands, labels)

# Define the hyperparameters
input_size = len(ligand_dataset[0][0])
hidden_sizes = [128, 64, 32]
output_size = 1
learning_rate = 0.00007
batch_size = 32
num_epochs = 100
num_folds = 5

# Create the neural network model
model = LigandClassifier(input_size, hidden_sizes, output_size)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Perform 5-fold cross-validation
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
fold_aurocs = []

for fold, (train_indices, val_indices) in enumerate(skf.split(ligand_dataset.ligands, ligand_dataset.labels)):
    print(f"Fold [{fold+1}/{num_folds}]")

    # Create train and validation datasets for the current fold
    train_dataset = LigandDataset(ligand_dataset.ligands[train_indices], ligand_dataset.labels[train_indices])
    val_dataset = LigandDataset(ligand_dataset.ligands[val_indices], ligand_dataset.labels[val_indices])

    # Create train and validation data loaders
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the weights using Xavier uniform initialization
    for layer in model.fc_layers:
        if isinstance(layer, nn.Linear):
            nn.init.xavier_uniform_(layer.weight)

    # Train the model
    for epoch in range(num_epochs):
        for ligands_batch, labels_batch in train_dataloader:
            # Forward pass
            outputs = model(ligands_batch)

            # Calculate the loss
            loss = criterion(outputs, labels_batch)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Print the loss for every epoch
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Calculate AUROC for the validation set
    val_predictions = []
    val_labels_list = []

    with torch.no_grad():
        for val_ligands, val_labels in val_dataloader:
            val_outputs = model(val_ligands)
            val_predictions += val_outputs.flatten().tolist()
            val_labels_list += val_labels.tolist()

    fold_auroc = roc_auc_score(val_labels_list, val_predictions)
    fold_aurocs.append(fold_auroc)
    print(f"AUROC for Fold {fold+1}: {fold_auroc:.4f}")

# Calculate and print the average AUROC
average_auroc = sum(fold_aurocs) / num_folds
print(f"\nAverage AUROC: {average_auroc:.4f}")


  self.ligands = torch.tensor(ligands, dtype=torch.float32)
  self.labels = torch.tensor(labels, dtype=torch.float32)
  self.ligands = torch.tensor(ligands, dtype=torch.float32)
  self.labels = torch.tensor(labels, dtype=torch.float32)


Fold [1/5]
Epoch [1/100], Loss: 21.5822
Epoch [2/100], Loss: 21.1056
Epoch [3/100], Loss: 8.3333
Epoch [4/100], Loss: 65.2653
Epoch [5/100], Loss: 21.7628
Epoch [6/100], Loss: 15.5648
Epoch [7/100], Loss: 21.3834
Epoch [8/100], Loss: 25.0000
Epoch [9/100], Loss: 8.3333
Epoch [10/100], Loss: 8.3333
Epoch [11/100], Loss: 16.6667
Epoch [12/100], Loss: 33.3344
Epoch [13/100], Loss: 41.6667
Epoch [14/100], Loss: 23.7258
Epoch [15/100], Loss: 25.0000
Epoch [16/100], Loss: 16.6667
Epoch [17/100], Loss: 33.3333
Epoch [18/100], Loss: 48.4501
Epoch [19/100], Loss: 32.2108
Epoch [20/100], Loss: 16.6667
Epoch [21/100], Loss: 8.3333
Epoch [22/100], Loss: 24.0160
Epoch [23/100], Loss: 0.0000
Epoch [24/100], Loss: 58.3333
Epoch [25/100], Loss: 16.6667
Epoch [26/100], Loss: 33.3333
Epoch [27/100], Loss: 8.3333
Epoch [28/100], Loss: 15.6303
Epoch [29/100], Loss: 16.6667
Epoch [30/100], Loss: 33.3333
Epoch [31/100], Loss: 33.3333
Epoch [32/100], Loss: 16.6667
Epoch [33/100], Loss: 33.3333
Epoch [34/100]

  self.ligands = torch.tensor(ligands, dtype=torch.float32)
  self.labels = torch.tensor(labels, dtype=torch.float32)


Epoch [3/100], Loss: 0.6609
Epoch [4/100], Loss: 0.6400
Epoch [5/100], Loss: 0.5422
Epoch [6/100], Loss: 0.6023
Epoch [7/100], Loss: 0.3653
Epoch [8/100], Loss: 0.5583
Epoch [9/100], Loss: 0.6804
Epoch [10/100], Loss: 0.7362
Epoch [11/100], Loss: 0.2695
Epoch [12/100], Loss: 0.7575
Epoch [13/100], Loss: 0.5420
Epoch [14/100], Loss: 0.5932
Epoch [15/100], Loss: 0.5688
Epoch [16/100], Loss: 0.4050
Epoch [17/100], Loss: 0.3834
Epoch [18/100], Loss: 0.2393
Epoch [19/100], Loss: 0.7159
Epoch [20/100], Loss: 0.6564
Epoch [21/100], Loss: 0.6748
Epoch [22/100], Loss: 0.5953
Epoch [23/100], Loss: 0.4906
Epoch [24/100], Loss: 0.3723
Epoch [25/100], Loss: 0.3372
Epoch [26/100], Loss: 0.4886
Epoch [27/100], Loss: 0.3403
Epoch [28/100], Loss: 0.3229
Epoch [29/100], Loss: 0.4115
Epoch [30/100], Loss: 0.3505
Epoch [31/100], Loss: 0.4572
Epoch [32/100], Loss: 0.5395
Epoch [33/100], Loss: 0.3921
Epoch [34/100], Loss: 0.4521
Epoch [35/100], Loss: 0.2567
Epoch [36/100], Loss: 0.3577
Epoch [37/100], Loss:

  self.ligands = torch.tensor(ligands, dtype=torch.float32)
  self.labels = torch.tensor(labels, dtype=torch.float32)


Epoch [2/100], Loss: 0.5659
Epoch [3/100], Loss: 0.3631
Epoch [4/100], Loss: 0.5900
Epoch [5/100], Loss: 0.4720
Epoch [6/100], Loss: 0.5924
Epoch [7/100], Loss: 0.3669
Epoch [8/100], Loss: 0.4680
Epoch [9/100], Loss: 0.6414
Epoch [10/100], Loss: 0.2676
Epoch [11/100], Loss: 0.3341
Epoch [12/100], Loss: 0.6548
Epoch [13/100], Loss: 0.3943
Epoch [14/100], Loss: 0.3710
Epoch [15/100], Loss: 0.2738
Epoch [16/100], Loss: 0.4740
Epoch [17/100], Loss: 0.4653
Epoch [18/100], Loss: 0.3588
Epoch [19/100], Loss: 0.4319
Epoch [20/100], Loss: 0.3255
Epoch [21/100], Loss: 0.4307
Epoch [22/100], Loss: 0.2112
Epoch [23/100], Loss: 0.6178
Epoch [24/100], Loss: 0.5899
Epoch [25/100], Loss: 0.2324
Epoch [26/100], Loss: 0.3228
Epoch [27/100], Loss: 0.6470
Epoch [28/100], Loss: 0.3282
Epoch [29/100], Loss: 0.2920
Epoch [30/100], Loss: 0.4651
Epoch [31/100], Loss: 0.5876
Epoch [32/100], Loss: 0.4851
Epoch [33/100], Loss: 0.4477
Epoch [34/100], Loss: 0.3098
Epoch [35/100], Loss: 0.5086
Epoch [36/100], Loss: 

  self.ligands = torch.tensor(ligands, dtype=torch.float32)
  self.labels = torch.tensor(labels, dtype=torch.float32)


Epoch [3/100], Loss: 0.5600
Epoch [4/100], Loss: 0.4316
Epoch [5/100], Loss: 0.4144
Epoch [6/100], Loss: 0.6433
Epoch [7/100], Loss: 0.4366
Epoch [8/100], Loss: 0.4924
Epoch [9/100], Loss: 0.5968
Epoch [10/100], Loss: 0.4919
Epoch [11/100], Loss: 0.5345
Epoch [12/100], Loss: 0.6025
Epoch [13/100], Loss: 8.1507
Epoch [14/100], Loss: 0.5173
Epoch [15/100], Loss: 0.5248
Epoch [16/100], Loss: 0.4509
Epoch [17/100], Loss: 0.3367
Epoch [18/100], Loss: 0.7217
Epoch [19/100], Loss: 0.4764
Epoch [20/100], Loss: 0.3875
Epoch [21/100], Loss: 0.4435
Epoch [22/100], Loss: 0.5297
Epoch [23/100], Loss: 0.6581
Epoch [24/100], Loss: 0.5573
Epoch [25/100], Loss: 0.7265
Epoch [26/100], Loss: 0.4429
Epoch [27/100], Loss: 0.4288
Epoch [28/100], Loss: 0.5822
Epoch [29/100], Loss: 0.4065
Epoch [30/100], Loss: 0.8901
Epoch [31/100], Loss: 0.7752
Epoch [32/100], Loss: 0.7194
Epoch [33/100], Loss: 0.8210
Epoch [34/100], Loss: 0.3880
Epoch [35/100], Loss: 0.5131
Epoch [36/100], Loss: 0.3485
Epoch [37/100], Loss:

  self.ligands = torch.tensor(ligands, dtype=torch.float32)
  self.labels = torch.tensor(labels, dtype=torch.float32)


Epoch [4/100], Loss: 0.4981
Epoch [5/100], Loss: 0.4463
Epoch [6/100], Loss: 0.3664
Epoch [7/100], Loss: 0.4114
Epoch [8/100], Loss: 0.4136
Epoch [9/100], Loss: 0.4609
Epoch [10/100], Loss: 0.2205
Epoch [11/100], Loss: 0.2817
Epoch [12/100], Loss: 0.2439
Epoch [13/100], Loss: 0.4218
Epoch [14/100], Loss: 0.4276
Epoch [15/100], Loss: 0.3502
Epoch [16/100], Loss: 0.3310
Epoch [17/100], Loss: 0.3345
Epoch [18/100], Loss: 0.4089
Epoch [19/100], Loss: 0.6301
Epoch [20/100], Loss: 0.8561
Epoch [21/100], Loss: 0.6750
Epoch [22/100], Loss: 0.4921
Epoch [23/100], Loss: 0.3317
Epoch [24/100], Loss: 0.3604
Epoch [25/100], Loss: 0.5519
Epoch [26/100], Loss: 0.4589
Epoch [27/100], Loss: 0.4356
Epoch [28/100], Loss: 0.2973
Epoch [29/100], Loss: 0.4149
Epoch [30/100], Loss: 0.3244
Epoch [31/100], Loss: 0.2803
Epoch [32/100], Loss: 0.7039
Epoch [33/100], Loss: 0.3557
Epoch [34/100], Loss: 0.4207
Epoch [35/100], Loss: 0.2998
Epoch [36/100], Loss: 0.4219
Epoch [37/100], Loss: 0.3017
Epoch [38/100], Loss