In [2]:
import csv
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import IncrementalPCA
from tabulate import tabulate
import tensorflow as tf
import deepchem as dc
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import warnings
warnings.filterwarnings('ignore')

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


In [3]:

# Load the tab-separated DrugBank drug-drug-interaction table from disk.
ddi_fp = r"C:\Users\sreej\Desktop\Capstone\drugbank.tab"
ddi = pd.read_csv(ddi_fp, sep='\t')

#kaggle_fp = "Data Files\\SMILES-Kaggle\\chembl_22_clean_1576904_sorted_std_final.smi"
#smiles = pd.read_csv(kaggle_fp, sep='\t')

# Store the interaction label ("Y") and its longer name ("Map") as categoricals.
ddi = ddi.astype({"Y": "category", "Map": "category"})

# Occurrences of each interaction type, most frequent first (for potential
# later weighting). `row_num` is the original index label + 1 and
# `log_count` is the natural log of the count.
interaction_counts = (
    ddi['Y']
    .value_counts()
    .rename_axis('y')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .assign(
        row_num=lambda counts: counts.index + 1,
        log_count=lambda counts: np.log(counts['count']),
    )
)

# One row per interaction type, keeping its longer explanation for later use.
interaction_types = ddi.loc[:, ['Y', 'Map']].drop_duplicates(subset=['Y'])

# The longer name is no longer needed in the main dataset.
ddi = ddi.drop(columns="Map")

ddi.head(10)

Unnamed: 0,ID1,ID2,Y,X1,X2
0,DB04571,DB00460,1,CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
1,DB00855,DB00460,1,NCC(=O)CCC(O)=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
2,DB09536,DB00460,1,O=[Ti]=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
3,DB01600,DB00460,1,CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
4,DB09000,DB00460,1,CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
5,DB11630,DB00460,1,OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)...,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
6,DB00553,DB00460,1,COC1=C2OC(=O)C=CC2=CC2=C1OC=C2,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
7,DB06261,DB00460,1,[H]N([H])CC(=O)CCC(=O)OCCCCCC,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
8,DB01878,DB00460,1,O=C(C1=CC=CC=C1)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
9,DB00140,DB00460,1,CC1=C(C)C=C2N(C[C@H](O)[C@H](O)[C@H](O)CO)C3=N...,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...


In [None]:
# Print the (rows, columns) shape of `ddi` as it stands when this cell runs.
print(ddi.shape)

In [4]:
# Helper used later to unwrap lists such as [["a"], ["b"]] into ["a", "b"].
def delist(list_of_lists):
    """Return the first element of every inner list, in order.

    Args:
        list_of_lists: iterable of indexable items (intended: size-1 lists
            of strings).

    Returns:
        A new list holding ``inner[0]`` for each ``inner`` in the input.

    Raises:
        IndexError: if any inner list is empty.
    """
    return [inner[0] for inner in list_of_lists]

Preprocessing

In [6]:

# Count how often each drug ID is mentioned in either position of a pair.
# `old` holds the counts for the first position (ID1), `new` for the second (ID2);
# both end up with the columns ['ID', 'count'].
old = ddi['ID1'].value_counts().rename_axis('ID').reset_index(name='count')
new = ddi['ID2'].value_counts().rename_axis('ID').reset_index(name='count')

# The outer merge keeps drugs that occur in only one position; their missing
# count is filled with 0. The two 'count' columns come out as
# 'count_x' (ID1) and 'count_y' (ID2).
drug_counts = pd.merge(old, new, how='outer', on='ID').fillna(0)
drug_counts['total'] = drug_counts['count_x'] + drug_counts['count_y']

drug_counts = drug_counts.sort_values(by='total')
drug_counts_one = pd.DataFrame(drug_counts[drug_counts['total'] == 1]['ID'])

# Remove every interaction that involves a drug mentioned only once overall.
ddi_proc = ddi[~ddi['ID1'].isin(drug_counts_one['ID'])]
ddi_proc = ddi_proc[~ddi_proc['ID2'].isin(drug_counts_one['ID'])]

# Remove one particular drug with a problematic SMILES code.
# The literal is a raw string because SMILES contains backslashes; in a normal
# string "\C" and "\-" are invalid escape sequences (a warning on recent
# Python versions). The string value itself is unchanged.
# NOTE(review): only X1 is filtered; rows carrying this SMILES in X2, if any,
# are kept - confirm that is intended.
ddi_proc = ddi_proc[ddi_proc['X1'] != r"OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1"]


In [7]:
#create main datasets

# Features AND labels must both come from the filtered frame `ddi_proc`.
# Taking the labels from the unfiltered `ddi` gives a different number of rows
# than X, so labels no longer line up with their SMILES pairs.
data = dc.data.NumpyDataset(X=ddi_proc[['X1','X2']], y=ddi_proc[['Y']])
df = data.to_dataframe()

# Shuffle the rows and renumber the index 0..n-1.
df = df.sample(frac=1).reset_index(drop=True)

# Plain Python lists of SMILES strings for the first and second drug.
X_one = delist(df[["X1"]].values.tolist())
X_two = delist(df[["X2"]].values.tolist())

# Sanity check: report any entries that are the literal string "nan".
search_string = "nan"

for list_name, smiles_list in (("X_one", X_one), ("X_two", X_two)):
    count = smiles_list.count(search_string)
    if count > 0:
        print(f"'{search_string}' found {count} times in {list_name}.")

In [8]:
# Build an equalized dataset: keep only the 20 most frequent interaction types
# and cap each of them at 500 randomly chosen rows.

# `row_num` numbers the interaction types starting at 1 for the most frequent.
top20 = interaction_counts.loc[interaction_counts['row_num'] <= 20]

# The default (inner) merge on 'y' keeps only rows whose label is in `top20`;
# the result is then shuffled and re-indexed.
reduced_df = (
    df.merge(top20['y'], on='y')
    .sample(frac=1)
    .reset_index(drop=True)
)

# Per label: draw 500 rows, or all of them when the label has fewer than 500.
eq_df = (
    reduced_df
    .groupby('y')
    .apply(lambda group: group.sample(n=min(len(group), 500)))
    .reset_index(drop=True)
)

eq_df

Unnamed: 0,X1,X2,y,w,ids
0,CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(C(O)=C3...,CO[C@H]1[C@H](O)CC(=O)O[C@H](C)C\C=C\C=C\[C@H]...,4,1.0,2729
1,CCCC(C)C1(CC=C)C(=O)NC(=O)NC1=O,CNCCCC1C2=CC=CC=C2C=CC2=CC=CC=C12,4,1.0,3885
2,OP(O)(=O)OCN1C(=O)NC(C1=O)(C1=CC=CC=C1)C1=CC=C...,OC(=O)CSC1=NN=C(Br)N1C1=CC=C(C2CC2)C2=C1C=CC=C2,4,1.0,3603
3,CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(C(O)=C3...,COC1=CC=CC2=C1C(=O)C1=C(O)C3=C(C[C@](O)(C[C@@H...,4,1.0,878
4,CCC1(C(=O)NCNC1=O)C1=CC=CC=C1,[H][C@@]12CC[C@](C)(O)[C@@]1(C)CC[C@@]1([H])[C...,4,1.0,1346
...,...,...,...,...,...
9995,[H][C@@]12CC[C@](O)(C(C)=O)[C@@]1(C)C[C@H](O)[...,NS(=O)(=O)C1=CC2=C(NC(CC3=CC=CC=C3)NS2(=O)=O)C...,83,1.0,190771
9996,CC(CC1=CC=C(O)C=C1)NCC(O)C1=CC(O)=CC(O)=C1,CC1CC2=CC=CC=C2N1NC(=O)C1=CC(=C(Cl)C=C1)S(N)(=...,83,1.0,191238
9997,[H][C@@]12C[C@@H](C)[C@](O)(C(=O)CO)[C@@]1(C)C...,CN1C(CSCC(F)(F)F)NC2=CC(Cl)=C(C=C2S1(=O)=O)S(N...,83,1.0,190274
9998,CNC[C@H](O)C1=CC(O)=C(O)C=C1,CCCCNC1=C(OC2=CC=CC=C2)C(=CC(=C1)C(O)=O)S(N)(=...,83,1.0,190477


In [9]:
# Debug output: column names of `df` and `top20` (the two frames merged on 'y'
# when building `reduced_df`), followed by the interaction-type count table.
print(df.columns)  
print(top20.columns)
print(interaction_counts)

Index(['X1', 'X2', 'y', 'w', 'ids'], dtype='object')
Index(['y', 'count', 'row_num', 'log_count'], dtype='object')
     y  count  row_num  log_count
0   49  60751        1  11.014539
1   47  34360        2  10.444648
2   73  23779        3  10.076558
3   75   9470        4   9.155884
4   60   8397        5   9.035630
..  ..    ...      ...        ...
81  28     11       82   2.397895
82   1     11       83   2.397895
83  52     10       84   2.302585
84  26      7       85   1.945910
85  42      6       86   1.791759

[86 rows x 4 columns]


Information

Each atom in a molecule is a node on the graph. 

Lines connecting nodes represent chemical bonds between atoms, with the type of bond potentially encoded as edge attributes. 



In [10]:

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data 
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GINEConv
from torch_geometric.nn import global_mean_pool
import torch.optim as optim
from rdkit import Chem


def smiles_to_graphs1(smiles):
    """Convert a SMILES string into a torch_geometric ``Data`` graph.

    Every atom becomes a node whose single feature is its atomic number.
    Every bond becomes two directed edges (one per direction), each carrying
    a single bond-type code: 1 single, 2 double, 3 triple, 4 aromatic,
    0 for any other bond type.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A ``Data`` object with
          * ``x``          - float tensor of shape (num_atoms, 1),
          * ``edge_index`` - long tensor of shape (2, 2 * num_bonds),
          * ``edge_attr``  - float tensor of shape (2 * num_bonds, 1),
        or ``None`` (after printing a message) when RDKit cannot parse the
        SMILES or the parsed molecule contains no atoms.
    """
    # create RDKit Mol object
    mol = Chem.MolFromSmiles(smiles)

    # An atom-less molecule (e.g. from an empty string) cannot form a usable
    # graph, so it is rejected the same way as an unparsable SMILES.
    if mol is None or mol.GetNumAtoms() == 0:
        print(f'Invalid SMILES Code: {smiles}')
        return None

    # atom features
    node_features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]

    # bond features
    bond_map = {
        Chem.BondType.SINGLE: 1,
        Chem.BondType.DOUBLE: 2,
        Chem.BondType.TRIPLE: 3,
        Chem.BondType.AROMATIC: 4,
    }

    bond_indices = []
    bond_type_features = []
    for bond in mol.GetBonds():
        bond_type = bond_map.get(bond.GetBondType(), 0)
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()

        # bi-directional: store each bond once per direction, and keep the
        # bond-type list in the same order as the index list.
        bond_indices.extend(([begin_idx, end_idx], [end_idx, begin_idx]))
        bond_type_features.extend(([bond_type], [bond_type]))

    # Explicit target shapes keep the tensors 2-D even when the molecule has
    # no bonds (single atoms / ions): edge_index becomes (2, 0) and edge_attr
    # (0, 1) instead of 1-D empty tensors.
    num_edges = len(bond_indices)
    edge_index = (
        torch.tensor(bond_indices, dtype = torch.long)
        .reshape(num_edges, 2)
        .t()
        .contiguous()
    )
    edge_attr = torch.tensor(bond_type_features, dtype = torch.float).reshape(num_edges, 1)

    data = Data(x = torch.tensor(node_features, dtype = torch.float).view(-1, 1),
                edge_index = edge_index,
                edge_attr = edge_attr)

    return data




In [11]:
class MolGraphGNN(nn.Module):
    """Two-layer GINEConv encoder that turns one molecular graph into one vector.

    Args:
        input_dim: number of features per node.
        edge_dim: number of features per edge.
        hidden_dim: output width of the first convolution.
        out_dim: output width of the second convolution, i.e. the length of
            the returned embedding.
    """

    def __init__(self, input_dim = 1, edge_dim = 1, hidden_dim = 128, out_dim = 128):
        super().__init__()

        # use GINEConv due to edge_attr handling
        self.conv1 = GINEConv(nn.Linear(input_dim, hidden_dim), edge_dim = edge_dim)
        # The second layer projects to `out_dim` so that the constructor
        # argument is honoured; with the defaults (128 / 128) the layer shape
        # is the same as a hidden_dim -> hidden_dim layer.
        self.conv2 = GINEConv(nn.Linear(hidden_dim, out_dim), edge_dim = edge_dim)

    def forward(self, x, edge_index, edge_attr):
        """Encode a single graph.

        Args:
            x: node feature tensor.
            edge_index: edge index tensor passed to both convolutions.
            edge_attr: edge feature tensor passed to both convolutions.

        Returns:
            The mean over dim 0 of the node embeddings after the second
            convolution + ReLU. All nodes are averaged together, so the
            input is treated as one graph (no batch vector is used).
        """
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)

        # Mean-pool over all nodes.
        x = x.mean(dim = 0)
        return x
    
class DDIClassifier(nn.Module):
    """Classify the interaction between two drugs from their graph embeddings.

    Both drugs are encoded with the same ``gnn`` module; the two embeddings
    are concatenated and passed through a two-layer fully connected head.

    Args:
        gnn: module called as ``gnn(x, edge_index, edge_attr)`` that returns
            one embedding per drug.
        embedding_dim: length of each embedding returned by ``gnn``.
        num_classes: number of output logits.
    """

    def __init__(self, gnn, embedding_dim = 128, num_classes = 20):
        super().__init__()
        self.gnn = gnn
        # The head consumes both embeddings side by side, hence 2 * embedding_dim.
        self.fc1 = nn.Linear(2 * embedding_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, drug1, drug2):
        """Return the class logits for one drug pair.

        Args:
            drug1: object exposing ``x``, ``edge_index`` and ``edge_attr``.
            drug2: object exposing ``x``, ``edge_index`` and ``edge_attr``.

        Returns:
            The output of ``fc2`` for the pair, computed after a leading
            dimension of size 1 is added to the concatenated embeddings.
        """
        embeddings = [
            self.gnn(drug.x, drug.edge_index, drug.edge_attr)
            for drug in (drug1, drug2)
        ]

        # Join along the last dim, then add a leading dimension of size 1.
        pair = torch.cat(embeddings, dim = -1).unsqueeze(0)

        hidden = F.relu(self.fc1(pair))
        return self.fc2(hidden)
    