In [106]:
import csv
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, KBinsDiscretizer, RobustScaler, PolynomialFeatures, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import IncrementalPCA
from tabulate import tabulate
import tensorflow as tf
import deepchem as dc
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Initial Loading

In [108]:

ddi_fp = "Data Files\\drugbank\\drugbank.tab"
ddi = pd.read_csv(ddi_fp, sep='\t')

kaggle_fp = "Data Files\\SMILES-Kaggle\\chembl_22_clean_1576904_sorted_std_final.smi"
smiles = pd.read_csv(kaggle_fp, sep='\t')

ddi["Y"] = ddi["Y"].astype("category")
ddi["Map"] = ddi["Map"].astype("category")

#counting interaction types for potential later weighting
interaction_counts = pd.DataFrame(ddi['Y'].value_counts().rename_axis('y').reset_index(name='count')).sort_values(by='count', ascending=False)
interaction_counts['row_num'] = interaction_counts.index + 1
interaction_counts['log_count'] = np.log(interaction_counts['count'])

#listing longer explanations of interaction types for later use
interaction_types = ddi[['Y','Map']].drop_duplicates(subset=['Y'])

#remove longer name of interaction type from main dataset
ddi = ddi.drop("Map",axis=1)

ddi.head(10)

Unnamed: 0,ID1,ID2,Y,X1,X2
0,DB04571,DB00460,1,CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
1,DB00855,DB00460,1,NCC(=O)CCC(O)=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
2,DB09536,DB00460,1,O=[Ti]=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
3,DB01600,DB00460,1,CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
4,DB09000,DB00460,1,CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
5,DB11630,DB00460,1,OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)...,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
6,DB00553,DB00460,1,COC1=C2OC(=O)C=CC2=CC2=C1OC=C2,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
7,DB06261,DB00460,1,[H]N([H])CC(=O)CCC(=O)OCCCCCC,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
8,DB01878,DB00460,1,O=C(C1=CC=CC=C1)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
9,DB00140,DB00460,1,CC1=C(C)C=C2N(C[C@H](O)[C@H](O)[C@H](O)CO)C3=N...,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...


In [109]:
ddi.shape

(191808, 5)

In [110]:
#quick function to turn a list of size 1 lists of strings into a list of strings, for later use
def delist(list_of_lists):
    list_of_strings = []
    for inner_list in list_of_lists:
        string = inner_list[0]
        list_of_strings.append(string)
    return list_of_strings

## Preprocessing

In [112]:

# counting drugs by number of mentions in database
old = pd.DataFrame()
old["total"] = ddi['ID1'].value_counts()
old = old.reset_index()
old.columns = ['ID', 'count'] 
new = pd.DataFrame()
new["total"] = ddi['ID2'].value_counts()
new = new.reset_index()
new.columns = ['ID', 'count'] 
drug_counts = pd.merge(old,new,how='outer',on='ID').fillna(0)
drug_counts['total'] = drug_counts['count_x'] + drug_counts['count_y']

drug_counts = drug_counts.sort_values(by='total')
drug_counts_one = pd.DataFrame(drug_counts[drug_counts['total']==1]['ID'])

#removing drugs only in database once
ddi_proc = ddi[ ~ddi['ID1'].isin(drug_counts_one['ID'])]
ddi_proc = ddi_proc[ ~ddi_proc['ID2'].isin(drug_counts_one['ID'])]

#removing one particular drug with a problematic SMILES code
ddi_proc = ddi_proc[ddi_proc['X1']!="OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1"]


  ddi_proc = ddi_proc[ddi_proc['X1']!="OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1"]


In [113]:
#create main datasets

data = dc.data.NumpyDataset(X=ddi_proc[['X1','X2']], y=ddi[['Y']])
df = data.to_dataframe()
df = df.sample(frac=1).reset_index(drop=True)

X_one = delist(df[["X1"]].values.tolist())
X_two = delist(df[["X2"]].values.tolist())

search_string = "nan"

count = X_one.count(search_string)
if count > 0:
    print(f"'{search_string}' found {count} times in X_one.")
count = X_two.count(search_string)
if count > 0:
    print(f"'{search_string}' found {count} times in X_two.")

In [114]:
# reduce dataset down to equal number of top 20 categories to create an equalized dataset

top20 = interaction_counts[interaction_counts['row_num'] <= 20]

reduced_df = df.merge(top20['y'], on='y')
reduced_df=reduced_df.sample(frac=1).reset_index(drop=True)

eq_df = reduced_df.groupby('y').apply(lambda x: x.sample(min(len(x), 500))).reset_index(drop=True)

eq_df

Unnamed: 0,X1,X2,y,w,ids
0,NC(=O)N1C2=CC=CC=C2C=CC2=CC=CC=C12,[H][C@@]12C[C@]1([H])N([C@@H](C2)C#N)C(=O)[C@@...,4,1.0,3389
1,CC1=CC=C(NC(=O)C2(CC2)C2=CC=C3OC(F)(F)OC3=C2)N...,OC(C1CCCCN1)C1=CC(=NC2=C1C=CC=C2C(F)(F)F)C(F)(F)F,4,1.0,4013
2,[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)C1=...,COC1=CC=C(C=C1)[C@@H]1SC2=C(C=CC=C2)N(CCN(C)C)...,4,1.0,5432
3,NC(=O)N1C2=CC=CC=C2C=CC2=CC=CC=C12,CCC1=C(C)CN(C(=O)NCCC2=CC=C(C=C2)S(=O)(=O)NC(=...,4,1.0,3506
4,CCCC(C)C1(CC)C(=O)NC(=S)NC1=O,O=C1C(C(=O)C2=CC=CC=C12)C1=CC=CC=C1,4,1.0,2230
...,...,...,...,...,...
9995,CN(C)CC\C=C1/C2=CC=CC=C2CSC2=CC=CC=C12,CNC(C)(C)CC1=CC=CC=C1,83,1.0,191318
9996,[H][C@@]12C[C@@H](C)[C@](O)(C(=O)CO)[C@@]1(C)C...,NS(=O)(=O)C1=C(Cl)C=C2NC=NS(=O)(=O)C2=C1,83,1.0,190454
9997,[H][C@@]12C[C@@]3([H])[C@]4([H])C[C@H](F)C5=CC...,NS(=O)(=O)C1=CC2=C(NC(CC3=CC=CC=C3)NS2(=O)=O)C...,83,1.0,190846
9998,[H][C@@]12C[C@@H](C)[C@H](C(=O)CO)[C@@]1(C)C[C...,CCC(=C)C(=O)C1=C(Cl)C(Cl)=C(OCC(O)=O)C=C1,83,1.0,190217


## Featurization and Dimensionality Reduction

In [116]:
#featurize using circular fingerprint

cf_featurizer = dc.feat.CircularFingerprint()

def circular_fingerprint(smiles):
    try:
        mol = cf_featurizer(smiles)
        return mol
    except Exception as e:
        print(f"Error fingerprinting {smiles}: {e}")
        return None  # Skipping invalid SMILES


#other way to featurize a molecule
#cm_featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)

#example process with just 10000 rows
df_example = df.head(10000)
df_example = df_example.reset_index()
df_example.rename(columns={'index':'col'},  inplace=True)

df_example_cf = pd.DataFrame()
df_example_cf['x1_cf'] = df_example['X1'].apply(circular_fingerprint)
df_example_cf['x2_cf'] = df_example['X2'].apply(circular_fingerprint)
df_example_cf['col'] = df_example_cf.index
df_example_cf.dropna(subset=['x1_cf', 'x2_cf'], inplace=True)


df_example_x1_cf = pd.DataFrame(delist(df_example_cf['x1_cf']))
df_example_x1_cf.rename(columns=lambda x: "x1_cf_"+str(x+1), inplace=True)
df_example_x1_cf['col'] = df_example_cf['col']

df_example_x2_cf = pd.DataFrame(delist(df_example_cf['x2_cf']))
df_example_x2_cf.rename(columns=lambda x: "x2_cf_"+str(x+1), inplace=True)
df_example_x2_cf['col'] = df_example_cf['col']


df_example = df_example.merge(df_example_x1_cf, on="col")
df_example = df_example.merge(df_example_x2_cf, on="col")
df_example.dropna(inplace=True)

df_example



Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable


[11:33:48] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[11:33:48] SMILES Parse Error: check for mistakes around position 76:
[11:33:48] C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C
[11:33:48] ~~~~~~~~~~~~~~~~~~~~^
[11:33:48] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'
Failed to featurize datapoint 0, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, 

Unnamed: 0,col,X1,X2,y,w,ids,x1_cf_1,x1_cf_2,x1_cf_3,x1_cf_4,...,x2_cf_2039,x2_cf_2040,x2_cf_2041,x2_cf_2042,x2_cf_2043,x2_cf_2044,x2_cf_2045,x2_cf_2046,x2_cf_2047,x2_cf_2048
0,0,OC1N=C(C2=CC=CC=C2Cl)C2=C(NC1=O)C=CC(Cl)=C2,CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1,49,1.0,124084,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,CN(CCOC1=CC=C(NS(C)(=O)=O)C=C1)CCC1=CC=C(NS(C)...,C[C@@H](N1CCC(=C)CC1)[C@](O)(CN1C=NC=N1)C1=C(F...,73,1.0,168968,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,CN[C@H]1[C@H](O)[C@@H](O)[C@H](CO)O[C@H]1O[C@H...,C\C(O)=C(/C#N)C(=O)NC1=CC=C(C=C1)C(F)(F)F,72,1.0,153512,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,OCCN1CCN(CCCN2C3=CC=CC=C3SC3=C2C=C(Cl)C=C3)CC1,CN(C)CCCN1C2=CC=CC=C2SC2=C1C=C(C=C2)C(C)=O,49,1.0,110760,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,C[C@H]1COC2=C3N1C=C(C(O)=O)C(=O)C3=CC(F)=C2N1C...,CC(CN(C)C)CN1C2=CC=CC=C2CCC2=CC=CC=C12,20,1.0,20371,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989,9992,O=C1N(C2CCC(=O)NC2=O)C(=O)C2=CC=CC=C12,CC(C)NCC(O)COC1=C(C)C(C)=C(OC(C)=O)C(C)=C1,49,1.0,76726,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9990,9993,NC[C@H]1O[C@H](O[C@@H]2[C@@H](N)C[C@@H](N)[C@H...,CC(C(O)=O)C1=CC=C2SC3=C(C=CC=C3)C(=O)CC2=C1,72,1.0,154339,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9991,9994,OCCOCCN1CCN(CC1)C1=NC2=CC=CC=C2SC2=CC=CC=C12,CN(C)CCC(C1=CC=C(Br)C=C1)C1=CC=CC=N1,49,1.0,111359,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9992,9995,CCCN[C@H]1CCC2=C(C1)SC(N)=N2,CCCC1=NC2=C(C=C(C=C2C)C2=NC3=CC=CC=C3N2C)N1CC1...,49,1.0,71324,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
#Basic dimensionality reduction down to 100 components from 4096

df_X = df_example.iloc[:,6:]
df_Y = to_categorical(df_example["y"])

n_components = 100
batch_size = 1000

ipca = IncrementalPCA(n_components=n_components)

for i in range(0, df_X.shape[0], batch_size):
    X_batch = df_X[i:i + batch_size]
    ipca.partial_fit(X_batch)

X_transformed = ipca.transform(df_X)

#prove that X's shape has changed
#print("Original shape:", df_X.shape)
#print("Transformed shape:", X_transformed.shape)

df_X_proc = pd.DataFrame(X_transformed)


## Initial basic model implementation

In [119]:
#basic/rough neural network implementation

def calc_layers(X_size, Y_size):
    layers = [X_size+1]
    layer = 2
    while layer <= X_size:
        layer = int(layer * 2)
    layers.append(layer)
    if X_size > Y_size:
        while layer / 2 > Y_size and layer > 2:
            layer = layer / 2
            layers.append(int(layer))
    layers.append(Y_size)
    return layers
        
def DNN(X, Y, Epochs, batchsize, layernum=1, verbose=False):

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=27)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    #define layers and nodes in each layer
    input = len(X.columns)
    output = Y.shape[1]
    if layernum=='many':
        layers = calc_layers(input,output)
    elif type(layernum)==int:
        layers = [input]
        for i in range(layernum, 1, -1):
            layer = int(round((i * (input + output) / (layernum+1)), 0))
            if layer > output:
                layers.append(layer)
        layers.append(output)
    else:
        print(f"incorrect layernum {layernum}")
        return None

    model = keras.models.Sequential()

    model.add(Dense(layers[0], activation='relu'))
    model.add(keras.layers.Dropout(0.2))

    for layer_size in layers[1:-1]:
        model.add(Dense(layer_size, activation='relu'))
        model.add(keras.layers.Dropout(0.2))
        
    model.add(Dense(layers[-1], activation='softmax'))
    
    model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy','AUC','precision','recall'])
    
    model.fit(X_train, y_train, epochs=Epochs, batch_size=batchsize, validation_split=0.1, verbose=verbose)
    
    loss, accuracy, AUC, precision, recall = model.evaluate(X_test, y_test, verbose=0)
    if verbose:
        print(f"Test Loss: {loss}")
        print(f"Test Accuracy: {accuracy}")
        print(f"Test AUC: {AUC}")
        print(f"Test Precision: {precision}")
        print(f"Test Recall: {recall}")
    return model, accuracy

In [120]:
dr_model, acc = DNN(df_X_proc, df_Y, 5, 1000, 1)

Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - AUC: 0.4478 - accuracy: 0.0112 - loss: 4.9604 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_AUC: 0.5874 - val_accuracy: 0.0262 - val_loss: 4.5029 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - AUC: 0.6110 - accuracy: 0.0367 - loss: 4.4522 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_AUC: 0.7355 - val_accuracy: 0.0525 - val_loss: 4.0795 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - AUC: 0.7450 - accuracy: 0.0703 - loss: 4.0305 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_AUC: 0.8252 - val_accuracy: 0.0925 - val_loss: 3.7303 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - AUC: 0.8345 - accuracy: 0.1103 - loss: 3.6731 - pre

In [121]:
#comparison to without Dimensionality reduction

full_model, acc = DNN(df_X, df_Y, 5, 1000, 1)

#calc_layers(len(df_X.iloc[:,1:].columns), df_Y.shape[1])

Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 564ms/step - AUC: 0.7282 - accuracy: 0.1418 - loss: 4.5112 - precision: 0.2182 - recall: 0.0985 - val_AUC: 0.7825 - val_accuracy: 0.1925 - val_loss: 4.7700 - val_precision: 0.2108 - val_recall: 0.1363
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 476ms/step - AUC: 0.9483 - accuracy: 0.5455 - loss: 1.7447 - precision: 0.6366 - recall: 0.4660 - val_AUC: 0.7706 - val_accuracy: 0.1825 - val_loss: 5.1054 - val_precision: 0.1865 - val_recall: 0.1350
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 474ms/step - AUC: 0.9854 - accuracy: 0.7405 - loss: 0.8690 - precision: 0.7944 - recall: 0.6599 - val_AUC: 0.7562 - val_accuracy: 0.1963 - val_loss: 5.3342 - val_precision: 0.2104 - val_recall: 0.1612
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 472ms/step - AUC: 0.9958 - accuracy: 0.8572 - loss: 0.4637 - precision: 0.8932 - recall: 0.8101 - val_AUC: 0.

This demonstrates that dimensionality reduction is not productive to a neural network in this case.

## Adding in other variables

In [124]:
#code from Beth Farr
def compute_features(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return None

        # Computing Molecular Descriptors
        mol_wt = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        h_donors = Descriptors.NumHDonors(mol)
        h_acceptors = Descriptors.NumHAcceptors(mol)
        tpsa = Descriptors.TPSA(mol)
        return [mol_wt, logp, h_donors, h_acceptors, tpsa]

    except Exception as e:
        print(f"Error computing features for {smiles}: {e}")
        return None  # Skipping invalid SMILES

# Using function to X1 and X2 to extract features
df_feat = df_example
df_feat['features_X1'] = df_example['X1'].apply(compute_features)
df_feat['features_X2'] = df_example['X2'].apply(compute_features)
df_feat = df_feat.dropna(subset=['features_X1', 'features_X2'])
features_X1_df = pd.DataFrame(df_feat['features_X1'].tolist(), columns=['MolWt_X1', 'LogP_X1', 'NumHDonors_X1', 'NumHAcceptors_X1', 'TPSA_X1'])
features_X1_df['col'] = df_feat['col']
features_X2_df = pd.DataFrame(df_feat['features_X2'].tolist(), columns=['MolWt_X2', 'LogP_X2', 'NumHDonors_X2', 'NumHAcceptors_X2', 'TPSA_X2'])
features_X2_df['col'] = df_feat['col']

df_feat = pd.merge(df_feat, pd.merge(features_X1_df, features_X2_df,on='col'), on='col')
df_feat = df_feat.drop(['X1', 'X2', 'w', 'features_X1', 'features_X2'], axis=1)

df_feat_X = df_feat.iloc[:,3:]
df_feat_Y = to_categorical(df_feat["y"])

feat_model, acc = DNN(df_feat_X, df_feat_Y, 5, 1000, 1)


[11:35:34] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[11:35:34] SMILES Parse Error: check for mistakes around position 76:
[11:35:34] C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C
[11:35:34] ~~~~~~~~~~~~~~~~~~~~^
[11:35:34] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'


Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 479ms/step - AUC: 0.7280 - accuracy: 0.1460 - loss: 4.4930 - precision: 0.1999 - recall: 0.0936 - val_AUC: 0.7793 - val_accuracy: 0.1800 - val_loss: 4.8433 - val_precision: 0.2095 - val_recall: 0.1375
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 418ms/step - AUC: 0.9509 - accuracy: 0.5814 - loss: 1.6583 - precision: 0.6586 - recall: 0.5002 - val_AUC: 0.7534 - val_accuracy: 0.1988 - val_loss: 5.2750 - val_precision: 0.2068 - val_recall: 0.1600
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 435ms/step - AUC: 0.9849 - accuracy: 0.7507 - loss: 0.8477 - precision: 0.8054 - recall: 0.6903 - val_AUC: 0.7582 - val_accuracy: 0.1950 - val_loss: 5.4061 - val_precision: 0.2127 - val_recall: 0.1713
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 424ms/step - AUC: 0.9948 - accuracy: 0.8681 - loss: 0.4533 - precision: 0.9026 - recall: 0.8217 - val_AUC: 0.

## Adding In GNN

In [126]:
#Creating GNN Model
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

#Using PyTorch Geometric Graph Format
def create_graph_features(features, labels):
    num_nodes = len(features)

    # creating the nodes
    edge_index = torch.tensor(
        np.array([[i, i] for i in range(num_nodes)]).T, dtype=torch.long
    )

    graphs = []
    for i in range(len(features)):
        x = torch.tensor(features[i], dtype=torch.float).unsqueeze(0)
        y = torch.tensor([labels[i]], dtype=torch.long)
        graph = Data(x=x, edge_index=edge_index, y=y)
        graphs.append(graph)
    return graphs

#Training the GNN Model
def GNN_train(model, train_loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(torch.device("cpu"))  # Ensure correct device
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out.squeeze(1), data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

#Evaluating the Model
def GNN_test(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(torch.device("cpu"))  # Ensure correct device
            out = model(data)
            pred = out.argmax(dim=1)
            correct += (pred == data.y).sum().item()
            total += data.y.cpu().size(0)
    acc = correct / total
    return acc

def perform_GNN(X, Y, epochs, batchsize, verbose=False):

    #Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
                                                    
    train_graphs = create_graph_features(X_train, y_train)
    test_graphs = create_graph_features(X_test, y_test)

    # Creating data loaders
    train_loader = DataLoader(train_graphs, batch_size=batchsize, shuffle=True)
    test_loader = DataLoader(test_graphs, batch_size=batchsize, shuffle=False)

    #Initialising the Model
    input_dim = X.shape[1]
    output_dim = len(np.unique(Y))
    hidden_dim = int(round((input_dim + output_dim)/2,0))
    
    model = GNN(input_dim, hidden_dim, output_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    #Running Training
    accuracy = 0
    for epoch in range(epochs):
        loss = GNN_train(model, train_loader, optimizer, criterion)
        acc = GNN_test(model, test_loader)
        if verbose:
            print(f"Epoch {epoch+1}: Loss={loss:.4f}, Test Accuracy={acc:.4f}")
        if acc > accuracy:
            accuracy = acc

    #Saving the model for future use
    torch.save(model.state_dict(), "gnn_model.pth")
    return model, accuracy
    

In [127]:
label_encoder = LabelEncoder()
graph_y = label_encoder.fit_transform(df_feat['y'])

model_gnn = perform_GNN(df_feat_X.values, graph_y, 5, 1000)

Epoch 1: Loss=223.5673, Test Accuracy=0.0285
Epoch 2: Loss=75.5342, Test Accuracy=0.1827
Epoch 3: Loss=3.2814, Test Accuracy=0.3083
Epoch 4: Loss=2.7074, Test Accuracy=0.3243
Epoch 5: Loss=2.5524, Test Accuracy=0.3148


In [128]:
def ddi_featurize(df, X1_name, X2_name):
    df_cf = df
    df_cf['x1_cf'] = df[X1_name].apply(circular_fingerprint)
    df_cf['x2_cf'] = df[X2_name].apply(circular_fingerprint)
    df_cf['col'] = df_cf.index
    df_cf.dropna(subset=['x1_cf', 'x2_cf'], inplace=True)
    
    df_x1_cf = pd.DataFrame(delist(df_cf['x1_cf']))
    df_x1_cf.rename(columns=lambda x: "x1_cf_"+str(x+1), inplace=True)
    df_x1_cf['col'] = df_cf['col']
    
    df_x2_cf = pd.DataFrame(delist(df_example_cf['x2_cf']))
    df_x2_cf.rename(columns=lambda x: "x2_cf_"+str(x+1), inplace=True)
    df_x2_cf['col'] = df_cf['col']

    print(df_cf.columns)
    df_temp = df_cf.drop(['x1_cf', 'x2_cf'], axis=1)
    df_temp = df_temp.merge(df_x1_cf, on="col")
    df_temp = df_temp.merge(df_x2_cf, on="col")
    df_temp.dropna(inplace=True)

    df_temp['features_X1'] = df_temp['X1'].apply(compute_features)
    df_temp['features_X2'] = df_temp['X2'].apply(compute_features)
    df_temp = df_temp.dropna(subset=['features_X1', 'features_X2'])
    features_X1_df = pd.DataFrame(df_temp['features_X1'].tolist(), columns=['MolWt_X1', 'LogP_X1', 'NumHDonors_X1', 'NumHAcceptors_X1', 'TPSA_X1'])
    features_X1_df['col'] = df_temp['col']
    features_X2_df = pd.DataFrame(df_temp['features_X2'].tolist(), columns=['MolWt_X2', 'LogP_X2', 'NumHDonors_X2', 'NumHAcceptors_X2', 'TPSA_X2'])
    features_X2_df['col'] = df_temp['col']
    df_temp.drop(['features_X1', 'features_X2'], axis=1, inplace = True)
    
    df_final = pd.merge(df_temp, pd.merge(features_X1_df, features_X2_df,on='col'), on='col')
    df_final.drop(['col'], axis=1, inplace = True)

    return df_final

In [129]:
eq_df['col'] = eq_df.index
eq_df_feat = ddi_featurize(eq_df, "X1","X2")
eq_df_feat = eq_df_feat.drop(['X1', 'X2', 'w'], axis=1)



Index(['X1', 'X2', 'y', 'w', 'ids', 'col', 'x1_cf', 'x2_cf'], dtype='object')


In [130]:
eq_model_dnn, eq_dnn_acc = DNN(eq_df_feat.iloc[:,2:], to_categorical(eq_df_feat['y']), 5, 1000, 1)

Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 472ms/step - AUC: 0.7966 - accuracy: 0.2937 - loss: 3.3605 - precision: 0.4981 - recall: 0.2304 - val_AUC: 0.9267 - val_accuracy: 0.5713 - val_loss: 2.1379 - val_precision: 0.6211 - val_recall: 0.5450
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 392ms/step - AUC: 0.9832 - accuracy: 0.8181 - loss: 0.6713 - precision: 0.8435 - recall: 0.7922 - val_AUC: 0.9165 - val_accuracy: 0.5913 - val_loss: 2.2305 - val_precision: 0.6286 - val_recall: 0.5713
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 409ms/step - AUC: 0.9941 - accuracy: 0.9014 - loss: 0.3561 - precision: 0.9134 - recall: 0.8886 - val_AUC: 0.9098 - val_accuracy: 0.5763 - val_loss: 2.4498 - val_precision: 0.6092 - val_recall: 0.5612
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 435ms/step - AUC: 0.9975 - accuracy: 0.9548 - loss: 0.1588 - precision: 0.9601 - recall: 0.9466 - val_AUC: 0.

In [131]:
eq_graph_y = label_encoder.fit_transform(eq_df_feat['y'])

eq_model_gnn, eq_gnn_acc = perform_GNN(eq_df_feat.iloc[:,2:].values, eq_graph_y, 5, 1000)

Epoch 1: Loss=194.8450, Test Accuracy=0.0490
Epoch 2: Loss=93.6070, Test Accuracy=0.0315
Epoch 3: Loss=3.3337, Test Accuracy=0.0965
Epoch 4: Loss=2.8994, Test Accuracy=0.1011
Epoch 5: Loss=2.8154, Test Accuracy=0.1201


In [132]:
def opt_hps(func, X, Y, epochs, batchsizes):
    best_epochs = 0
    best_batchsize = 0
    best_acc = 0
    for epoch_num in epochs:
        for batchsize in batchsizes:
            model, acc = func(X, Y, epoch_num, batchsize)
            if acc > best_acc:
                best_epochs = epoch_num
                best_batchsize = batchsize
                best_acc = acc
    print(f"Optimal hyperparams: {best_epochs} epochs, {best_batchsize} batchsize. Accuracy: {best_acc}")

In [133]:
my_epochs = [2,3,4,5,6,7,8,9,10]
my_batchsizes = [100, 250, 500, 1000, 2000]

opt_hps(DNN, eq_df_feat.iloc[:,2:], to_categorical(eq_df_feat['y']), my_epochs, my_batchsizes)

Epoch 1/2
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 222ms/step - AUC: 0.8190 - accuracy: 0.3707 - loss: 4.0468 - precision: 0.4521 - recall: 0.3391 - val_AUC: 0.8923 - val_accuracy: 0.5763 - val_loss: 3.0065 - val_precision: 0.6065 - val_recall: 0.5587
Epoch 2/2
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 214ms/step - AUC: 0.9601 - accuracy: 0.7647 - loss: 1.3480 - precision: 0.7871 - recall: 0.7465 - val_AUC: 0.8970 - val_accuracy: 0.5825 - val_loss: 2.6986 - val_precision: 0.6053 - val_recall: 0.5675
Test Loss: 3.2960054874420166
Test Accuracy: 0.5057528614997864
Test AUC: 0.8739632964134216
Test Precision: 0.534223735332489
Test Recall: 0.4802401065826416
Epoch 1/2
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 292ms/step - AUC: 0.8354 - accuracy: 0.3570 - loss: 3.4554 - precision: 0.4837 - recall: 0.3127 - val_AUC: 0.8962 - val_accuracy: 0.5888 - val_loss: 2.6744 - val_precision: 0.6185 - val_recall: 0.5775
Epoch 2/2
[1

In [154]:
opt_hps(perform_GNN, eq_df_feat.iloc[:,2:].values, eq_graph_y, my_epochs, my_batchsizes)

Epoch 1: Loss=34.3050, Test Accuracy=0.2076
Epoch 2: Loss=2.3404, Test Accuracy=0.3197
Epoch 1: Loss=73.2264, Test Accuracy=0.0925
Epoch 2: Loss=2.7740, Test Accuracy=0.1686
Epoch 1: Loss=96.8776, Test Accuracy=0.0635
Epoch 2: Loss=2.9977, Test Accuracy=0.0900
Epoch 1: Loss=178.2184, Test Accuracy=0.0795
Epoch 2: Loss=83.9553, Test Accuracy=0.1091
Epoch 1: Loss=83.3185, Test Accuracy=0.0700
Epoch 2: Loss=302.5802, Test Accuracy=0.0925
Epoch 1: Loss=28.0350, Test Accuracy=0.1611
Epoch 2: Loss=2.4461, Test Accuracy=0.3117
Epoch 3: Loss=2.0100, Test Accuracy=0.4327
Epoch 1: Loss=60.1111, Test Accuracy=0.1071
Epoch 2: Loss=2.7798, Test Accuracy=0.1906
Epoch 3: Loss=2.5182, Test Accuracy=0.2436
Epoch 1: Loss=140.6786, Test Accuracy=0.0740
Epoch 2: Loss=3.4828, Test Accuracy=0.1261
Epoch 3: Loss=2.7708, Test Accuracy=0.1796
Epoch 1: Loss=154.2788, Test Accuracy=0.1381
Epoch 2: Loss=42.2980, Test Accuracy=0.0790
Epoch 3: Loss=2.9264, Test Accuracy=0.1131
Epoch 1: Loss=103.0816, Test Accuracy=