In [120]:
import csv
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import IncrementalPCA
from tabulate import tabulate
import tensorflow as tf
import deepchem as dc
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
import warnings
warnings.filterwarnings('ignore')

## Initial Loading

In [122]:

ddi_fp = "Data Files\\drugbank\\drugbank.tab"
ddi = pd.read_csv(ddi_fp, sep='\t')

kaggle_fp = "Data Files\\SMILES-Kaggle\\chembl_22_clean_1576904_sorted_std_final.smi"
smiles = pd.read_csv(kaggle_fp, sep='\t')

ddi["Y"] = ddi["Y"].astype("category")
ddi["Map"] = ddi["Map"].astype("category")

#counting interaction types for potential later weighting
interaction_counts = pd.DataFrame(ddi['Y'].value_counts().rename_axis('y').reset_index(name='count')).sort_values(by='count', ascending=False)
interaction_counts['row_num'] = interaction_counts.index + 1
interaction_counts['log_count'] = np.log(interaction_counts['count'])

#listing longer explanations of interaction types for later use
interaction_types = ddi[['Y','Map']].drop_duplicates(subset=['Y'])

#remove longer name of interaction type from main dataset
ddi = ddi.drop("Map",axis=1)

ddi.head(10)

Unnamed: 0,ID1,ID2,Y,X1,X2
0,DB04571,DB00460,1,CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
1,DB00855,DB00460,1,NCC(=O)CCC(O)=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
2,DB09536,DB00460,1,O=[Ti]=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
3,DB01600,DB00460,1,CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
4,DB09000,DB00460,1,CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
5,DB11630,DB00460,1,OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)...,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
6,DB00553,DB00460,1,COC1=C2OC(=O)C=CC2=CC2=C1OC=C2,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
7,DB06261,DB00460,1,[H]N([H])CC(=O)CCC(=O)OCCCCCC,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
8,DB01878,DB00460,1,O=C(C1=CC=CC=C1)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
9,DB00140,DB00460,1,CC1=C(C)C=C2N(C[C@H](O)[C@H](O)[C@H](O)CO)C3=N...,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...


In [123]:
ddi.shape

(191808, 5)

In [124]:
#quick function to turn a list of size 1 lists of strings into a list of strings, for later use
def delist(list_of_lists):
    list_of_strings = []
    for inner_list in list_of_lists:
        string = inner_list[0]
        list_of_strings.append(string)
    return list_of_strings

## Preprocessing

In [126]:

# counting drugs by number of mentions in database
old = pd.DataFrame()
old["total"] = ddi['ID1'].value_counts()
old = old.reset_index()
old.columns = ['ID', 'count'] 
new = pd.DataFrame()
new["total"] = ddi['ID2'].value_counts()
new = new.reset_index()
new.columns = ['ID', 'count'] 
drug_counts = pd.merge(old,new,how='outer',on='ID').fillna(0)
drug_counts['total'] = drug_counts['count_x'] + drug_counts['count_y']

drug_counts = drug_counts.sort_values(by='total')
drug_counts_one = pd.DataFrame(drug_counts[drug_counts['total']==1]['ID'])

#removing drugs only in database once
ddi_proc = ddi[ ~ddi['ID1'].isin(drug_counts_one['ID'])]
ddi_proc = ddi_proc[ ~ddi_proc['ID2'].isin(drug_counts_one['ID'])]

#removing one particular drug with a problematic SMILES code
ddi_proc = ddi_proc[ddi_proc['X1']!="OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1"]


In [127]:
#create main datasets

data = dc.data.NumpyDataset(X=ddi_proc[['X1','X2']], y=ddi[['Y']])
df = data.to_dataframe()
df = df.sample(frac=1).reset_index(drop=True)

X_one = delist(df[["X1"]].values.tolist())
X_two = delist(df[["X2"]].values.tolist())

search_string = "nan"

count = X_one.count(search_string)
if count > 0:
    print(f"'{search_string}' found {count} times in X_one.")
count = X_two.count(search_string)
if count > 0:
    print(f"'{search_string}' found {count} times in X_two.")

In [158]:
# reduce dataset down to equal number of top 20 categories to create an equalized dataset

top20 = interaction_counts[interaction_counts['row_num'] <= 20]

reduced_df = df.merge(top20['y'], on='y')
reduced_df=reduced_df.sample(frac=1).reset_index(drop=True)

eq_df = reduced_df.groupby('y').apply(lambda x: x.sample(min(len(x), 500))).reset_index(drop=True)

eq_df

Unnamed: 0,X1,X2,y,w,ids
0,CC1=C2NC(=O)C3=C(N=CC=C3)N(C3CC3)C2=NC=C1,CC1=CN([C@H]2C[C@H](N=[N+]=[N-])[C@@H](CO)O2)C...,4,1.0,2751
1,CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(C(O)=C3...,OCCN1CCN(CCCN2C3=CC=CC=C3SC3=C2C=C(Cl)C=C3)CC1,4,1.0,5489
2,O=C1NC(=O)C(N1)(C1=CC=CC=C1)C1=CC=CC=C1,S=P(N1CC1)(N1CC1)N1CC1,4,1.0,5841
3,CCC1(C(=O)NCNC1=O)C1=CC=CC=C1,COC1=CC=C(\C=C\C(=O)NC2=CC=CC=C2C(O)=O)C=C1OC,4,1.0,3164
4,CCC1(C(=O)NCNC1=O)C1=CC=CC=C1,OCC1=C(O)C=CC(=C1)C(O)CNCCCCCCOCCCCC1=CC=CC=C1,4,1.0,4221
...,...,...,...,...,...
9995,[H][C@@]12C[C@@]3([H])[C@]4([H])C[C@H](F)C5=CC...,NS(=O)(=O)C1=CC2=C(NC(CC3=CC=CC=C3)NS2(=O)=O)C...,83,1.0,190869
9996,[H][C@@]12C[C@@H](C)[C@H](C(=O)CO)[C@@]1(C)C[C...,NS(=O)(=O)C1=C(Cl)C=C2NCNS(=O)(=O)C2=C1,83,1.0,191250
9997,CC(C)(C)NCC(O)C1=CC(O)=CC(O)=C1,NS(=O)(=O)C1=CC2=C(NC(CC3=CC=CC=C3)NS2(=O)=O)C...,83,1.0,190880
9998,[H][C@@]12C[C@H]3OC(C)(C)O[C@@]3(C(=O)CCl)[C@@...,NS(=O)(=O)C1=C(Cl)C=C2NC=NS(=O)(=O)C2=C1,83,1.0,190924


## Featurization and Dimensionality Reduction

In [50]:
#featurize using circular fingerprint

cf_featurizer = dc.feat.CircularFingerprint()

def circular_fingerprint(smiles):
    try:
        mol = cf_featurizer(smiles)
        return mol
    except Exception as e:
        print(f"Error fingerprinting {smiles}: {e}")
        return None  # Skipping invalid SMILES


#other way to featurize a molecule
#cm_featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)

#example process with just 10000 rows
df_example = df.head(10000)
df_example = df_example.reset_index()
df_example.rename(columns={'index':'col'},  inplace=True)

df_example_cf = pd.DataFrame()
df_example_cf['x1_cf'] = df_example['X1'].apply(circular_fingerprint)
df_example_cf['x2_cf'] = df_example['X2'].apply(circular_fingerprint)
df_example_cf['col'] = df_example_cf.index
df_example_cf.dropna(subset=['x1_cf', 'x2_cf'], inplace=True)


df_example_x1_cf = pd.DataFrame(delist(df_example_cf['x1_cf']))
df_example_x1_cf.rename(columns=lambda x: "x1_cf_"+str(x+1), inplace=True)
df_example_x1_cf['col'] = df_example_cf['col']

df_example_x2_cf = pd.DataFrame(delist(df_example_cf['x2_cf']))
df_example_x2_cf.rename(columns=lambda x: "x2_cf_"+str(x+1), inplace=True)
df_example_x2_cf['col'] = df_example_cf['col']


df_example = df_example.merge(df_example_x1_cf, on="col")
df_example = df_example.merge(df_example_x2_cf, on="col")
df_example.dropna(inplace=True)

df_example



Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable
Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable


[14:24:00] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[14:24:00] SMILES Parse Error: check for mistakes around position 76:
[14:24:00] C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C
[14:24:00] ~~~~~~~~~~~~~~~~~~~~^
[14:24:00] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'
Failed to featurize datapoint 0, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, 

Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable
Error fingerprinting nan: 'float' object is not iterable
Error fingerprinting nan: 'float' object is not iterable




Error fingerprinting nan: 'float' object is not iterable




Unnamed: 0,col,X1,X2,y,w,ids,x1_cf_1,x1_cf_2,x1_cf_3,x1_cf_4,...,x2_cf_2039,x2_cf_2040,x2_cf_2041,x2_cf_2042,x2_cf_2043,x2_cf_2044,x2_cf_2045,x2_cf_2046,x2_cf_2047,x2_cf_2048
0,0,CCCCN(CCCC)CCCOC1=CC=C(C=C1)C(=O)C1=C(CCCC)OC2...,[H][C@]12CC[C@]([H])([C@H]([C@H](C1)OC(=O)C1=C...,47,1.0,51716,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,CCC1=C(C)CN(C(=O)NCCC2=CC=C(C=C2)S(=O)(=O)NC(=...,C[C@@H](C1=NC=NC=C1F)[C@](O)(CN1C=NC=N1)C1=C(F...,73,1.0,176154,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,C#CCN[C@@H]1CCC2=CC=CC=C12,CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(=...,60,1.0,137494,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,[H][C@@]12CC3=CNC4=CC=CC(=C34)C1=C[C@H](CN2C)C...,OC1(CCN(CCCC(=O)C2=CC=C(F)C=C2)CC1)C1=CC=C(Cl)...,49,1.0,69742,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,CCCOC1=C(C=C(C=C1)S(=O)(=O)NCCC1CCCN1C)C1=NC(=...,CCOC(=O)C1=C(C)NC(C)=C(C1C1=C(Cl)C(Cl)=CC=C1)C...,10,1.0,11771,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9983,9989,CCN(CC)CCN1C2=C(C=C(Cl)C=C2)C(=NCC1=O)C1=CC=CC...,ClC1=CC2=C(C=C1)C(=C1CCNCC1)C1=C(CC2)C=CC=N1,49,1.0,120528,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9984,9990,[H][C@@]12CCC[C@]1([H])N([C@@H](C2)C(O)=O)C(=O...,CC(C)NCC(O)C1=CC=C(NS(C)(=O)=O)C=C1,49,1.0,102323,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9985,9991,CN1C2=C(NC=N2)C(=O)N(C)C1=O,CCN1C=C(C(O)=O)C(=O)C2=CC(F)=C(N=C12)N1CCNCC1,47,1.0,59086,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9986,9992,CN1N=NC(=N1)C1=CC=C(C=N1)C1=CC=C(C=C1F)N1C[C@H...,FC1=CC=C(C=C1)C(CCCN1CCC2(CC1)N(CNC2=O)C1=CC=C...,49,1.0,126526,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
#Basic dimensionality reduction down to 100 components from 4096

df_X = df_example.iloc[:,6:]
df_Y = to_categorical(df_example["y"])

n_components = 100
batch_size = 1000

ipca = IncrementalPCA(n_components=n_components)

for i in range(0, df_X.shape[0], batch_size):
    X_batch = df_X[i:i + batch_size]
    ipca.partial_fit(X_batch)

X_transformed = ipca.transform(df_X)

#prove that X's shape has changed
#print("Original shape:", df_X.shape)
#print("Transformed shape:", X_transformed.shape)

df_X_proc = pd.DataFrame(X_transformed)


## Initial basic model implementation

In [53]:
#basic/rough neural network implementation

def calc_layers(X_size, Y_size):
    layers = [X_size+1]
    layer = 2
    while layer <= X_size:
        layer = int(layer * 2)
    layers.append(layer)
    if X_size > Y_size:
        while layer / 2 > Y_size and layer > 2:
            layer = layer / 2
            layers.append(int(layer))
    layers.append(Y_size)
    return layers
        
def DNN(X, Y, Epochs, batchsize, layernum):

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=27)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    #define layers and nodes in each layer
    input = len(X.columns)
    output = Y.shape[1]
    if layernum=='many':
        layers = calc_layers(input,output)
    elif type(layernum)==int:
        layers = [input]
        for i in range(layernum, 1, -1):
            layer = int(round((i * (input + output) / (layernum+1)), 0))
            if layer > output:
                layers.append(layer)
        layers.append(output)
    else:
        print(f"incorrect layernum {layernum}")
        return None

    model = keras.models.Sequential()

    model.add(Dense(layers[0], activation='relu'))
    model.add(keras.layers.Dropout(0.2))

    for layer_size in layers[1:-1]:
        model.add(Dense(layer_size, activation='relu'))
        model.add(keras.layers.Dropout(0.2))
        
    model.add(Dense(layers[-1], activation='softmax'))
    
    model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy','AUC','precision','recall'])
    
    model.fit(X_train, y_train, epochs=Epochs, batch_size=batchsize, validation_split=0.1)
    
    full_loss, full_accuracy, full_AUC, full_precision, full_recall = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {full_accuracy}")
    print(f"Test Loss: {full_loss}")
    return model

In [54]:
dr_model = DNN(df_X_proc, df_Y, 5, 1000, 1)

Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 113ms/step - AUC: 0.5195 - accuracy: 0.0217 - loss: 4.7393 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_AUC: 0.6641 - val_accuracy: 0.0563 - val_loss: 4.2748 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - AUC: 0.6863 - accuracy: 0.0617 - loss: 4.2186 - precision: 0.4815 - recall: 1.5234e-04 - val_AUC: 0.7994 - val_accuracy: 0.1026 - val_loss: 3.8546 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - AUC: 0.8040 - accuracy: 0.1081 - loss: 3.8067 - precision: 0.0615 - recall: 1.3430e-04 - val_AUC: 0.8645 - val_accuracy: 0.1477 - val_loss: 3.5169 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - AUC: 0.8636 - accuracy: 0.1445 - loss: 3.4850 - precision: 

In [55]:
#comparison to without Dimensionality reduction

full_model = DNN(df_X, df_Y, 5, 1000, 1)

#calc_layers(len(df_X.iloc[:,1:].columns), df_Y.shape[1])

Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 617ms/step - AUC: 0.7244 - accuracy: 0.1462 - loss: 4.5305 - precision: 0.1954 - recall: 0.0948 - val_AUC: 0.7905 - val_accuracy: 0.1852 - val_loss: 4.6611 - val_precision: 0.2210 - val_recall: 0.1477
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 637ms/step - AUC: 0.9465 - accuracy: 0.5499 - loss: 1.7822 - precision: 0.6219 - recall: 0.4667 - val_AUC: 0.7761 - val_accuracy: 0.2103 - val_loss: 4.8647 - val_precision: 0.2116 - val_recall: 0.1602
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 592ms/step - AUC: 0.9851 - accuracy: 0.7275 - loss: 0.8822 - precision: 0.7803 - recall: 0.6612 - val_AUC: 0.7726 - val_accuracy: 0.2178 - val_loss: 5.0044 - val_precision: 0.2312 - val_recall: 0.1727
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 634ms/step - AUC: 0.9941 - accuracy: 0.8401 - loss: 0.5026 - precision: 0.8762 - recall: 0.7914 - val_AUC: 0.

This demonstrates that dimensionality reduction is counterproductive to a neural network in this case.

## Adding in other variables

In [58]:
#code from Beth Farr
def compute_features(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return None

        # Computing Molecular Descriptors
        mol_wt = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        h_donors = Descriptors.NumHDonors(mol)
        h_acceptors = Descriptors.NumHAcceptors(mol)
        tpsa = Descriptors.TPSA(mol)
        return [mol_wt, logp, h_donors, h_acceptors, tpsa]

    except Exception as e:
        print(f"Error computing features for {smiles}: {e}")
        return None  # Skipping invalid SMILES

# Using function to X1 and X2 to extract features
df_feat = df_example
df_feat['features_X1'] = df_example['X1'].apply(compute_features)
df_feat['features_X2'] = df_example['X2'].apply(compute_features)
df_feat = df_feat.dropna(subset=['features_X1', 'features_X2'])
features_X1_df = pd.DataFrame(df_feat['features_X1'].tolist(), columns=['MolWt_X1', 'LogP_X1', 'NumHDonors_X1', 'NumHAcceptors_X1', 'TPSA_X1'])
features_X1_df['col'] = df_feat['col']
features_X2_df = pd.DataFrame(df_feat['features_X2'].tolist(), columns=['MolWt_X2', 'LogP_X2', 'NumHDonors_X2', 'NumHAcceptors_X2', 'TPSA_X2'])
features_X2_df['col'] = df_feat['col']

df_feat = pd.merge(df_feat, pd.merge(features_X1_df, features_X2_df,on='col'), on='col')
df_feat = df_feat.drop(['X1', 'X2', 'w', 'features_X1', 'features_X2'], axis=1)

df_feat_X = df_feat.iloc[:,3:]
df_feat_Y = to_categorical(df_feat["y"])

feat_model = DNN(df_feat_X, df_feat_Y, 5, 1000, 1)


[14:25:39] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[14:25:39] SMILES Parse Error: check for mistakes around position 76:
[14:25:39] C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C
[14:25:39] ~~~~~~~~~~~~~~~~~~~~^
[14:25:39] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'


Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 551ms/step - AUC: 0.7391 - accuracy: 0.1547 - loss: 4.4020 - precision: 0.2204 - recall: 0.1046 - val_AUC: 0.7916 - val_accuracy: 0.2003 - val_loss: 4.5560 - val_precision: 0.2330 - val_recall: 0.1539
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 463ms/step - AUC: 0.9444 - accuracy: 0.5544 - loss: 1.7946 - precision: 0.6327 - recall: 0.4787 - val_AUC: 0.7698 - val_accuracy: 0.2140 - val_loss: 4.9186 - val_precision: 0.2433 - val_recall: 0.1827
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 473ms/step - AUC: 0.9844 - accuracy: 0.7324 - loss: 0.8830 - precision: 0.7868 - recall: 0.6778 - val_AUC: 0.7748 - val_accuracy: 0.2228 - val_loss: 5.1017 - val_precision: 0.2418 - val_recall: 0.1852
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 512ms/step - AUC: 0.9956 - accuracy: 0.8529 - loss: 0.4772 - precision: 0.8913 - recall: 0.8076 - val_AUC: 0.

## Adding In GNN

In [60]:
#Creating GNN Model
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

#Using PyTorch Geometric Graph Format
def create_graph_features(features, labels):
    num_nodes = len(features)

    # creating the nodes
    edge_index = torch.tensor(
        np.array([[i, i] for i in range(num_nodes)]).T, dtype=torch.long
    )

    graphs = []
    for i in range(len(features)):
        x = torch.tensor(features[i], dtype=torch.float).unsqueeze(0)
        y = torch.tensor([labels[i]], dtype=torch.long)
        graph = Data(x=x, edge_index=edge_index, y=y)
        graphs.append(graph)
    return graphs

#Training the GNN Model
def GNN_train(model, train_loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(torch.device("cpu"))  # Ensure correct device
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out.squeeze(1), data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

#Evaluating the Model
def GNN_test(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(torch.device("cpu"))  # Ensure correct device
            out = model(data)
            pred = out.argmax(dim=1)
            correct += (pred == data.y).sum().item()
            total += data.y.size(0)
    return correct / total

def perform_GNN(X, Y, epochs, batchsize):

    #Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
                                                    
    train_graphs = create_graph_features(X_train, y_train)
    test_graphs = create_graph_features(X_test, y_test)

    # Creating data loaders
    train_loader = DataLoader(train_graphs, batch_size=batchsize, shuffle=True)
    test_loader = DataLoader(test_graphs, batch_size=batchsize, shuffle=False)

    #Initialising the Model
    input_dim = X.shape[1]
    output_dim = len(np.unique(Y))
    hidden_dim = int(round((input_dim + output_dim)/2,0))
    
    model = GNN(input_dim, hidden_dim, output_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    #Running Training
    for epoch in range(epochs):
        loss = GNN_train(model, train_loader, optimizer, criterion)
        acc = GNN_test(model, test_loader)
        print(f"Epoch {epoch+1}: Loss={loss:.4f}, Test Accuracy={acc:.4f}")
    
    #Saving the model for future use
    torch.save(model.state_dict(), "gnn_model.pth")
    

In [61]:
label_encoder = LabelEncoder()
graph_y = label_encoder.fit_transform(df_feat['y'])

perform_GNN(df_feat_X.values, graph_y, 5, 1000)

Epoch 1: Loss=199.8636, Test Accuracy=0.0225
Epoch 2: Loss=46.9862, Test Accuracy=0.1853
Epoch 3: Loss=3.6616, Test Accuracy=0.0826
Epoch 4: Loss=3.0726, Test Accuracy=0.2919
Epoch 5: Loss=2.7645, Test Accuracy=0.2979


In [162]:
def ddi_featurize(df, X1_name, X2_name):
    df_cf = pd.DataFrame()
    df_cf['x1_cf'] = df[X1_name].apply(circular_fingerprint)
    df_cf['x2_cf'] = df_example[X2_name].apply(circular_fingerprint)
    df_cf['col'] = df_cf.index
    df_cf.dropna(subset=['x1_cf', 'x2_cf'], inplace=True)
    
    df_x1_cf = pd.DataFrame(delist(df_cf['x1_cf']))
    df_x1_cf.rename(columns=lambda x: "x1_cf_"+str(x+1), inplace=True)
    df_x1_cf['col'] = df_cf['col']
    
    df_x2_cf = pd.DataFrame(delist(df_example_cf['x2_cf']))
    df_x2_cf.rename(columns=lambda x: "x2_cf_"+str(x+1), inplace=True)
    df_x2_cf['col'] = df_cf['col']
    
    df_temp = df_temp.merge(df_x1_cf, on="col")
    df_temp = df_temp.merge(df_x2_cf, on="col")
    df_temp.dropna(inplace=True)

    df_temp['features_X1'] = df_temp[X1_name].apply(compute_features)
    df_temp['features_X2'] = df_temp[X2_name].apply(compute_features)
    df_temp = df_temp.dropna(subset=['features_X1', 'features_X2'])
    features_X1_df = pd.DataFrame(df_temp['features_X1'].tolist(), columns=['MolWt_X1', 'LogP_X1', 'NumHDonors_X1', 'NumHAcceptors_X1', 'TPSA_X1'])
    features_X1_df['col'] = df_temp['col']
    features_X2_df = pd.DataFrame(df_temp['features_X2'].tolist(), columns=['MolWt_X2', 'LogP_X2', 'NumHDonors_X2', 'NumHAcceptors_X2', 'TPSA_X2'])
    features_X2_df['col'] = df_temp['col']
    
    df_final = pd.merge(df_temp, pd.merge(features_X1_df, features_X2_df,on='col'), on='col')

    return df_final

In [182]:
eq_df['col'] = eq_df.index
eq_df_feat = ddi_featurize(eq_df, "X1","X2")
eq_df_feat = eq_df_feat.drop(['X1', 'X2', 'w', 'features_X1', 'features_X2'], axis=1)

[15:59:54] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[15:59:54] SMILES Parse Error: check for mistakes around position 76:
[15:59:54] C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C
[15:59:54] ~~~~~~~~~~~~~~~~~~~~^
[15:59:54] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'
Failed to featurize datapoint 0, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, 

In [184]:
eq_model = DNN(eq_df_feat.iloc[:,3:], to_categorical(eq_df_feat['y']), 5, 1000, 1)

Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 467ms/step - AUC: 0.8131 - accuracy: 0.3027 - loss: 3.2576 - precision: 0.5009 - recall: 0.2339 - val_AUC: 0.9123 - val_accuracy: 0.5519 - val_loss: 2.2146 - val_precision: 0.6125 - val_recall: 0.5144
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 379ms/step - AUC: 0.9858 - accuracy: 0.8206 - loss: 0.6569 - precision: 0.8493 - recall: 0.7995 - val_AUC: 0.9087 - val_accuracy: 0.5582 - val_loss: 2.5612 - val_precision: 0.5945 - val_recall: 0.5394
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 385ms/step - AUC: 0.9936 - accuracy: 0.9101 - loss: 0.3234 - precision: 0.9223 - recall: 0.8943 - val_AUC: 0.8940 - val_accuracy: 0.5782 - val_loss: 2.6292 - val_precision: 0.6057 - val_recall: 0.5557
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 387ms/step - AUC: 0.9977 - accuracy: 0.9527 - loss: 0.1711 - precision: 0.9600 - recall: 0.9459 - val_AUC: 0.