In [102]:
import csv
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import IncrementalPCA
from tabulate import tabulate
import tensorflow as tf
import deepchem as dc
from rdkit import Chem
from rdkit.Chem import AllChem
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [66]:
#initial loading

ddi_fp = "drugbank\drugbank.tab"
ddi = pd.read_csv(ddi_fp, sep='\t')

kaggle_fp = "SMILES-Kaggle\chembl_22_clean_1576904_sorted_std_final.smi"
smiles = pd.read_csv(kaggle_fp, sep='\t')

drug_names_fp = "drugs.txt"
drug_names = pd.read_csv(drug_names_fp, sep='\t')

ddi["Y"] = ddi["Y"].astype("category")
ddi["Map"] = ddi["Map"].astype("category")

interaction_counts = pd.DataFrame(ddi['Y'].value_counts().rename_axis('value').reset_index(name='count')).sort_values(by='count', ascending=False)
interaction_counts['row_num'] = interaction_counts.index + 1
interaction_counts['log_count'] = np.log(interaction_counts['count'])

interaction_types = ddi[['Y','Map']].drop_duplicates(subset=['Y'])

ddi = ddi.drop("Map",axis=1)

ddi

  ddi_fp = "drugbank\drugbank.tab"
  kaggle_fp = "SMILES-Kaggle\chembl_22_clean_1576904_sorted_std_final.smi"


Unnamed: 0,ID1,ID2,Y,X1,X2
0,DB04571,DB00460,1,CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
1,DB00855,DB00460,1,NCC(=O)CCC(O)=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
2,DB09536,DB00460,1,O=[Ti]=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
3,DB01600,DB00460,1,CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
4,DB09000,DB00460,1,CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
...,...,...,...,...,...
191803,DB00437,DB00492,86,OC1=NC=NC2=C1C=NN2,CCC(=O)O[C@@H](O[P@](=O)(CCCCC1=CC=CC=C1)CC(=O...
191804,DB00437,DB09477,86,OC1=NC=NC2=C1C=NN2,[H][C@@](C)(N[C@@]([H])(CCC1=CC=CC=C1)C(O)=O)C...
191805,DB00437,DB00790,86,OC1=NC=NC2=C1C=NN2,[H][C@]12C[C@H](N(C(=O)[C@H](C)N[C@@H](CCC)C(=...
191806,DB00415,DB00437,86,[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)[C@...,OC1=NC=NC2=C1C=NN2


In [67]:
def delist(list_of_lists):
    list_of_strings = []
    for inner_list in list_of_lists:
        string = inner_list[0]
        list_of_strings.append(string)
    return list_of_strings

In [68]:
#preprocessing

old = pd.DataFrame()
old["total"] = ddi['ID1'].value_counts()
old = old.reset_index()
old.columns = ['ID', 'count'] 
new = pd.DataFrame()
new["total"] = ddi['ID2'].value_counts()
new = new.reset_index()
new.columns = ['ID', 'count'] 

drug_counts = pd.merge(old,new,how='outer',on='ID').fillna(0)
drug_counts['total'] = drug_counts['count_x'] + drug_counts['count_y']

drug_counts = drug_counts.sort_values(by='total')
drug_counts_one = pd.DataFrame(drug_counts[drug_counts['total']==1]['ID'])

ddi_proc = ddi[ ~ddi['ID1'].isin(drug_counts_one['ID'])]
ddi_proc = ddi_proc[ ~ddi_proc['ID2'].isin(drug_counts_one['ID'])]
ddi_proc = ddi_proc[ddi_proc['X1']!="OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1"]


  ddi_proc = ddi_proc[ddi_proc['X1']!="OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1"]


In [69]:
#create main datasets

data = dc.data.NumpyDataset(X=ddi_proc[['X1','X2']], y=ddi[['Y']])
df = data.to_dataframe()
df = df.sample(frac=1).reset_index(drop=True)

X_one = delist(df[["X1"]].values.tolist())
X_two = delist(df[["X2"]].values.tolist())

In [74]:
#featurize using circular fingerprint

cf_featurizer = dc.feat.CircularFingerprint()

#other way to featurize a molecule
#cm_featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)

#Need Discovery access to run these
#df.rename(columns={'ids':'col'},  inplace=True)
#df_x1_cf = pd.DataFrame(cf_featurizer(X_one))
#df_x1_cf.rename(columns=lambda x: "x1_cf_"+str(x+1), inplace=True)
#df_x1_cf['col'] = df_x1_cf.index
#df_x2_cf = pd.DataFrame(cf_featurizer(X_two))
#df_x2_cf.rename(columns=lambda x: "x2_cf_"+str(x+1), inplace=True)
#df_x2_cf['col'] = df_x2_cf.index

#df = df.merge(df_x1_cf, on="col")
#df = df.merge(df_x2_cf, on="col")

#example process with just 8000 rows
df_example = df.head(1000)
df_example['col'] = df_example.index

df_example_x1_cf = pd.DataFrame(cf_featurizer(X_one[0:1000]))
df_example_x1_cf.rename(columns=lambda x: "x1_cf_"+str(x+1), inplace=True)
df_example_x1_cf['col'] = df_example_x1_cf.index
df_example_x2_cf = pd.DataFrame(cf_featurizer(X_two[0:1000]))
df_example_x2_cf.rename(columns=lambda x: "x2_cf_"+str(x+1), inplace=True)
df_example_x2_cf['col'] = df_example_x2_cf.index

df_example = df_example.merge(df_example_x1_cf, on="col")
df_example = df_example.merge(df_example_x2_cf, on="col")

df_example

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_example['col'] = df_example.index


Unnamed: 0,X1,X2,y,w,ids,col,x1_cf_1,x1_cf_2,x1_cf_3,x1_cf_4,...,x2_cf_2039,x2_cf_2040,x2_cf_2041,x2_cf_2042,x2_cf_2043,x2_cf_2044,x2_cf_2045,x2_cf_2046,x2_cf_2047,x2_cf_2048
0,CC(C)OC(=O)C(C)(C)OC1=CC=C(C=C1)C(=O)C1=CC=C(C...,FC1=CC=C(CC2=NNC(=O)C3=CC=CC=C23)C=C1C(=O)N1CC...,47,1.0,67210,0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CN(C)CCCN1C2=CC=CC=C2SC2=C1C=C(Cl)C=C2,NS(=O)(=O)C1=C(Cl)C=C2NCNS(=O)(=O)C2=C1,49,1.0,116447,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CN1CCN(CCCN2C3=CC=CC=C3SC3=C2C=C(C=C3)C(F)(F)F...,CCC1=C(C)NC2=C1C(=O)C(CN1CCOCC1)CC2,49,1.0,119616,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CCN1N=C(C(O)=O)C(=O)C2=CC3=C(OCO3)C=C12,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)O[Ca...,67,1.0,143645,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,NC1=CC=C(C=C1)S(=O)(=O)NC1=CC=NN1C1=CC=CC=C1,CC(C)N(CC[C@H](C1=CC=CC=C1)C1=C(OC(=O)C(C)C)C=...,77,1.0,189253,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,CN(C)CCOC(C1=CC=CC=C1)C1=CC=CC=C1C,CN(C)CCOC(C1=CC=C(Cl)C=C1)C1=CC=CC=N1,16,1.0,16226,995,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,CC(C)NCC(O)COC1=CC=CC=C1CC=C,COC1=CC(C)=NN1C1=NC(C)=CC(OC)=N1,37,1.0,30828,996,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,CN[C@@H](C)[C@@H](O)C1=CC=CC=C1,[OH-].[OH-].[OH-].[Al+3],72,1.0,153643,997,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,COC1=C(C=C(C=C1)C1=CC2=C(C=C1)C=C(C=C2)C(O)=O)...,[H][C@@]12C[C@@]3([H])[C@]4([H])CCC5=CC(=O)C=C...,49,1.0,96227,998,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
#Basic dimensionality reduction down to 20 components from 4097

df_X = df_example.iloc[:,5:]
df_Y = to_categorical(df_example["y"])

n_components = 20
batch_size = 20

ipca = IncrementalPCA(n_components=n_components)

for i in range(0, df_X.shape[0], batch_size):
    X_batch = df_X[i:i + batch_size]
    ipca.partial_fit(X_batch)

X_transformed = ipca.transform(df_X)

#prove that X's shape has changed
#print("Original shape:", df_X.shape)
#print("Transformed shape:", X_transformed.shape)

df_X_proc = pd.DataFrame(X_transformed)

X_train, X_test, y_train, y_test = train_test_split(df_X_proc, df_Y, test_size=0.2, random_state=1337)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [110]:
#basic/rough neural network implementation

model = keras.models.Sequential([
    Dense(20, activation='relu'),
    keras.layers.Dropout(0.2),
    Dense(40, activation='relu'),
    Dense(80, activation='relu'),
    Dense(160, activation='relu'),
    Dense(86, activation='softmax')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=20, validation_split=0.1)

loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.0342 - loss: 0.5300 - val_accuracy: 0.3625 - val_loss: 0.0554
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3115 - loss: 0.0540 - val_accuracy: 0.4625 - val_loss: 0.0431
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3165 - loss: 0.0488 - val_accuracy: 0.4625 - val_loss: 0.0411
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3016 - loss: 0.0462 - val_accuracy: 0.3625 - val_loss: 0.0405
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3313 - loss: 0.0431 - val_accuracy: 0.4500 - val_loss: 0.0387
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2887 - loss: 0.0427 - val_accuracy: 0.4125 - val_loss: 0.0379
Epoch 7/10
[1m36/36[0m [32m━━━━━━━━━━