In [58]:
import pandas as pd
import numpy as np

In [59]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.cluster import KMeans

In [62]:
import torch
from torch import nn
from torch.autograd import Variable

In [64]:
# Load the DILI training table; the path is relative to the notebook's working directory.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which looks like a row
# index that was written into the CSV -- consider read_csv(..., index_col=0); confirm first.
traindata = pd.read_csv('data/trainDILI.csv')

In [65]:
traindata

Unnamed: 0,SMILES,Drug_Name,IsDILI
0,BrC(Cl)C(F)(F)F,Halothane,1.0
1,Br[C@@H](C(C)C)C(=O)NC(=O)N,Bromisoval,1.0
2,Brc1[nH]c2c3c1CC1N(CC(C=C1c3ccc2)C(=O)N[C@]1(O...,Bromocriptine,0.0
3,Brc1c(nc(nc1Oc1c(cc(cc1C)C#N)C)Nc1ccc(cc1)C#N)N,Etravirine,1.0
4,Brc1c2nccnc2ccc1NC=1NCCN=1,Brimonidine,0.0
...,...,...,...
961,s1cccc1CN(CCN(C)C)c1ncccc1,Methapyrilene,1.0
962,s1cccc1C\C(=C/c1n(Cc2ccc(cc2)C(O)=O)c(nc1)CCCC...,Eprosartan,0.0
963,s1cccc1\C=C\C1=NCCCN1C,Pyrantel,0.0
964,s1ccnc1NC(=O)C=1N(S(=O)(=O)c2c(cccc2)C=1O)C,Sudoxicam,1.0


In [66]:
# Subset of rows whose IsDILI flag equals 1.0.
trainpos = traindata.loc[traindata['IsDILI'].eq(1.0)]

In [67]:
trainpos

Unnamed: 0,SMILES,Drug_Name,IsDILI
0,BrC(Cl)C(F)(F)F,Halothane,1.0
1,Br[C@@H](C(C)C)C(=O)NC(=O)N,Bromisoval,1.0
3,Brc1c(nc(nc1Oc1c(cc(cc1C)C#N)C)Nc1ccc(cc1)C#N)N,Etravirine,1.0
6,Brc1cc(cc(Br)c1O)C(=O)c1c2c(oc1CC)cccc2,Benzbromarone,1.0
7,Brc1ccc(S(=O)(=O)N\C=N\CCSCc2nc(sc2)\N=C(\N)/N...,Ebrotidine,1.0
...,...,...,...
957,s1cccc1CC(=O)NC1C2SCC(COC(=O)C)=C(N2C1=O)C(O)=O,Cephalothin,1.0
958,s1cccc1CC(=O)NC1C2SCC(C[n+]3ccccc3)=C(N2C1=O)C...,Cephaloridine,1.0
961,s1cccc1CN(CCN(C)C)c1ncccc1,Methapyrilene,1.0
964,s1ccnc1NC(=O)C=1N(S(=O)(=O)c2c(cccc2)C=1O)C,Sudoxicam,1.0


In [68]:
# Subset of rows whose IsDILI flag equals 0.0.
trainneg = traindata.loc[traindata['IsDILI'].eq(0.0)]

In [69]:
trainneg

Unnamed: 0,SMILES,Drug_Name,IsDILI
2,Brc1[nH]c2c3c1CC1N(CC(C=C1c3ccc2)C(=O)N[C@]1(O...,Bromocriptine,0.0
4,Brc1c2nccnc2ccc1NC=1NCCN=1,Brimonidine,0.0
5,Brc1cc(Cl)c(Nc2c(cc3n(cnc3c2F)C)C(=O)NOCCO)cc1,Selumetinib,0.0
9,Brc1ccc(cc1)C(CCN(C)C)c1ncccc1,Brompheniramine,0.0
12,ClC(Cl)(P(O)(O)=O)P(O)(O)=O,Clodronic acid,0.0
...,...,...,...
956,s1cccc1C1OC2C(OC(OC3C4C(C(c5c3cc3OCOc3c5)c3cc(...,Teniposide,0.0
959,s1cccc1CC(=O)N[C@@]1(OC)C2SCC(COC(=O)N)=C(N2C1...,Cefoxitin,0.0
960,s1cccc1CCN1CCC(N(C(=O)CC)c2ccccc2)(CC1)COC,Sufentanil,0.0
962,s1cccc1C\C(=C/c1n(Cc2ccc(cc2)C(O)=O)c(nc1)CCCC...,Eprosartan,0.0


In [63]:
def validSMILES(smiles):
    """Return True if `smiles` is a string that RDKit can parse into a molecule.

    Non-string input (for example the float NaN that pandas produces for an
    empty CSV cell) is reported as invalid rather than being handed to RDKit,
    which rejects non-string arguments with an exception.
    """
    if not isinstance(smiles, str):
        return False
    # Chem.MolFromSmiles signals a parse failure by returning None.
    return Chem.MolFromSmiles(smiles) is not None

def morganArrayFromSmiles(smiles, radius=3, nBits=2048):
    """Convert a SMILES string into a numpy vector holding its Morgan fingerprint bits.

    Details: the Morgan fingerprint is essentially a reimplementation of the
    extended-connectivity fingerprint (ECFP). For every atom of the molecule the
    circular neighbourhood around that atom is enumerated up to the given radius,
    and each unique neighbourhood is hashed into a bit position bounded by the
    number of bits. A larger radius encodes larger fragments, so a radius-2
    fingerprint contains everything found at radius 1 plus additional features.
    Radius 2 (similar to ECFP4) and radius 3 (similar to ECFP6) are the usual
    choices. The right number of bits depends on the dataset: more bits make the
    fingerprint more discriminative, and a large, diverse dataset hashed into
    very few bits (e.g. 32) will suffer from collisions. 1024 bits is a sensible
    starting point; also try larger sizes to check how much information is lost.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    radius : int, default 3
        How many bonds away from each base atom the fingerprint looks.
    nBits : int, default 2048
        Hash size of the fingerprint, and therefore the length of the vector.

    Returns
    -------
    numpy.ndarray
        1-D array of length `nBits` (numpy's default float dtype) with the
        fingerprint bits.

    Raises
    ------
    TypeError
        If `smiles` is not a string.
    ValueError
        If RDKit cannot parse `smiles` into a molecule.
    """
    if not isinstance(smiles, str):
        raise TypeError(
            "smiles must be a str, got %s" % type(smiles).__name__
        )
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with a clear message instead of passing None to the fingerprinter.
        raise ValueError("could not parse SMILES string: %r" % (smiles,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    # Placeholder array: ConvertToNumpyArray resizes it in place to the fingerprint length.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
    

In [74]:
def createDataset(df, smiles_col='SMILES', target_col='IsDILI'):
    """Build the feature matrix X and target vector y for training an ML model.

    Rows whose SMILES string fails `validSMILES` are dropped, and the number of
    dropped rows is printed. Each remaining SMILES is converted to a fingerprint
    vector with `morganArrayFromSmiles`, so the width of X equals the length of
    the vector that function returns.

    The input frame is not modified: validity is tracked in a local boolean
    mask instead of a 'Valid' column written into `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the SMILES and target columns.
    smiles_col : str, default 'SMILES'
        Name of the column holding the SMILES strings.
    target_col : str, default 'IsDILI'
        Name of the column holding the target values.

    Returns
    -------
    X : numpy.ndarray
        One fingerprint row per valid molecule.
    y : numpy.ndarray
        Target values aligned with the rows of X.
    """
    # Local mask: avoids adding a helper column to the caller's DataFrame.
    valid_mask = df[smiles_col].apply(validSMILES).astype(bool)
    cleandf = df[valid_mask]
    X = np.array([morganArrayFromSmiles(sm) for sm in cleandf[smiles_col]])
    y = np.array(cleandf[target_col])
    # Sanity check: features and targets must stay row-aligned.
    assert len(X) == len(y)
    print(len(df)-len(cleandf),"invalid SMILES strings discarded")
    return X, y

In [75]:
# Build the fingerprint feature matrix X and the IsDILI target vector y from the full training table.
X,y = createDataset(traindata)

0 invalid SMILES strings discarded


In [78]:
X.shape

(966, 2048)

In [80]:
# Exploratory 2-cluster k-means over the fingerprint vectors.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto' (1.4),
# so leaving it implicit makes the clustering depend on the installed version
# (and emits a FutureWarning on 1.2-1.3).
kmeans = KMeans(n_clusters=2, random_state=0, n_init=10).fit(X)

In [81]:
kmeans.labels_

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [82]:
y

array([1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
       0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 0.

In [83]:
# Hold out a test set using train_test_split's default split size; random_state=1 makes the shuffle repeatable.
# NOTE(review): the split is not stratified on y -- consider stratify=y so both sets keep the same class ratio.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

In [84]:
# Baseline classifier: logistic regression with class_weight='balanced'.
# Commented-out alternatives previously in this cell: LogisticRegression() and GaussianNB().
skmodel = LogisticRegression(class_weight='balanced').fit(Xtrain, ytrain)

# Predictions on the data the model was fitted on and on the held-out split.
ytrain_model = skmodel.predict(Xtrain)
y_model = skmodel.predict(Xtest)

# Accuracy on both splits; F1 and MCC are computed on the held-out split only.
for _label, _score in (
    ('Train accuracy: ', accuracy_score(ytrain, ytrain_model)),
    ('Test accuracy: ', accuracy_score(ytest, y_model)),
    ('F1 score: ', f1_score(ytest, y_model)),
    ('Matthews correlation coefficient: ', matthews_corrcoef(ytest, y_model)),
):
    print(_label, _score)

Train accuracy:  0.9972375690607734
Test accuracy:  0.5867768595041323
F1 score:  0.5934959349593496
Matthews correlation coefficient:  0.17332786773245884


Basic logistic regression:
Train accuracy:  0.9972375690607734
Test accuracy:  0.5867768595041323
F1 score:  0.5934959349593496
Matthews correlation coefficient:  0.17332786773245884

In [85]:
Xtrain.shape

(724, 2048)