This notebook contains experiments with a basic fully connected neural network that predicts androgen receptor activity from Morgan fingerprints of SMILES strings, alongside k-means and logistic-regression baselines.

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 17:21:32 2020

@author: julian

This file is for visualizing molecules in the SMILES format using rdkit
CCOC(=O)C(O)(C1=CC=C(Cl)C=C1)C2=CC=C(Cl)C=C2
"""
"""Useful commands
mol = Chem.MolFromSmiles('CCCO')
smiles = Chem.MolToSmiles(mol)
img = Draw.MolToImage(mol)
smiles_list = ['CCCO',
               'CCOC(=O)C(O)(C1=CC=C(Cl)C=C1)C2=CC=C(Cl)C=C2',
               '[H][C@]12C[C@@]3([H])[C@]4([H])CC[C@@]5([H])C[C@H](CC[C@]5(C)[C@@]4([H])CC[C@]3(C)[C@@]1([H])[C@H](C)[C@]6(CC[C@H](C)CN6)O2)O[C@@H]7O[C@H](CO)[C@H](O[C@@H]8O[C@H](CO)[C@@H](O)[C@H](O[C@@H]9OC[C@@H](O)[C@H](O)[C@H]9O)[C@H]8O[C@@H]%10O[C@H](CO)[C@@H](O)[C@H](O)[C@H]%10O)[C@H](O)[C@H]7O']
mol_list = list(map(Chem.MolFromSmiles, smiles_list))
imgs = Draw.MolsToGridImage(mol_list)
pattern = Chem.MolFromSmiles('C(=O)')
for mol in mol_list:
    print(mol.HasSubstructMatch(pattern))
    
glycine = Chem.MolFromSmiles('C(C(=O)O)N')
glyprint = AllChem.GetMorganFingerprintAsBitVect(glycine, radius=2, nBits=1024)
glyp_arr = np.zeros((1,))
DataStructs.ConvertToNumpyArray(glyprint, glyp_arr)
bi={}
glyanalyse = AllChem.GetMorganFingerprintAsBitVect(glycine, radius=2, nBits=1024, bitInfo=bi)
prints = [(glycine,x,bi) for x in glyanalyse.GetOnBits()]
Draw.DrawMorganBits(prints, molsPerRow=4, legends=[str(x) for x in glyanalyse.GetOnBits()])

#Tanimoto Similarity: Gives number of common on bits divided by total combined on bits
cysteine = Chem.MolFromSmiles('C([C@@H](C(=O)O)N)S')
cyanalyse = AllChem.GetMorganFingerprintAsBitVect(cysteine, radius=2, nBits=1024)

DataStructs.TanimotoSimilarity(glyanalyse,cyanalyse)

#apparently pickling molecules is much faster than 
#reparsing SMILES strings
"""

"Useful commands\nmol = Chem.MolFromSmiles('CCCO')\nsmiles = Chem.MolToSmiles(mol)\nimg = Draw.MolToImage(mol)\nsmiles_list = ['CCCO',\n               'CCOC(=O)C(O)(C1=CC=C(Cl)C=C1)C2=CC=C(Cl)C=C2',\n               '[H][C@]12C[C@@]3([H])[C@]4([H])CC[C@@]5([H])C[C@H](CC[C@]5(C)[C@@]4([H])CC[C@]3(C)[C@@]1([H])[C@H](C)[C@]6(CC[C@H](C)CN6)O2)O[C@@H]7O[C@H](CO)[C@H](O[C@@H]8O[C@H](CO)[C@@H](O)[C@H](O[C@@H]9OC[C@@H](O)[C@H](O)[C@H]9O)[C@H]8O[C@@H]%10O[C@H](CO)[C@@H](O)[C@H](O)[C@H]%10O)[C@H](O)[C@H]7O']\nmol_list = list(map(Chem.MolFromSmiles, smiles_list))\nimgs = Draw.MolsToGridImage(mol_list)\npattern = Chem.MolFromSmiles('C(=O)')\nfor mol in mol_list:\n    print(mol.HasSubstructMatch(pattern))\n    \nglycine = Chem.MolFromSmiles('C(C(=O)O)N')\nglyprint = AllChem.GetMorganFingerprintAsBitVect(glycine, radius=2, nBits=1024)\nglyp_arr = np.zeros((1,))\nDataStructs.ConvertToNumpyArray(glyprint, glyp_arr)\nbi={}\nglyanalyse = AllChem.GetMorganFingerprintAsBitVect(glycine, radius=2, nBits=1024

In [3]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem

import numpy as np
import pandas as pd


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score



In [4]:
import torch
from torch import nn
from torch.autograd import Variable

In [5]:
from sklearn.cluster import KMeans

In [6]:
# Read the tab-separated file, naming its three columns explicitly.
data = pd.read_csv(
    'data/nr-ar.smiles',
    sep='\t',
    names=['SMILES', 'Identifier', 'Activity'],
)

# Small hand-inspectable sample: the first five rows with Activity == 1
# plus the rows at positions 100-104.
# NOTE(review): minineg is a positional slice and is not filtered on Activity.
minipos = data.loc[data['Activity'] == 1].head()
minineg = data.iloc[100:105]
minidata = pd.concat([minipos, minineg])

In [7]:
# Build a class-balanced frame: every active compound plus an equally sized
# random sample of inactive compounds.
balpos = data[data['Activity']==1]
balneg = data[data['Activity']==0]
# The sample size follows the number of positives instead of a hard-coded 380,
# so the frame stays balanced if the input file changes; random_state keeps
# the draw reproducible. (Raises if there are fewer negatives than positives.)
balneg = balneg.sample(n=len(balpos), random_state=4)
baldata = pd.concat([balpos,balneg])
baldata

Unnamed: 0,SMILES,Identifier,Activity
32,ClC1=CC=C(NC(=N)NC(=N)NCCCCCCNC(=N)NC(=N)NC2=C...,NCGC00258955-01,1
173,CCN(CC)C1=CC=C2C(C)=CC(=O)OC2=C1,NCGC00259338-01,1
198,CCN(CC)C1=CC=C2C(C)=CC(=O)OC2=C1,NCGC00257038-01,1
256,OC1=CC(=CC(O)=C1O)C(=O)OC2=CC(=CC(O)=C2O)C(=O)...,NCGC00095101-01,1
264,[H][C@@]12OC\C3=C/C=C/[C@H](C)C\C(C)=C\C[C@]4(...,NCGC00183118-01,1
...,...,...,...
5968,C\C=C(/C)C(=O)OCC1=CC=CC=C1,NCGC00256881-01,0
6274,ClC1=CC=CC(Cl)=C1Cl,NCGC00254273-01,0
9131,O=C1NN=CC2=CC=CC=C12,NCGC00257841-01,0
126,OC1=CC=C([Hg]Cl)C=C1,NCGC00181142-02,0


In [8]:
def validSMILES(smiles):
    """Report whether RDKit can parse ``smiles`` into a molecule.

    Returns True when ``Chem.MolFromSmiles`` yields a molecule object and
    False when it yields None.
    """
    return Chem.MolFromSmiles(smiles) is not None

def morganArrayFromSmiles(smiles, nBits, radius=3):
    """Convert a SMILES string into a Morgan fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.
    nBits : int
        Length of the fingerprint bit vector requested from RDKit.
    radius : int, optional
        Morgan radius, i.e. how far away from each base atom the fingerprint
        looks. Defaults to 3, the value that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        The array filled in by ``DataStructs.ConvertToNumpyArray``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with a readable message rather than handing None to RDKit.
        raise ValueError(f"Invalid SMILES string: {smiles!r}")
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    # Placeholder array; ConvertToNumpyArray fills it from the fingerprint.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
    

In [9]:
def createDataset(df, morgansize):
    """Build the feature matrix and target vector for model training.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'SMILES' column and an 'Activity' target column.
        It is not modified.
    morgansize : int
        Length of the Morgan fingerprint vector used as features.

    Returns
    -------
    X : numpy.ndarray
        One fingerprint row per row of ``df`` whose SMILES passes
        ``validSMILES``; shape (n_valid, morgansize), also when n_valid is 0.
    y : numpy.ndarray
        The 'Activity' values of those same rows.
    """
    # Build the validity mask on the side instead of writing a 'Valid' column
    # into the caller's frame.
    is_valid = df['SMILES'].apply(validSMILES).astype(bool)
    cleandf = df[is_valid]
    fingerprints = [morganArrayFromSmiles(sm, morgansize) for sm in cleandf['SMILES']]
    # The explicit reshape keeps X two-dimensional even when no row is valid.
    X = np.array(fingerprints).reshape(len(fingerprints), morgansize)
    y = np.array(cleandf['Activity'])
    assert len(X) == len(y)
    return X, y

In [10]:
# Fingerprint length (number of bits) used as the feature-vector size
morgansize = 2048
# Build the feature matrix X and the target vector y from the balanced frame
X,y = createDataset(baldata,morgansize)

In [11]:
# Two clusters with a fixed seed; fit() hands back the fitted estimator,
# so re-binding the name keeps the cell free of displayed output.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans = kmeans.fit(X)

In [12]:
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,

In [13]:
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [14]:
# Split the NumPy features/labels into train and test parts with a fixed seed;
# no test_size is passed, so scikit-learn's default split proportion applies
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

In [15]:
# Baseline classifier on the fingerprint features.
# Alternatives that can be swapped in here: LogisticRegression() or GaussianNB()
skmodel = LogisticRegression(class_weight='balanced')
skmodel.fit(Xtrain, ytrain)

# Predict on both splits so the train and test scores can be compared.
ytrain_model = skmodel.predict(Xtrain)
y_model = skmodel.predict(Xtest)

print('Train accuracy: ', accuracy_score(ytrain, ytrain_model))
print('Test accuracy: ', accuracy_score(ytest, y_model))
print('F1 score: ', f1_score(ytest, y_model))

Train accuracy:  0.9929824561403509
Test accuracy:  0.7473684210526316
F1 score:  0.7303370786516854



With 50 bits in the fingerprint, a Gaussian Naive Bayes model reaches 84 percent accuracy,
but this is meaningless: the F1 score is only 0.2.

with 2048 bits 
GNB
Train accuracy:  0.8306968790081232
Test accuracy:  0.791025641025641
F1 score:  0.12834224598930483

Train accuracy:  0.9210526315789473
Test accuracy:  0.5842105263157895
F1 score:  0.6220095693779905

With balanced Logistic Regression
Train accuracy:  0.9844662961379507
Test accuracy:  0.9508547008547008
F1 score:  0.41624365482233505

on all the positive data and an equal amount of negative data
Train accuracy:  0.987719298245614
Test accuracy:  0.7578947368421053
F1 score:  0.7228915662650602

Changing the radius to 3 and using 2048 bits gives
Train accuracy:  0.9929824561403509
Test accuracy:  0.7473684210526316
F1 score:  0.7303370786516854


In [16]:
y_model

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0])

In [17]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [18]:
"""# Hyperparameters for our network
input_size = morgansize
hidden_sizes = [128, 64]
output_size = 1
# Build a feed-forward network
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.Sigmoid())
print(model)"""

# Seed PyTorch so the randomly initialised weights are the same on every run.
torch.manual_seed(0)

# Hyperparameters for the network
input_size = morgansize
hidden_sizes = [128, 64]  # only hidden_sizes[1] is used by the model below
output_size = 1

# Feed-forward network with one hidden layer and a sigmoid on the single output
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.Sigmoid())
print(model)

Sequential(
  (0): Linear(in_features=2048, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=1, bias=True)
  (3): Sigmoid()
)


In [19]:
# Mean-squared-error loss between the network output and the target labels
criterion = nn.MSELoss()

# CrossEntropyLoss() requires logits as the output and class labels as the target
# NOTE(review): with a single sigmoid output, nn.BCELoss() may be the more usual choice -- confirm before switching

In [20]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [21]:
# Convert features and labels to float32 tensors.
# torch.as_tensor accepts both NumPy arrays and tensors, so this cell can be
# re-run safely; torch.from_numpy fails once X and y are already tensors.
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)

In [22]:
Xtrain.shape

(570, 2048)

In [23]:
# Re-split now that X and y are torch tensors, reusing random_state=1;
# this overwrites Xtrain/Xtest/ytrain/ytest from the earlier NumPy split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

In [24]:
Xtrain

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [25]:
# Adam over all parameters of the network, learning rate 1e-2
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-2)

In [31]:
# Full-batch training: every epoch runs one forward/backward pass over Xtrain.
epochs = 10
# The targets are the same in every epoch, so reshape them once up front;
# unsqueeze(1) adds a trailing dimension to line them up with the model output.
labels = ytrain.unsqueeze(1)
model.train()
for e in range(epochs):
    optimizer.zero_grad()
    output = model(Xtrain)
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
    # Report the loss on every second epoch
    if e % 2 == 0:
        print("Training loss: ",loss.item())

Training loss:  0.008187616243958473
Training loss:  0.004115677904337645
Training loss:  0.0025309291668236256
Training loss:  0.0020008243154734373
Training loss:  0.00151570956222713


In [32]:
# `output` left over from the training loop was computed before the final
# optimizer step, so run a fresh forward pass to score the final weights.
# no_grad() skips autograd bookkeeping, which is not needed for inference.
with torch.no_grad():
    trainoutput = model(Xtrain)
# Threshold the sigmoid outputs at 0.5 to get 0/1 class predictions
predictions = np.where(trainoutput.numpy() > 0.5, 1, 0)
predictions.sum()

288

In [33]:
# Inference only: no_grad() avoids building an autograd graph for the test pass
with torch.no_grad():
    testoutput = model(Xtest)
# Threshold the sigmoid outputs at 0.5 to get 0/1 class predictions
testpredictions = np.where(testoutput.numpy() > 0.5, 1, 0)
testpredictions.sum()

87

In [34]:
# scikit-learn metrics take (y_true, y_pred). The label tensors are converted
# to NumPy for both splits, and the (n, 1) prediction arrays are flattened so
# both arguments are one-dimensional.
print('Train accuracy: ',accuracy_score(ytrain.numpy(), predictions.ravel()))
print('Test accuracy: ',accuracy_score(ytest.numpy(), testpredictions.ravel()))
print('F1 score: ',f1_score(ytest.numpy(), testpredictions.ravel()))

Train accuracy:  0.9982456140350877
Test accuracy:  0.6947368421052632
F1 score:  0.6741573033707866


In [30]:
model.parameters

<bound method Module.parameters of Sequential(
  (0): Linear(in_features=2048, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=1, bias=True)
  (3): Sigmoid()
)>