This notebook contains experiments on running a basic fully connected neural network to predict androgen receptor activity

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 17:21:32 2020

@author: julian

This file is for visualizing molecules in the SMILES format using rdkit
CCOC(=O)C(O)(C1=CC=C(Cl)C=C1)C2=CC=C(Cl)C=C2
"""
"""Useful commands
mol = Chem.MolFromSmiles('CCCO')
smiles = Chem.MolToSmiles(mol)
img = Draw.MolToImage(mol)
smiles_list = ['CCCO',
               'CCOC(=O)C(O)(C1=CC=C(Cl)C=C1)C2=CC=C(Cl)C=C2',
               '[H][C@]12C[C@@]3([H])[C@]4([H])CC[C@@]5([H])C[C@H](CC[C@]5(C)[C@@]4([H])CC[C@]3(C)[C@@]1([H])[C@H](C)[C@]6(CC[C@H](C)CN6)O2)O[C@@H]7O[C@H](CO)[C@H](O[C@@H]8O[C@H](CO)[C@@H](O)[C@H](O[C@@H]9OC[C@@H](O)[C@H](O)[C@H]9O)[C@H]8O[C@@H]%10O[C@H](CO)[C@@H](O)[C@H](O)[C@H]%10O)[C@H](O)[C@H]7O']
mol_list = list(map(Chem.MolFromSmiles, smiles_list))
imgs = Draw.MolsToGridImage(mol_list)
pattern = Chem.MolFromSmiles('C(=O)')
for mol in mol_list:
    print(mol.HasSubstructMatch(pattern))
    
glycine = Chem.MolFromSmiles('C(C(=O)O)N')
glyprint = AllChem.GetMorganFingerprintAsBitVect(glycine, radius=2, nBits=1024)
glyp_arr = np.zeros((1,))
DataStructs.ConvertToNumpyArray(glyprint, glyp_arr)
bi={}
glyanalyse = AllChem.GetMorganFingerprintAsBitVect(glycine, radius=2, nBits=1024, bitInfo=bi)
prints = [(glycine,x,bi) for x in glyanalyse.GetOnBits()]
Draw.DrawMorganBits(prints, molsPerRow=4, legends=[str(x) for x in glyanalyse.GetOnBits()])

#Tanimoto Similarity: Gives number of common on bits divided by total combined on bits
cysteine = Chem.MolFromSmiles('C([C@@H](C(=O)O)N)S')
cyanalyse = AllChem.GetMorganFingerprintAsBitVect(cysteine, radius=2, nBits=1024)

DataStructs.TanimotoSimilarity(glyanalyse,cyanalyse)

#apparently pickling molecules is much faster than 
#reparsing SMILES strings
"""

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem

import numpy as np
import pandas as pd
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Load the tab-separated SMILES file; column names are supplied explicitly.
data = pd.read_csv(
    'data/nr-ar.smiles',
    sep='\t',
    names=['SMILES', 'Identifier', 'Activity'],
)

# Build a tiny working sample: the first five rows labelled active (Activity == 1)
# plus the five rows at positions 100-104.
# NOTE(review): rows 100-104 are taken by position, not filtered on Activity;
# they happen to be inactive in the displayed output — confirm if the file changes.
is_active = data['Activity'] == 1
minipos = data.loc[is_active].head()
minineg = data.iloc[100:105]
minidata = pd.concat([minipos, minineg])

In [265]:
# Display the small positive + negative sample assembled in the previous cell.
minidata

Unnamed: 0,SMILES,Identifier,Activity
32,ClC1=CC=C(NC(=N)NC(=N)NCCCCCCNC(=N)NC(=N)NC2=C...,NCGC00258955-01,1
173,CCN(CC)C1=CC=C2C(C)=CC(=O)OC2=C1,NCGC00259338-01,1
198,CCN(CC)C1=CC=C2C(C)=CC(=O)OC2=C1,NCGC00257038-01,1
256,OC1=CC(=CC(O)=C1O)C(=O)OC2=CC(=CC(O)=C2O)C(=O)...,NCGC00095101-01,1
264,[H][C@@]12OC\C3=C/C=C/[C@H](C)C\C(C)=C\C[C@]4(...,NCGC00183118-01,1
100,CCCCCCCCCCCCN(CCO)CCO,NCGC00255856-01,0
101,[Cl-].CCCCCCCCCCCCCCCCN1C=C[N+](C)=C1,NCGC00258154-01,0
102,O[N+]([O-])=O.ClC1=CC=C(COC(CN2C=CN=C2)C3=C(Cl...,NCGC00257500-01,0
103,CCC(C)NC1=CC=C(NC(C)CC)C=C1,NCGC00257925-01,0
104,OC1=C(C=C(Cl)C=C1)C(=O)NC2=CC=C(C=C2Cl)[N+]([O...,NCGC00254654-01,0


In [5]:
def validSMILES(smiles):
    """Return True when RDKit can parse `smiles` into a molecule, else False.

    Chem.MolFromSmiles returns None for a string it cannot parse, so the
    check reduces to a None test on its result.
    """
    return Chem.MolFromSmiles(smiles) is not None

def morganArrayFromSmiles(smiles, nBits, radius=2):
    """Convert a SMILES string into a Morgan fingerprint as a numpy array.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    nBits : int
        Length of the fingerprint bit vector (and of the returned array).
    radius : int, optional
        How many bonds away from each base atom the fingerprint looks.
        Defaults to 2, the value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array filled in by DataStructs.ConvertToNumpyArray from the bit vector.

    Raises
    ------
    ValueError
        If RDKit cannot parse `smiles` (Chem.MolFromSmiles returned None).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with a clear message instead of handing None to the fingerprinter.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    # NOTE(review): recent RDKit releases flag GetMorganFingerprintAsBitVect as
    # deprecated in favour of rdFingerprintGenerator — confirm against the
    # installed RDKit version before migrating.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    # ConvertToNumpyArray resizes the destination array to match the bit vector.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
    

In [12]:
def createDataset(df, morgansize):
    """Build the feature matrix X and target vector y for training an ML model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SMILES' column and an 'Activity' column. The frame is
        NOT modified (no helper column is written back to it).
    morgansize : int
        Length of the Morgan fingerprint vector used as the feature row.

    Returns
    -------
    X : numpy.ndarray of shape (n_valid, morgansize)
        One fingerprint per row of `df` whose SMILES string is valid.
    y : numpy.ndarray of shape (n_valid,)
        The matching 'Activity' values.
    """
    # Keep the validity flags in a local mask rather than adding a 'Valid'
    # column to the caller's DataFrame.
    valid_mask = df['SMILES'].apply(validSMILES).astype(bool)
    cleandf = df[valid_mask]

    rows = [morganArrayFromSmiles(sm, morgansize) for sm in cleandf['SMILES']]
    # Explicit reshape keeps X two-dimensional even when no row is valid.
    X = np.array(rows, dtype=float).reshape(len(rows), morgansize)
    y = np.array(cleandf['Activity'])
    assert len(X) == len(y)
    return X, y

In [266]:
# Fingerprint length in bits; this is also the number of input features.
morgansize = 1024
# Build the feature matrix and target vector from the small minidata sample.
X,y = createDataset(minidata,morgansize)

In [267]:
# Hold out a test split; random_state is fixed so the shuffle is repeatable.
# NOTE(review): no stratify= argument is passed, so the class ratio in each
# split is not controlled — consider stratify=y for imbalanced data.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

In [268]:
# Baseline classifier: logistic regression with class weights balanced.
# (Plain LogisticRegression() and GaussianNB() were the other variants tried.)
skmodel = LogisticRegression(class_weight='balanced')
skmodel.fit(Xtrain, ytrain)

# Predictions on both splits so train and test performance can be compared.
ytrain_model = skmodel.predict(Xtrain)
y_model = skmodel.predict(Xtest)

for label, score in (
    ('Train accuracy: ', accuracy_score(ytrain, ytrain_model)),
    ('Test accuracy: ', accuracy_score(ytest, y_model)),
    ('F1 score: ', f1_score(ytest, y_model)),
):
    print(label, score)

Train accuracy:  1.0
Test accuracy:  0.6666666666666666
F1 score:  0.6666666666666666



With 50 bits in the fingerprint,
a Gaussian Naive Bayes model reaches 84 percent accuracy,
but that number is meaningless on its own: the F1 score is only 0.2.

with 2048 bits 
GNB
Train accuracy:  0.8306968790081232
Test accuracy:  0.791025641025641
F1 score:  0.12834224598930483

With balanced Logistic Regression
Train accuracy:  0.9844662961379507
Test accuracy:  0.9508547008547008
F1 score:  0.41624365482233505


In [269]:
# Show the baseline model's predicted labels for the test split.
y_model

array([1, 1, 0])

In [270]:
from torch import nn

In [271]:
from torch.autograd import Variable

In [272]:
# Hyperparameters for our network
input_size = morgansize
hidden_sizes = [128, 64]
output_size = 1
# Build a feed-forward network
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.Sigmoid())
print(model)

Sequential(
  (0): Linear(in_features=1024, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=1, bias=True)
  (5): Sigmoid()
)


In [273]:
# Training loss: mean squared error between the network output and the 0/1
# activity labels.
# NOTE(review): the network ends in a Sigmoid and the target is binary, so
# nn.BCELoss() may be a better fit than MSE — confirm before changing.
criterion = nn.MSELoss()

# CrossEntropyLoss() requires logits as the output and class labels as the target

In [274]:
# Inspect the fingerprint feature matrix (still a NumPy array at this point).
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [275]:
# Convert the feature matrix and label vector to float32 tensors.
# torch.as_tensor accepts both ndarrays and tensors, so re-running this cell
# after X and y have already been converted is harmless; torch.from_numpy
# would raise TypeError when handed a tensor.
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)

In [276]:
# Shape of the training features from the earlier NumPy split
# (the tensor-based split is created in the following cell).
Xtrain.shape

(7, 1024)

In [277]:
# Redo the split on the tensor versions of X and y, using the same
# random_state as the earlier NumPy split.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

In [278]:
# Inspect the training features after the tensor-based split.
Xtrain

tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [279]:
# Stochastic gradient descent over all parameters of the network, learning rate 0.003.
optimizer = torch.optim.SGD(model.parameters(), lr=0.003)

In [280]:
#def training(epochs)
epochs = 500
for e in range(epochs):
    output = model(Xtrain)
    labels = ytrain.unsqueeze(1)
    optimizer.zero_grad()
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
    if e % 100 == 0:
        print("Training loss: ",loss.item())

Training loss:  0.2535095512866974
Training loss:  0.2507546544075012
Training loss:  0.24833962321281433
Training loss:  0.24605225026607513
Training loss:  0.24379852414131165


In [281]:
# Turn the sigmoid outputs of the last training forward pass into hard 0/1
# labels by thresholding at 0.5, then count how many were predicted positive.
predictions = (output.detach().numpy() > 0.5).astype(int)
predictions.sum()

3

In [282]:
# Training-set accuracy of the network. sklearn metrics take (y_true, y_pred)
# in that order; predictions are flattened from a column vector to 1-D so
# both arguments have the same shape.
accuracy_score(ytrain.numpy(), predictions.ravel())

0.8571428571428571

In [283]:
# Training-set F1 score of the network. sklearn metrics take (y_true, y_pred)
# in that order; predictions are flattened from a column vector to 1-D so
# both arguments have the same shape.
f1_score(ytrain.numpy(), predictions.ravel())

0.8571428571428571

In [260]:
# Inspect the raw sigmoid outputs of the network from the most recent forward pass.
output

tensor([[0.2878],
        [0.3000],
        [0.2977],
        ...,
        [0.2916],
        [0.3110],
        [0.2980]], grad_fn=<SigmoidBackward>)