In [1]:
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
from matplotlib import pyplot as plt
import seaborn as sns

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import rdkit
# Render the figure in a notebook:
%matplotlib inline  

from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score

In [2]:
X_train = pd.read_csv('MorganFingerprint_x_train.csv',index_col=0)
y_train = pd.read_csv('MorganFingerprint_y_train.csv',index_col=0)
y_train = y_train.drop(columns='smiles')

X_val = pd.read_csv('MorganFingerprint_x_val.csv',index_col=0)
y_val = pd.read_csv('MorganFingerprint_y_val.csv',index_col=0)
y_val = y_val.drop(columns='smiles')

In [13]:
y_train

Unnamed: 0,toxic
3702,1
4741,1
2922,1
5059,0
2249,1
...,...
1887,0
4302,1
1535,1
4048,1


In [14]:
### many of these parameters have been selected to make this similar to the previous perceptron

MLPC1 = MLPClassifier(hidden_layer_sizes=(1,), activation='identity', alpha=0, tol=0.001, solver='sgd', random_state=0, verbose=False)

### Below finish the code to fit and predict the data
MLPC1.fit(X_train,y_train)

y_MLPC1_train = MLPC1.predict(X_train)
y_MLPC1_val = MLPC1.predict(X_val)

MLPC1_train_bacc = balanced_accuracy_score(y_train,y_MLPC1_train)
MLPC1_val_bacc = balanced_accuracy_score(y_val,y_MLPC1_val)
    
print('Balanced Accuracy train:',MLPC1_train_bacc)
print('Balanced Accuracy validation:',MLPC1_val_bacc)


  y = column_or_1d(y, warn=True)


Balanced Accuracy train: 0.7530769753841969
Balanced Accuracy validation: 0.6887045965927354


In [15]:
def fit_and_metric(model):
    model.fit(X_train,y_train)
    y_model_train = model.predict(X_train)
    y_model_val = model.predict(X_val)  
    model_train_bacc = balanced_accuracy_score(y_train,y_model_train)
    model_val_bacc = balanced_accuracy_score(y_val,y_model_val)
    print('Balanced Accuracy train:',model_train_bacc)
    print('Balanced Accuracy validation:',model_val_bacc)


In [16]:
### code with the sigmoid function data

MLPC2 = MLPClassifier(hidden_layer_sizes=(1,), activation='logistic', alpha=0, tol=0.001, solver='sgd', random_state=0, verbose=False) #now using a logisitic activation? So a different function

fit_and_metric(MLPC2)

  y = column_or_1d(y, warn=True)


Balanced Accuracy train: 0.5
Balanced Accuracy validation: 0.5


In [17]:
### code with the tanh function data
MLPC3 = MLPClassifier(hidden_layer_sizes=(1,), activation='tanh', alpha=0, tol=0.001, solver='sgd', random_state=0, verbose=False) #now using a logisitic activation? So a different function
fit_and_metric(MLPC3)

  y = column_or_1d(y, warn=True)


Balanced Accuracy train: 0.7507139942880456
Balanced Accuracy validation: 0.691626486660238


In [18]:
### code with the ReLu function data
MLPC4 = MLPClassifier(hidden_layer_sizes=(1,), activation='relu', alpha=0, tol=0.001, solver='sgd', random_state=0, verbose=False) #now using a logisitic activation? So a different function
fit_and_metric(MLPC4)

  y = column_or_1d(y, warn=True)


Balanced Accuracy train: 0.7518473185547849
Balanced Accuracy validation: 0.6936001285760206


In [20]:
### code with the sigmoid function data

MLPC5 = MLPClassifier(hidden_layer_sizes=(10,5), activation='relu', alpha=0, tol=0.001, max_iter=250, solver='sgd', random_state=0, verbose=False)
# MPLCrandomsearch
### Below finish the code to fit and predict the data
fit_and_metric(MLPC5)

  y = column_or_1d(y, warn=True)


Balanced Accuracy train: 0.8813296160297384
Balanced Accuracy validation: 0.6954580520732883




# MLPC Hyperparameter tuning

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import plot_tree
from xgboost import XGBRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import learning_curve
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import GridSearch

In [None]:
y_train_array = y_train.to_numpy().flatten()

random_search_params = {'activation': ['relu','tanh','logistic','identity'],
    'hidden_layer_sizes': [(10,5)],
    'solver': ['sgd'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
RandomSVR = RandomizedSearchCV(estimator=MLPClassifier(), param_distributions=random_search_params, n_iter=50, cv=5, scoring='r2', random_state=0,verbose=2)
RandomSVR.fit(X_train,y_train_array)
#no tolerance - therefore not converging?? add tol=0.001 parameters
#cancelled after 1100 minutes!



Fitting 5 folds for each of 16 candidates, totalling 80 fits




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(10, 5), learning_rate=constant, solver=sgd; total time=  12.1s


# pytorch

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.nn as nn
import torch.nn.functional as F

from torchmetrics.classification import Accuracy
from torchmetrics.classification import Recall

In [None]:
### You have been using DataFrames because these are easy to deal with but here we need the data as an array

X_train_array = X_train.to_numpy()
X_train_tensor = torch.Tensor(X_train_array)

### Now some properties will be printed so you can see how this is similar to a NumPy array

print(f'X_train_tensor = {X_train_tensor}')

print(f'Size of X_train_tensor = {X_train_tensor.size()}') 

print(f'Data type of X_train_tensor = {X_train_tensor.dtype}')

X_train_tensor = tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Size of X_train_tensor = torch.Size([3686, 2048])
Data type of X_train_tensor = torch.float32


In [None]:
X_val_array = X_val.to_numpy()
X_val_tensor = torch.Tensor(X_val_array)

### Now some properties will be printed so you can see how this is similar to a NumPy array

print(f'X_val_tensor = {X_val_tensor}')

print(f'Size of X_val_tensor = {X_val_tensor.size()}') 

print(f'Data type of X_val_tensor = {X_val_tensor.dtype}')

X_val_tensor = tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Size of X_val_tensor = torch.Size([791, 2048])
Data type of X_val_tensor = torch.float32


In [None]:
y_train_array = y_train.to_numpy()
y_train_tensor = torch.Tensor(y_train_array)

### Now some properties will be printed so you can see how this is similar to a NumPy array

print(f'y_train_tensor = {y_train_tensor}')

print(f'Size of y_train_tensor = {y_train_tensor.size()}') 

print(f'Data type of y_train_tensor = {y_train_tensor.dtype}')

y_train_tensor = tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]])
Size of y_train_tensor = torch.Size([3686, 1])
Data type of y_train_tensor = torch.float32


In [None]:
y_val_array = y_val.to_numpy()
y_val_tensor = torch.Tensor(y_val_array)

### Now some properties will be printed so you can see how this is similar to a NumPy array

print(f'y_val_tensor = {y_val_tensor}')

print(f'Size of y_val_tensor = {y_val_tensor.size()}') 

print(f'Data type of y_val_tensor = {y_val_tensor.dtype}')

y_val_tensor = tensor([[0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
     

In [None]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
trainloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

In [None]:
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
valloader = DataLoader(val_dataset, batch_size=128, shuffle=True)

In [None]:
### effectively we are making something like LinearRegression ourselves

class MyPerceptron(nn.Module):    # this class inherits from nn.Module
    def __init__(self):
        super(MyPerceptron, self).__init__() #this calls the constructor of the parent class nn.Module
        
        # define network layers
        self.fc1 = nn.Linear(2048, 1)   # nn.Linear is a class for linear layers, i.e. input to the first hidden layer
        self.fc2 = nn.Linear(1, 2) # this is the hidden layer to the output layer with two outputs (0 or 1)
        
        # Here we are using a linear model as the activation function
    def forward(self,x):
        x = self.fc1(x)
        x = self.fc2(x)  
        return x

In [None]:
### set a random seed ###
torch.manual_seed(0)

### create the network set the criretia and how to optimize
net1 = MyPerceptron()
net1.train() ### this turns the model on for training (above there are no specific layers only involved in training but this is good practise)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net1.parameters(), lr=0.1)

### set the number of epochs to run
num_epochs = 100  
labels = labels.squeeze() #you fucking idiot. Put it inside the for loop!! Then it works.
for epoch in range(1,num_epochs+1):
    running_loss = 0.0
    for i, data in enumerate(trainloader):
        inputs, labels = data ### extacts the features and ground truth labels
        optimizer.zero_grad() ### removes gradients from previous run in backpropergation
        outputs = net1(inputs) ### gets the predictions - which is now the raw scores "logits"
        loss = criterion(outputs.squeeze(), labels.long()) ### calculates the loss of the outputs with the labels
        loss.backward() ### computes the gradients for backpropagation
        optimizer.step() ### updates the model based on the gradients
        running_loss += loss.item() ### adds the loss to the running_loss 

    avg_train_loss = running_loss / len(trainloader) ### calculates the average lss for the batches

    print('Epoch',epoch,'Loss =',avg_train_loss) ### prints the info

RuntimeError: 0D or 1D target tensor expected, multi-target not supported