<a href="https://colab.research.google.com/github/cosineDaDa0623/2023_Summer_Intern/blob/master/Pytorch_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install RDKit
RDKit is used to convert SMILES strings into ECFP fingerprints, which are the input features used to train the model.

In [None]:
!pip install rdkit



# Package import
Import the necessary packages

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import pandas as pd #Pandas have Dataframe, so the manipulation of data is efficient and convenient
import numpy as np #Linear math packages, which are important for the data
from google.colab import drive #Just used to get file from the google drive
# Mount Google Drive so the spreadsheet stored there can be read (Colab only).
drive.mount('/content/drive')
# Load the raw spreadsheet into a DataFrame.
df = pd.read_excel('/content/drive/MyDrive/5HT2A models data.xlsx')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): this repeats the read_excel call at the end of the previous cell, and `df` is not
# referenced again in the visible cells (Compound reads the file itself) - likely removable.
df = pd.read_excel('/content/drive/MyDrive/5HT2A models data.xlsx')

In [None]:
#Import Pytorch which is a commonly-used deeplearning package
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

# Dataset building

In [None]:
#get the ecfp from the smiles
def smiles_to_ecfp4(smiles_string, radius=4, n_bits=2048):
  """Convert a SMILES string into a Morgan (ECFP-style) bit-vector fingerprint.

  Args:
    smiles_string: SMILES representation of the molecule.
    radius: Morgan radius passed to RDKit. Defaults to 4, the value this
      notebook has always used.
    n_bits: Length of the folded bit vector. Defaults to 2048, which matches
      the input width of the NN1 model defined in this notebook.

  Returns:
    The bit vector produced by AllChem.GetMorganFingerprintAsBitVect.

  Raises:
    ValueError: If RDKit cannot parse `smiles_string`.

  NOTE(review): "ECFP4" conventionally means diameter 4, i.e. Morgan radius 2;
  radius=4 corresponds to ECFP8. The default is left at 4 so that existing
  results and saved weights stay comparable - confirm which was intended.
  """
  mol = Chem.MolFromSmiles(smiles_string)
  # MolFromSmiles returns None (instead of raising) when the SMILES is invalid.
  if mol is None:
    raise ValueError(f"Could not parse SMILES string: {smiles_string!r}")
  return AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
#the labels are strings, so we must convert them into numbers
def get_response(ligand_class):
  """Translate a ligand-class label string into its integer class index.

  Args:
    ligand_class: One of the seven label strings listed below.

  Returns:
    The integer code (0-6) assigned to that label.

  Raises:
    KeyError: If `ligand_class` is not one of the known labels.
  """
  # The position of a label in this tuple is the integer code assigned to it.
  known_classes = (
      'dude_active',
      'dude_inactive',
      'HTR2A_Antagonist_1.0',
      'HTR2A_Antagonist_0.0',
      'HTR2A_Agonist_1.0',
      'HTR2A_Agonist_0.0',
      'HTR2A_Inverse Agonist_1.0',
  )
  code_by_class = {name: code for code, name in enumerate(known_classes)}
  return code_by_class[ligand_class]
#Now construct the dataset
#So we can use the data when training the model
class Compound(Dataset):
  """Dataset of (fingerprint, class index) pairs built from a compound spreadsheet.

  The spreadsheet is expected to provide a 'smiles' column and a
  'ligand_class' column; both are read in __init__.
  """

  def __init__(self, inputfile):
    """Read `inputfile` with pandas.read_excel and derive fingerprints and labels.

    Args:
      inputfile: Path (or other object accepted by pandas.read_excel) of the spreadsheet.
    """
    df = pd.read_excel(inputfile)
    df['ECFP4'] = df['smiles'].apply(smiles_to_ecfp4)
    df['response'] = df['ligand_class'].apply(get_response)
    self.dataframe = df
    self.response = df['response']
    self.fingerprint = df['ECFP4']

  def __len__(self):
    """Return the number of rows (compounds) in the dataset."""
    return len(self.dataframe)

  def __getitem__(self, idx):
    """Return (feature, label) for the row at position `idx`.

    `feature` is the fingerprint expanded to a float NumPy array and `label`
    is the integer class code stored in the 'response' column.
    """
    # .iloc selects by position; plain series[idx] selects by index label and
    # would fail (or pick the wrong row) if the frame's index is not 0..n-1.
    feature = np.array(self.fingerprint.iloc[idx].ToList()).astype(float)
    label = self.response.iloc[idx]
    return feature, label

# Split the dataset into training, validation, and test sets


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split # this pacakge is used to split the dataset into train, validate and test datasets
from torch.utils.data import Subset #this package is used to create subsets

In [None]:
def split_the_dataset(in_dataset, train_frac = 0.7, valid_frac = 0.15, test_frac = 0.15, seed = 42):
  """Split a dataset into stratified training, validation and test subsets.

  Args:
    in_dataset: Indexable dataset whose items are (feature, label) pairs.
      Only a single label per item is supported.
    train_frac, valid_frac, test_frac: Fractions of the data assigned to each
      subset; they must sum to 1.
    seed: Random state passed to both train_test_split calls.

  Returns:
    (train_dataset, valid_dataset, test_dataset) as torch Subset objects over
    `in_dataset`.
  """
  # Compare with a tolerance: a float sum of fractions can miss 1.0 by rounding error.
  assert abs(train_frac + valid_frac + test_frac - 1.0) < 1e-9, \
    "train_frac, valid_frac and test_frac must sum to 1"
  # Collect the label of every item; it drives the stratification below.
  labels = [in_dataset[i][1] for i in range(len(in_dataset))]
  # First split: training indices vs. everything held out (validation + test).
  # The held-out share is valid_frac + test_frac so that training keeps train_frac.
  train_idx, holdout_idx = train_test_split(
    np.arange(len(labels)),
    test_size = valid_frac + test_frac,
    stratify = labels,
    random_state = seed
  )
  # Second split: divide the held-out indices between validation and test,
  # again preserving the label proportions.
  valid_idx, test_idx = train_test_split(
    holdout_idx,
    test_size = test_frac / (valid_frac + test_frac),
    stratify = [labels[i] for i in holdout_idx],
    random_state = seed
  )
  # Wrap the index arrays as views onto the original dataset.
  train_dataset = Subset(in_dataset, train_idx)
  valid_dataset = Subset(in_dataset, valid_idx)
  test_dataset = Subset(in_dataset, test_idx)
  return train_dataset, valid_dataset, test_dataset

In [None]:
# Build the full dataset from the spreadsheet on Google Drive.
compound_set = Compound('/content/drive/MyDrive/5HT2A models data.xlsx')
# Split using the function's default fractions and seed.
tr, va, tt = split_the_dataset(compound_set)
# Only the training loader is shuffled; validation and test keep a fixed order.
train_loader = DataLoader(tr, batch_size=32, shuffle=True)
valid_loader = DataLoader(va, batch_size=32, shuffle=False)
test_loader = DataLoader(tt, batch_size=32, shuffle=False)


In [None]:
# Sanity check: print the feature and label tensor shapes of the first training batch only.
for x, y in train_loader:
  print(x.size())
  print(y.size())
  break

torch.Size([32, 2048])
torch.Size([32])


# Model building


## Simple NN

In [None]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
class NN1(nn.Module):
    """Fully connected classifier over fixed-length fingerprint vectors.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout, followed by
    a final Linear layer that outputs one logit per class. With the default
    arguments the network is 2048 -> 1024 -> 512 -> 256 -> 7 and the module
    order inside `linear_layers` (and so the state_dict keys) is unchanged.

    Args:
        in_features: Width of the input vector.
        num_classes: Number of output logits.
        hidden_sizes: Widths of the hidden layers, in order.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, in_features=2048, num_classes=7, hidden_sizes=(1024, 512, 256), dropout=0.5):
        super().__init__()
        layers = []
        width = in_features
        for hidden in hidden_sizes:
            layers += [
                nn.Linear(width, hidden),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            width = hidden
        # Output layer: raw logits (CrossEntropyLoss applies the softmax itself).
        layers.append(nn.Linear(width, num_classes))
        self.linear_layers = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch `x` of shape (batch, in_features)."""
        return self.linear_layers(x)

In [None]:
loss_fn = nn.CrossEntropyLoss()
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train `model` for one pass (epoch) over `dataloader`.

    Args:
        dataloader: Yields (X, y) batches; X is cast to float32 before the forward pass.
        model: Module to train; it is switched to training mode.
        loss_fn: Callable taking (predictions, targets) and returning a scalar loss.
        optimizer: Optimizer that owns the parameters of `model`.

    Prints the loss of every 100th batch together with a running sample count.
    """
    size = len(dataloader.dataset)
    # Training mode matters for BatchNorm and Dropout layers (NN1 uses both):
    # it enables batch statistics and random unit dropping.
    model.train()
    # Run each batch on whichever device the model's parameters live on, so the
    # loop works unchanged for CPU and GPU models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X = X.float()
        if device is not None:
            X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef

def test_loop(dataloader, model, loss_fn):
    """Evaluate `model` on `dataloader` and print summary metrics.

    Prints accuracy, mean per-batch loss, macro-averaged precision / recall /
    F1 and the Matthews correlation coefficient. Nothing is returned.

    Args:
        dataloader: Yields (X, y) batches; X is cast to float32 before the forward pass.
        model: Module to evaluate; it is switched to evaluation mode.
        loss_fn: Callable taking (predictions, targets) and returning a scalar loss.
    """
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Keep track of true and predicted labels for the additional metrics
    all_true_labels = []
    all_predicted_labels = []

    # Evaluate on whichever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    with torch.no_grad():
        for X, y in dataloader:
            X = X.float()
            if device is not None:
                X, y = X.to(device), y.to(device)
            pred = model(X)
            predicted = pred.argmax(1)
            test_loss += loss_fn(pred, y).item()
            correct += (predicted == y).type(torch.float).sum().item()

            all_true_labels.extend(y.tolist())
            all_predicted_labels.extend(predicted.tolist())

    test_loss /= num_batches
    correct /= size

    # zero_division=0 yields the same scores as the default for classes that are
    # never predicted (score 0) but without emitting a warning on every call.
    precision = precision_score(all_true_labels, all_predicted_labels, average='macro', zero_division=0)
    recall = recall_score(all_true_labels, all_predicted_labels, average='macro', zero_division=0)
    f1 = f1_score(all_true_labels, all_predicted_labels, average='macro', zero_division=0)
    mcc = matthews_corrcoef(all_true_labels, all_predicted_labels)

    print("Test Error: ")
    print(f" Accuracy: {(100*correct):>0.1f}%")
    print(f" Avg loss: {test_loss:>8f}")
    print(f" Precision: {precision:>0.4f}")
    print(f" Recall: {recall:>0.4f}")
    print(f" F1 Score: {f1:>0.4f}")
    print(f" Matthew's Correlation Coefficient: {mcc:>0.4f}")

In [None]:
# Train a fresh NN1 with plain SGD.
nn1 = NN1()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nn1.parameters(), lr=0.01)

# NOTE(review): metrics are printed on test_loader after every epoch, while valid_loader is
# only evaluated once after training - the usual roles of the two splits look swapped; confirm.
epochs = 100
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_loader, nn1, loss_fn, optimizer)
    test_loop(test_loader, nn1, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.135488  [   32/ 3422]
loss: 1.010485  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 75.7%
 Avg loss: 1.003070
 Precision: 0.3289
 Recall: 0.3413
 F1 Score: 0.3321
 Matthew's Correlation Coefficient: 0.6354
Epoch 2
-------------------------------
loss: 0.869270  [   32/ 3422]
loss: 0.476328  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 80.2%
 Avg loss: 0.706125
 Precision: 0.4721
 Recall: 0.3702
 F1 Score: 0.3592
 Matthew's Correlation Coefficient: 0.7032
Epoch 3
-------------------------------
loss: 0.466625  [   32/ 3422]
loss: 0.553010  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 83.1%
 Avg loss: 0.579204
 Precision: 0.4686
 Recall: 0.3961
 F1 Score: 0.3930
 Matthew's Correlation Coefficient: 0.7490
Epoch 4
-------------------------------
loss: 0.389613  [   32/ 3422]
loss: 0.483849  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 84.8%
 Avg loss: 0.515018
 Precision: 0.4569
 Recall: 0.4188
 F1 Score: 0.4191
 Matthew's Correlation Coefficient: 0.7766
Epoch 5
-------------------------------
loss: 0.524752  [   32/ 3422]
loss: 0.382219  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 85.9%
 Avg loss: 0.471679
 Precision: 0.4502
 Recall: 0.4472
 F1 Score: 0.4459
 Matthew's Correlation Coefficient: 0.7925
Epoch 6
-------------------------------
loss: 0.562221  [   32/ 3422]
loss: 0.266042  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 86.8%
 Avg loss: 0.443795
 Precision: 0.4507
 Recall: 0.4653
 F1 Score: 0.4574
 Matthew's Correlation Coefficient: 0.8054
Epoch 7
-------------------------------
loss: 0.204200  [   32/ 3422]
loss: 0.199998  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 87.5%
 Avg loss: 0.424163
 Precision: 0.4568
 Recall: 0.4797
 F1 Score: 0.4673
 Matthew's Correlation Coefficient: 0.8155
Epoch 8
-------------------------------
loss: 0.443090  [   32/ 3422]
loss: 0.122772  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 88.2%
 Avg loss: 0.398992
 Precision: 0.4625
 Recall: 0.4894
 F1 Score: 0.4746
 Matthew's Correlation Coefficient: 0.8257
Epoch 9
-------------------------------
loss: 0.169165  [   32/ 3422]
loss: 0.161920  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 88.3%
 Avg loss: 0.388651
 Precision: 0.4636
 Recall: 0.4942
 F1 Score: 0.4770
 Matthew's Correlation Coefficient: 0.8275
Epoch 10
-------------------------------
loss: 0.194091  [   32/ 3422]
loss: 0.224921  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 87.9%
 Avg loss: 0.400230
 Precision: 0.4573
 Recall: 0.5009
 F1 Score: 0.4735
 Matthew's Correlation Coefficient: 0.8232
Epoch 11
-------------------------------
loss: 0.167197  [   32/ 3422]
loss: 0.144460  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 88.0%
 Avg loss: 0.394951
 Precision: 0.4594
 Recall: 0.4980
 F1 Score: 0.4752
 Matthew's Correlation Coefficient: 0.8254
Epoch 12
-------------------------------
loss: 0.059135  [   32/ 3422]
loss: 0.476877  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 88.3%
 Avg loss: 0.386946
 Precision: 0.5564
 Recall: 0.5106
 F1 Score: 0.4963
 Matthew's Correlation Coefficient: 0.8290
Epoch 13
-------------------------------
loss: 0.052876  [   32/ 3422]
loss: 0.390995  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.0%
 Avg loss: 0.366357
 Precision: 0.5720
 Recall: 0.5276
 F1 Score: 0.5188
 Matthew's Correlation Coefficient: 0.8390
Epoch 14
-------------------------------
loss: 0.319659  [   32/ 3422]
loss: 0.251549  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 88.7%
 Avg loss: 0.375609
 Precision: 0.5634
 Recall: 0.5238
 F1 Score: 0.5108
 Matthew's Correlation Coefficient: 0.8346
Epoch 15
-------------------------------
loss: 0.294939  [   32/ 3422]
loss: 0.299140  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 88.7%
 Avg loss: 0.376902
 Precision: 0.5648
 Recall: 0.5369
 F1 Score: 0.5284
 Matthew's Correlation Coefficient: 0.8348
Epoch 16
-------------------------------
loss: 0.039623  [   32/ 3422]
loss: 0.122495  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.0%
 Avg loss: 0.375856
 Precision: 0.5772
 Recall: 0.5616
 F1 Score: 0.5562
 Matthew's Correlation Coefficient: 0.8387
Epoch 17
-------------------------------
loss: 0.292024  [   32/ 3422]
loss: 0.025998  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.0%
 Avg loss: 0.374966
 Precision: 0.5746
 Recall: 0.5471
 F1 Score: 0.5415
 Matthew's Correlation Coefficient: 0.8389
Epoch 18
-------------------------------
loss: 0.114413  [   32/ 3422]
loss: 0.062326  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 88.8%
 Avg loss: 0.384682
 Precision: 0.5730
 Recall: 0.5607
 F1 Score: 0.5553
 Matthew's Correlation Coefficient: 0.8373
Epoch 19
-------------------------------
loss: 0.157080  [   32/ 3422]
loss: 0.039685  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.0%
 Avg loss: 0.381065
 Precision: 0.5685
 Recall: 0.5643
 F1 Score: 0.5569
 Matthew's Correlation Coefficient: 0.8394
Epoch 20
-------------------------------
loss: 0.108468  [   32/ 3422]
loss: 0.089307  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.4%
 Avg loss: 0.368329
 Precision: 0.5729
 Recall: 0.5746
 F1 Score: 0.5663
 Matthew's Correlation Coefficient: 0.8442
Epoch 21
-------------------------------
loss: 0.056252  [   32/ 3422]
loss: 0.114595  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.2%
 Avg loss: 0.378244
 Precision: 0.5736
 Recall: 0.5868
 F1 Score: 0.5758
 Matthew's Correlation Coefficient: 0.8429
Epoch 22
-------------------------------
loss: 0.038290  [   32/ 3422]
loss: 0.026192  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.2%
 Avg loss: 0.379940
 Precision: 0.5700
 Recall: 0.5894
 F1 Score: 0.5744
 Matthew's Correlation Coefficient: 0.8429
Epoch 23
-------------------------------
loss: 0.080872  [   32/ 3422]
loss: 0.040290  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.3%
 Avg loss: 0.382915
 Precision: 0.5700
 Recall: 0.5818
 F1 Score: 0.5701
 Matthew's Correlation Coefficient: 0.8434
Epoch 24
-------------------------------
loss: 0.104930  [   32/ 3422]
loss: 0.089443  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 88.7%
 Avg loss: 0.399413
 Precision: 0.5553
 Recall: 0.5988
 F1 Score: 0.5719
 Matthew's Correlation Coefficient: 0.8356
Epoch 25
-------------------------------
loss: 0.042983  [   32/ 3422]
loss: 0.074815  [ 3232/ 3422]


  _warn_prf(average, modifier, msg_start, len(result))


Test Error: 
 Accuracy: 89.3%
 Avg loss: 0.389133
 Precision: 0.5697
 Recall: 0.5998
 F1 Score: 0.5802
 Matthew's Correlation Coefficient: 0.8442
Epoch 26
-------------------------------
loss: 0.173765  [   32/ 3422]
loss: 0.074131  [ 3232/ 3422]
Test Error: 
 Accuracy: 89.3%
 Avg loss: 0.393209
 Precision: 0.7076
 Recall: 0.6057
 F1 Score: 0.5905
 Matthew's Correlation Coefficient: 0.8428
Epoch 27
-------------------------------
loss: 0.042902  [   32/ 3422]
loss: 0.050421  [ 3232/ 3422]
Test Error: 
 Accuracy: 89.3%
 Avg loss: 0.393785
 Precision: 0.7037
 Recall: 0.6076
 F1 Score: 0.5896
 Matthew's Correlation Coefficient: 0.8432
Epoch 28
-------------------------------
loss: 0.074874  [   32/ 3422]
loss: 0.014255  [ 3232/ 3422]
Test Error: 
 Accuracy: 89.7%
 Avg loss: 0.385769
 Precision: 0.8196
 Recall: 0.6168
 F1 Score: 0.6128
 Matthew's Correlation Coefficient: 0.8488
Epoch 29
-------------------------------
loss: 0.066102  [   32/ 3422]
loss: 0.198672  [ 3232/ 3422]
Test Error: 

In [None]:
# Report the same metrics for nn1 on the validation split.
test_loop(valid_loader, nn1, loss_fn)

Test Error: 
 Accuracy: 84.3%
 Avg loss: 0.456557
 Precision: 0.4675
 Recall: 0.4506
 F1 Score: 0.4502
 Matthew's Correlation Coefficient: 0.7765


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Persist nn1's weights (state_dict only, not the module object) to Google Drive.
torch.save(nn1.state_dict(), '/content/drive/MyDrive/model_NN1_parameters.pth')

In [None]:
# Train a second, freshly initialised NN1 with Adam and a small weight decay.
nn2 = NN1()
loss_fn = nn.CrossEntropyLoss()
# The optimizer must be built from nn2's own parameters; given another model's
# parameters, nn2 would never be updated and would stay at chance-level accuracy.
# lr=1e-3 is Adam's default step size; lr=1 is far too large for Adam to converge.
optimizer = torch.optim.Adam(nn2.parameters(), lr=1e-3, weight_decay=1e-5)

epochs = 100
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_loader, nn2, loss_fn, optimizer)
    test_loop(test_loader, nn2, loss_fn)
# Persist nn2's weights (state_dict only) to Google Drive.
torch.save(nn2.state_dict(), '/content/drive/MyDrive/model_NN2_parameters.pth')
print("Done!")

Epoch 1
-------------------------------
loss: 1.911221  [   32/ 3422]
loss: 1.988787  [ 3232/ 3422]
Test Error: 
 Accuracy: 17.7%
 Avg loss: 1.916155
 Precision: 0.1456
 Recall: 0.1274
 F1 Score: 0.0991
 Matthew's Correlation Coefficient: -0.0053
Epoch 2
-------------------------------
loss: 1.909779  [   32/ 3422]
loss: 2.024062  [ 3232/ 3422]
Test Error: 
 Accuracy: 16.9%
 Avg loss: 1.918852
 Precision: 0.1481
 Recall: 0.1198
 F1 Score: 0.0966
 Matthew's Correlation Coefficient: -0.0075
Epoch 3
-------------------------------
loss: 1.846298  [   32/ 3422]
loss: 1.878503  [ 3232/ 3422]
Test Error: 
 Accuracy: 19.0%
 Avg loss: 1.912986
 Precision: 0.1466
 Recall: 0.1282
 F1 Score: 0.1013
 Matthew's Correlation Coefficient: -0.0081
Epoch 4
-------------------------------
loss: 1.864084  [   32/ 3422]
loss: 1.916119  [ 3232/ 3422]
Test Error: 
 Accuracy: 17.6%
 Avg loss: 1.920307
 Precision: 0.1450
 Recall: 0.1230
 F1 Score: 0.0955
 Matthew's Correlation Coefficient: -0.0102
Epoch 5
----

KeyboardInterrupt: ignored