This notebook contains the code for the classification model.

In [36]:
%pip install torch torchvision torchaudio
%pip install torch-geometric

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [37]:
%pip install molvs

Note: you may need to restart the kernel to use updated packages.


In [38]:
import os
import pickle
import torch
from torch import nn
from torch.optim import Adam
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
import numpy as np
from rdkit import Chem
from torch_geometric.utils import from_smiles
import pandas as pd
from molvs import standardize_smiles
from sklearn.ensemble import RandomForestClassifier
# Single place for the data location and the tunable settings of this notebook.
# Only 'data_dir' and 'device' are read by the cells shown in this part of the
# notebook; the remaining entries are model/training hyper-parameters.
CONFIG = dict(
    data_dir='./processed_tox21',
    hidden_channels=128,
    num_layers=3,
    dropout=0.2,
    batch_size=64,
    lr=1e-3,
    weight_decay=0,
    epochs=50,
    patience=8,
    # Use the GPU automatically when CUDA is available, otherwise fall back to CPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

print(f"Using device: {CONFIG['device']}")


Using device: cpu


Load data from preprocessing

In [39]:
def load_split(name):
    """Load one pickled dataset split from ``CONFIG['data_dir']``.

    Parameters
    ----------
    name : str
        Split identifier; the file read is ``tox21_<name>.pkl``.

    Returns
    -------
    object
        Whatever object was pickled into the split file.

    Raises
    ------
    FileNotFoundError
        If the split file does not exist (raised by ``open``).
    """
    split_file = os.path.join(CONFIG['data_dir'], 'tox21_{}.pkl'.format(name))
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(split_file, 'rb') as handle:
        return pickle.load(handle)


# Read the three preprocessed splits in one go.
data_train, data_validation, data_test = (
    load_split(split_name) for split_name in ('train', 'validation', 'test')
)

# Report the size of each split, measured by its number of SMILES strings.
print("Train: {} | Validation: {} | Test: {}".format(
    len(data_train['smiles']),
    len(data_validation['smiles']),
    len(data_test['smiles']),
))

Train: 6258 | Validation: 782 | Test: 783


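Simplify the labels: collapse the per-assay labels into a single binary `toxic` label (the row-wise maximum, with missing values counted as 0)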

In [40]:
# Label tables for the train and test splits; missing entries (NaN) are replaced by 0.
y_train, y_test = (
    pd.DataFrame(split['labels']).fillna(0) for split in (data_train, data_test)
)

# Collapse every row to one value: the maximum over all label columns
# (i.e. 1 as soon as any label is 1, assuming 0/1 labels), stored as column 'toxic'.
y_train_label, y_test_label = (
    pd.DataFrame(filled.max(axis=1), columns=['toxic']) for filled in (y_train, y_test)
)

In [41]:
# Raw SMILES strings of the train and test splits.
smiles_train, smiles_test = data_train['smiles'], data_test['smiles']

Convert SMILES into ECFP6 fingerprints

In [42]:
# Code from https://drzinph.com/ecfp6-fingerprints-in-python-part-3/
from rdkit.Chem import AllChem
from rdkit import Chem, DataStructs
class ECFP6:
    """Compute ECFP6 (Morgan, radius 3) bit-vector fingerprints for SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to featurise. The iterable is copied into a list so that
        generators are not exhausted and an indexed pandas Series cannot be
        misaligned when it is later inserted as a column.
    n_bits : int, default 2048
        Length of every fingerprint bit vector. It is used both when the
        fingerprint is generated and when the output columns are named, so the
        two can no longer drift apart.

    Attributes
    ----------
    smiles : list of str
        The input SMILES, in input order.
    mols : list
        ``Chem.MolFromSmiles`` result for each SMILES (``None`` when RDKit
        cannot parse the string).
    """

    def __init__(self, smiles, n_bits=2048):
        self.smiles = list(smiles)
        self.n_bits = n_bits
        self.mols = [Chem.MolFromSmiles(s) for s in self.smiles]

    def mol2fp(self, mol, radius=3):
        """Return the Morgan fingerprint of ``mol`` as a 1-D NumPy array.

        Parameters
        ----------
        mol : rdkit.Chem.Mol
            Molecule to fingerprint.
        radius : int, default 3
            Morgan radius (3 corresponds to ECFP6).

        Returns
        -------
        numpy.ndarray
            Array of length ``n_bits`` holding the fingerprint bits.
        """
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=self.n_bits)
        # ConvertToNumpyArray resizes the destination array to the fingerprint length.
        array = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, array)
        return array

    def compute_ECFP6(self):
        """Build the fingerprint table for all molecules.

        Returns
        -------
        pandas.DataFrame
            One row per input SMILES: a leading ``smiles`` column followed by
            integer columns ``bit0`` ... ``bit<n_bits - 1>``.

        Raises
        ------
        ValueError
            If a SMILES string could not be parsed by RDKit; the message names
            the offending string instead of failing inside RDKit.
        """
        bit_headers = ['bit' + str(i) for i in range(self.n_bits)]

        # Collect the rows in a list and stack once; stacking inside the loop
        # re-copies the whole matrix on every iteration (quadratic cost).
        rows = []
        for smi, mol in zip(self.smiles, self.mols):
            if mol is None:
                raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
            rows.append(self.mol2fp(mol))

        # The reshape keeps the (0, n_bits) shape when there are no molecules.
        matrix = np.asarray(rows, dtype=int).reshape(len(rows), self.n_bits)
        df_ecfp6 = pd.DataFrame(matrix, columns=bit_headers)
        df_ecfp6.insert(loc=0, column='smiles', value=self.smiles)
        return df_ecfp6

In [43]:
# Standardise every SMILES string with MolVS before fingerprinting.
smiles = list(map(standardize_smiles, smiles_train))
sm = list(map(standardize_smiles, smiles_test))

[15:31:52] Can't kekulize mol.  Unkekulized atoms: 3 10


In [44]:
# One ECFP6 featuriser per split, built from the standardised SMILES.
ecfp6_descriptor_train = ECFP6(smiles)
ecfp6_descriptor_test = ECFP6(sm)

# Fingerprint tables: a leading 'smiles' column followed by one column per bit.
x_train_ecfp = ecfp6_descriptor_train.compute_ECFP6()
x_test_ecfp = ecfp6_descriptor_test.compute_ECFP6()



In [None]:
# Fingerprints and the binary label side by side (joined column-wise on the row index).
train_data = pd.concat((x_train_ecfp, y_train_label), axis='columns')
train_data.head()

Unnamed: 0,smiles,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,...,bit2039,bit2040,bit2041,bit2042,bit2043,bit2044,bit2045,bit2046,bit2047,0
0,COC(=O)[C@H]1CC[C@H](C(=O)OC)CC1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
1,O=S1(=O)CC(Cl)(Cl)C(Cl)(Cl)C1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
2,Cc1cc(C)c(NC(=O)C[C@H](CC(=O)[O-])c2cccc3ccccc...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1.0
3,Cc1noc(NS(=O)(=O)c2ccc(N)cc2)c1C,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
4,CC(C)(C)NC[C@H](O)COc1nsnc1N1CCOCC1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0


In [45]:

# Feature matrix only: every column after the leading 'smiles' column,
# i.e. the fingerprint bits that are fed to the classifier below.
x_train_ecfp.iloc[:, 1:]

Unnamed: 0,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,bit9,...,bit2038,bit2039,bit2040,bit2041,bit2042,bit2043,bit2044,bit2045,bit2046,bit2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6253,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6254,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6256,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Random Forest

In [57]:
# Forest of 100 trees with a fixed seed so runs are repeatable.
# max_depth is deliberately not set (a limit of 3 was tried and disabled),
# so the library default applies.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [58]:

# Train on the fingerprint bits only (the leading 'smiles' column is dropped).
# scikit-learn expects a 1-D target: y_train_label is a one-column DataFrame,
# so flatten it to avoid the DataConversionWarning for column-vector targets.
rf.fit(x_train_ecfp.iloc[:, 1:], y_train_label.to_numpy().ravel())

  return fit_method(estimator, *args, **kwargs)


In [59]:
# Predict the binary label for the test split from its fingerprint bits
# (same column selection as used for training: everything after 'smiles').
y_pred = rf.predict(x_test_ecfp.iloc[:, 1:])

In [60]:
from sklearn.metrics import accuracy_score, classification_report
# Overall accuracy first, then the per-class precision / recall / F1 breakdown.
test_accuracy = accuracy_score(y_test_label, y_pred)
test_report = classification_report(y_test_label, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

Accuracy: 0.7215836526181354

Classification Report:
               precision    recall  f1-score   support

         0.0       0.73      0.87      0.79       477
         1.0       0.71      0.49      0.58       306

    accuracy                           0.72       783
   macro avg       0.72      0.68      0.69       783
weighted avg       0.72      0.72      0.71       783

