This Notebook handles the code for the classification model

In [5]:
%pip install torch torchvision torchaudio
%pip install torch-geometric

Collecting torchvision
  Downloading torchvision-0.24.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (5.9 kB)
Collecting torchaudio
  Downloading torchaudio-2.9.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (6.9 kB)
Downloading torchvision-0.24.0-cp313-cp313-macosx_12_0_arm64.whl (1.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchaudio-2.9.0-cp313-cp313-macosx_12_0_arm64.whl (809 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m809.1/809.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchvision, torchaudio
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [torchaudio][0m [32m1/2[0m [torchaudio]
[1A[2KSuccessfully installed torchaudio-2.9.0 torchvision-0.24.0
Note: you may need to restart the kernel to use updated packages.
Collecting torch-geometric
  Downloading torch_geom

In [20]:
%pip install molvs

Collecting molvs
  Downloading MolVS-0.1.1.tar.gz (61 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: molvs
[33m  DEPRECATION: Building 'molvs' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'molvs'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for molvs (setup.py) ... [?25ldone
[?25h  Created wheel for molvs: filename=MolVS-0.1.1-py3-none-any.whl size=32374 sha256=08b3bcba0e7a5de11b94005ee040d18d30b802201b39451c6c679651b9d0392c
  Stored in directory: /Users/vinceflores/Library/Caches/pip/wheels/39/96/4e/ce6f7526d01db6c49e74c45eeba08c0c49eabaee2ff987206e
Successfully built molvs
Installing collec

In [61]:
import os
import pickle
import torch
from torch import nn
from torch.optim import Adam
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
import numpy as np
from rdkit import Chem
from torch_geometric.utils import from_smiles
import pandas as pd
from molvs import standardize_smiles
from sklearn.ensemble import RandomForestClassifier
CONFIG = {
    'data_dir': './processed_tox21',
    'hidden_channels': 128,
    'num_layers': 3,
    'dropout': 0.2,
    'batch_size': 64,
    'lr': 1e-3,
    'weight_decay': 0,
    'epochs': 50,
    'patience': 8,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# Automatically detects if you have a GPU
print(f"Using device: {CONFIG['device']}")


Using device: cpu


Load data from preprocessing

In [7]:
def load_split(name):
    path = os.path.join(CONFIG['data_dir'], f'tox21_{name}.pkl')
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data


data_train = load_split('train')
data_validation = load_split('validation')
data_test = load_split('test')

print(f"Train: {len(data_train['smiles'])} | Validation: {len(data_validation['smiles'])} | Test: {len(data_test['smiles'])}")

Train: 6258 | Validation: 782 | Test: 783


In [56]:

# print(data_train)
# print(data_validation)
print(data_test)

{'smiles': array(['O=C(O)c1ccc(S(=O)(=O)N(Cl)Cl)cc1', 'CCCC[Sn](CCCC)(CCCC)CCCC',
       'CCCC[Sn](Cl)(Cl)Cl', 'OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)CO',
       'CCN(CC)CCC(=O)N1c2ccccc2Sc2ccc(Cl)cc21',
       'O=C(O)/C=C(\\CC(=O)O)C(=O)O', 'CC(=O)Nc1ccc(S(N)(=O)=O)cc1',
       'CO/N=C(/C(=O)OC)c1ccccc1CON=C(C)c1cccc(C(F)(F)F)c1',
       'Cc1ccc(C(C)(C)O)cc1', 'CCOC(=O)C1=NN(c2ccccc2)C(=O)C1',
       'CCC(O)OCCCOC', 'CCCCNc1ccc(C(=O)OCCN(C)C)cc1',
       'CCOC(=O)C1OC1(C)c1ccccc1', 'CC(=O)OC(C)c1ccccc1',
       'CC(C)[C@@H]1CC[C@@H](C)C[C@H]1O',
       'CC(C)C[C@H](NC(=O)[C@H](CCc1ccccc1)NC(=O)CN1CCOCC1)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)[C@@]1(C)CO1',
       'CN(C)CCO',
       'CCN(Cc1cccc(S(=O)(=O)[O-])c1)c1ccc(C(=C2C=CC(=[N+](CC)Cc3cccc(S(=O)(=O)[O-])c3)C=C2)c2ccc(O)cc2S(=O)(=O)[O-])cc1',
       'Nc1ccc2cc3ccccc3cc2c1', 'O=CNNC=O', 'CCCCCCCCCCC1CO1',
       'CC(=O)[C@H]1CC[C@H]2[C@@H]3CC[C@H]4C[C@H](O)CC[C@]4(C)[C@H]3C(=O)C[C@]12C',
       'CN(C)CCOc1ccccc1Cc1ccccc1.O=C(

Simplify the labels

In [41]:
labels_df = pd.DataFrame(data_train['labels']).fillna(0)


In [42]:
labels_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
count,6258.0,6258.0,6258.0,6258.0,6258.0,6258.0,6258.0,6258.0,6258.0,6258.0,6258.0,6258.0
mean,0.03883,0.030841,0.098913,0.037392,0.100192,0.043784,0.024449,0.119527,0.032758,0.047459,0.115532,0.052892
std,0.193206,0.172899,0.29857,0.189736,0.300279,0.20463,0.15445,0.324434,0.178017,0.212636,0.319689,0.223836
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [48]:
label_df = pd.DataFrame(labels_df.max(axis=1), columns=['toxic'])
label_df.head()

Unnamed: 0,toxic
0,0.0
1,1.0
2,1.0
3,1.0
4,0.0


In [17]:
smiles_df = pd.DataFrame(data_train['smiles'])

In [23]:
smiles_df.head()

Unnamed: 0,0
0,COC(=O)[C@H]1CC[C@H](C(=O)OC)CC1
1,O=S1(=O)CC(Cl)(Cl)C(Cl)(Cl)C1
2,Cc1cc(C)c(NC(=O)C[C@H](CC(=O)[O-])c2cccc3ccccc...
3,Cc1noc(NS(=O)(=O)c2ccc(N)cc2)c1C
4,CC(C)(C)NC[C@H](O)COc1nsnc1N1CCOCC1


Convert SMILES into ECFP

In [None]:
# Code from https://drzinph.com/ecfp6-fingerprints-in-python-part-3/
from rdkit.Chem import AllChem
from rdkit import Chem, DataStructs
class ECFP6:
    def __init__(self, smiles):
        self.mols = [Chem.MolFromSmiles(i) for i in smiles]
        self.smiles = smiles

    def mol2fp(self, mol, radius = 3):
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius = radius)
        array = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, array)
        return array

    def compute_ECFP6(self, name):
        bit_headers = ['bit' + str(i) for i in range(2048)]
        arr = np.empty((0,2048), int).astype(int)
        for i in self.mols:
            fp = self.mol2fp(i)
            arr = np.vstack((arr, fp))
        df_ecfp6 = pd.DataFrame(np.asarray(arr).astype(int),columns=bit_headers)
        df_ecfp6.insert(loc=0, column='smiles', value=self.smiles)
        # df_ecfp6.to_csv(name[:-4]+'_ECFP6.csv', index=False)
        return df_ecfp6

In [24]:
smiles = [standardize_smiles(i) for i in smiles_df[0].values] 

[09:15:03] Can't kekulize mol.  Unkekulized atoms: 3 10


In [None]:
ecfp6_descriptor = ECFP6(smiles)        # create your ECFP6 object and provide smiles
filename= 'smiles'
smiles_fcfp_df = ecfp6_descriptor.compute_ECFP6(filename) # compute
smiles_fcfp_df.head()

In [50]:
train_data = pd.concat([smiles_fcfp_df, label_df], axis=1)
train_data.head()

Unnamed: 0,smiles,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,...,bit2039,bit2040,bit2041,bit2042,bit2043,bit2044,bit2045,bit2046,bit2047,toxic
0,COC(=O)[C@H]1CC[C@H](C(=O)OC)CC1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,O=S1(=O)CC(Cl)(Cl)C(Cl)(Cl)C1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
2,Cc1cc(C)c(NC(=O)C[C@H](CC(=O)[O-])c2cccc3ccccc...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1.0
3,Cc1noc(NS(=O)(=O)c2ccc(N)cc2)c1C,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
4,CC(C)(C)NC[C@H](O)COc1nsnc1N1CCOCC1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [73]:

smiles_fcfp_df.iloc[:, 1:]

Unnamed: 0,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,bit9,...,bit2038,bit2039,bit2040,bit2041,bit2042,bit2043,bit2044,bit2045,bit2046,bit2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6253,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6254,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6256,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Random Forest

In [None]:
rf = RandomForestClassifier(
    n_estimators=100,   # number of trees
    # max_depth=3,        # maximum depth of each tree
    random_state=42
)




In [74]:

rf.fit(smiles_fcfp_df.iloc[:, 1:], label_df)

  return fit_method(estimator, *args, **kwargs)
