In [1]:
!python --version

Python 3.11.11


In [2]:
import os
# Opt in to the legacy Keras implementation via TensorFlow's environment switch.
# It is set in its own cell ahead of the `import deepchem` cell — presumably because
# DeepChem's Keras-based models (e.g. GraphConvModel) need it; TODO confirm.
os.environ["TF_USE_LEGACY_KERAS"] = "True"

In [3]:
%%capture
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0
!pip install  dgl -f https://data.dgl.ai/wheels/torch-2.4/repo.html
!pip install deepchem
!pip install dgllife

In [4]:
import numpy as np
import pandas as pd
import torch
import dgl
import dgllife
import deepchem as dc

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Record the version of every core dependency, one per line, so the run can be reproduced.
# (Two spaces after the colon mirror print()'s default separator.)
print(
    *(
        f"{package_label} Version :  {package.__version__}"
        for package_label, package in (
            ("Numpy", np),
            ("Pandas", pd),
            ("Pytorch", torch),
            ("Dgl", dgl),
            ("Dgllife", dgllife),
            ("DeepChem", dc),
        )
    ),
    sep="\n",
)



Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


Numpy Version :  2.0.2
Pandas Version :  2.2.2
Pytorch Version :  2.4.0+cu121
Dgl Version :  2.4.0
Dgllife Version :  0.3.2
DeepChem Version :  2.8.0


In [5]:
# Load the SAMPL table from a CSV expected in the current working directory.
# The preview in the next cell shows the columns iupac, smiles, expt and calc.
df = pd.read_csv('SAMPL.csv')

In [6]:
df.head(2)

Unnamed: 0,iupac,smiles,expt,calc
0,"4-methoxy-N,N-dimethyl-benzamide",CN(C)C(=O)c1ccc(cc1)OC,-11.01,-9.625
1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219


In [7]:
# Median of the experimental values; the class cut-off used in the next cell (-3.5)
# sits right next to it, which keeps the two classes roughly the same size.
df['expt'].median()

-3.5300000000000002

In [8]:

# Binarise the experimental value: 1 where `expt` is below the cut-off, 0 otherwise.
# The cut-off (-3.5) is approximately the column median, so the classes come out
# nearly balanced.
# A vectorised comparison replaces the per-element lambda/map/list round trip;
# `.tolist()` keeps `expt_cls` a plain list of Python ints, exactly as before.
expt_cls = (df['expt'] < -3.5).astype(int).tolist()

In [9]:
# Attach the binary labels as a new `expt_cls` column (row order matches `expt`).
df = df.assign(expt_cls=expt_cls)

In [10]:
df.head()

Unnamed: 0,iupac,smiles,expt,calc,expt_cls
0,"4-methoxy-N,N-dimethyl-benzamide",CN(C)C(=O)c1ccc(cc1)OC,-11.01,-9.625,1
1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219,1
2,3-methylbut-1-ene,CC(C)C=C,1.83,2.452,0
3,2-ethylpyrazine,CCc1cnccn1,-5.45,-5.809,1
4,heptan-1-ol,CCCCCCCO,-4.21,-2.917,1


In [11]:
df['expt_cls'].value_counts()

Unnamed: 0_level_0,count
expt_cls,Unnamed: 1_level_1
1,325
0,317


In [12]:
df.shape

(642, 5)

In [13]:
# Remove three rows by index label and renumber the remaining rows from 0.
# NOTE(review): the notebook does not say why rows 61, 195 and 286 are excluded —
# presumably they cannot be featurized; confirm and document the reason.
# NOTE: because the index is renumbered, re-running this cell on its own would
# drop three further (different) rows; re-run from the CSV load instead.
df = df.drop(index=[61, 195, 286]).reset_index(drop=True)

In [14]:
df.shape

(639, 5)

In [15]:
# The label column is renamed to `task1`, the task name handed to the CSVLoader below.
df = df.rename(columns={'expt_cls': 'task1'})

## **Loading custom dataset in DeepChem and featurization**

In [16]:
# Build a DeepChem dataset from the frame. CSVLoader is given a file path, so the
# frame is first written to a temporary CSV that is discarded afterwards.
conv_featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
loader = dc.data.CSVLoader(
    tasks=["task1"],
    feature_field="smiles",
    featurizer=conv_featurizer,
)
with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
    df.to_csv(tmpfile.name)
    dataset = loader.create_dataset(tmpfile.name)
# Number of molecules that made it into the dataset.
len(dataset)

639

In [17]:
type(dataset)

In [18]:
dataset.ids[:10]


array(['CN(C)C(=O)c1ccc(cc1)OC', 'CS(=O)(=O)Cl', 'CC(C)C=C', 'CCc1cnccn1',
       'CCCCCCCO', 'Cc1cc(cc(c1)O)C', 'CC(C)C(C)C', 'CCCC(C)(C)O',
       'C[C@@H]1CCCC[C@@H]1C', 'CC[C@H](C)O'], dtype=object)

In [19]:
dataset.get_shape()

((np.int64(639),),
 (np.int64(639), np.int64(1)),
 (np.int64(639), np.int64(1)),
 (np.int64(639),))

## **Random Train Test Split**

In [20]:
# Random 80/20 train/test partition; the fixed seed keeps the split repeatable.
splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = splitter.train_test_split(dataset, frac_train=0.80, seed=9)
len(train_dataset)

511

In [21]:
len(test_dataset)

128

In [22]:
# Graph-convolution classifier for the single binary task, with dropout 0.2.
# NOTE(review): no random seed is fixed for the model, so the trained weights — and
# the metrics reported below — may differ from run to run.
model = dc.models.GraphConvModel(n_tasks=1, mode='classification', dropout=0.2)

In [23]:
# Train on the training split for 100 epochs. The value displayed by this cell is
# whatever fit() returns — presumably the average training loss; confirm in the DeepChem docs.
model.fit(train_dataset, nb_epoch=100)

0.039031589031219484

In [24]:
from collections import Counter
# Class balance of the held-out labels: collapse the label array to one dimension
# and count how often each value occurs.
a = tuple(test_dataset.y.ravel())
Counter(a)

Counter({np.float64(0.0): 58, np.float64(1.0): 70})

In [25]:
# Model outputs for both splits. The following cells index these arrays as [:, 0, 1]
# and treat that slice as the class-1 probability of the single task.
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

In [27]:
# y_pred_test

In [28]:
# Turn class-1 probabilities into hard 0/1 labels using a 0.5 decision threshold.
y_pred_test_labels = np.greater(y_pred_test[:, 0, 1], 0.5).astype(int)
y_pred_train_labels = np.greater(y_pred_train[:, 0, 1], 0.5).astype(int)

In [29]:
y_pred_test_labels

array([0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0])

In [30]:
# Threshold-based scores are computed from the hard 0/1 labels; the four scorers
# are applied in the same order as the names on the left-hand side.
test_accuracy, test_precision, test_recall, test_f1 = (
    scorer(test_dataset.y, y_pred_test_labels)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the class-1 probabilities rather than the hard labels.
test_roc_auc = roc_auc_score(test_dataset.y, y_pred_test[:, 0, 1])

In [31]:
# Same scores on the training split, to compare against the test scores for overfitting.
train_accuracy, train_precision, train_recall, train_f1 = (
    scorer(train_dataset.y, y_pred_train_labels)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the class-1 probabilities rather than the hard labels.
train_roc_auc = roc_auc_score(train_dataset.y, y_pred_train[:, 0, 1])

In [32]:
# Report the test-split scores, one per line, rounded to four decimals.
print("\n".join([
    "Test Set Metrics:",
    f"Accuracy: {test_accuracy:.4f}",
    f"Precision: {test_precision:.4f}",
    f"Recall: {test_recall:.4f}",
    f"F1-score: {test_f1:.4f}",
    f"ROC-AUC: {test_roc_auc:.4f}",
]))

Test Set Metrics:
Accuracy: 0.8594
Precision: 0.9643
Recall: 0.7714
F1-score: 0.8571
ROC-AUC: 0.9732


In [33]:

# Report the training-split scores, one per line, rounded to four decimals.
# The leading newline in the heading leaves a blank line above the report.
print("\n".join([
    "\nTrain Set Metrics:",
    f"Accuracy: {train_accuracy:.4f}",
    f"Precision: {train_precision:.4f}",
    f"Recall: {train_recall:.4f}",
    f"F1-score: {train_f1:.4f}",
    f"ROC-AUC: {train_roc_auc:.4f}",
]))


Train Set Metrics:
Accuracy: 0.9902
Precision: 1.0000
Recall: 0.9803
F1-score: 0.9901
ROC-AUC: 0.9999


In [37]:
from sklearn.metrics import confusion_matrix
# Test-split confusion matrix: true labels against the thresholded predictions.
confusion_matrix(y_true=test_dataset.y, y_pred=y_pred_test_labels)

array([[56,  2],
       [16, 54]])

In [38]:
# Cross-check ROC-AUC through DeepChem's own evaluation API on both splits.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for split_name, split_data in (("Training", train_dataset), ("Test", test_dataset)):
    print(f"{split_name} set score:", model.evaluate(split_data, [metric]))

Training set score: {'roc_auc_score': np.float64(0.9999234045160696)}
Test set score: {'roc_auc_score': np.float64(0.973152709359606)}


## **Scaffold Split**

In [39]:
# Repeat the GraphConv experiment with a scaffold-based 80/20 split instead of a random one.
# This rebinds train_dataset, test_dataset, model and the prediction variables
# that the random-split cells above created.
scaffoldsplitter = dc.splits.ScaffoldSplitter()
train_dataset, test_dataset = scaffoldsplitter.train_test_split(dataset, frac_train=0.80)

# Fresh model with the same hyper-parameters as the random-split run.
model = dc.models.GraphConvModel(n_tasks=1, mode='classification', dropout=0.2)
model.fit(train_dataset, nb_epoch=100)

y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

# Hard 0/1 labels from the class-1 probabilities at a 0.5 threshold.
y_pred_test_labels = np.greater(y_pred_test[:, 0, 1], 0.5).astype(int)
y_pred_train_labels = np.greater(y_pred_train[:, 0, 1], 0.5).astype(int)

test_accuracy = accuracy_score(test_dataset.y, y_pred_test_labels)
print(f"Accuracy: {test_accuracy:.4f}")

Accuracy: 0.7891


In [40]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the scaffold-split test set: true labels against thresholded predictions.
confusion_matrix(y_true=test_dataset.y, y_pred=y_pred_test_labels)

array([[29, 11],
       [16, 72]])

In [41]:
# ROC-AUC of the scaffold-split model on both splits, via DeepChem's evaluation API.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for split_name, split_data in (("Training", train_dataset), ("Test", test_dataset)):
    print(f"{split_name} set score:", model.evaluate(split_data, [metric]))

Training set score: {'roc_auc_score': np.float64(0.9999691833590139)}
Test set score: {'roc_auc_score': np.float64(0.8872159090909091)}


## **Other Graph Models**

**AttentiveFP**

In [42]:
# Rebuild the dataset with graph features that include edge (bond) information.
# This replaces the ConvMol-featurized `dataset` built earlier in the notebook.
graph_featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
loader = dc.data.CSVLoader(
    tasks=["task1"],
    feature_field="smiles",
    featurizer=graph_featurizer,
)
# CSVLoader is given a file path, so the frame goes through a temporary CSV.
with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
    df.to_csv(tmpfile.name)
    dataset = loader.create_dataset(tmpfile.name)
len(dataset)

639

In [43]:
# Random 80/20 partition of the re-featurized dataset, using the same fraction and
# seed as the earlier random split.
splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = splitter.train_test_split(
    dataset,
    frac_train=0.80,
    seed=9,
)
len(train_dataset)

511

In [44]:
# AttentiveFP classifier for the single task, trained for 30 epochs with batch size 16
# and learning rate 0.001. This rebinds `model`, replacing the GraphConv model.
# NOTE(review): no random seed is fixed for the model, so results may differ between runs.
model = dc.models.AttentiveFPModel(mode='classification', n_tasks=1, batch_size=16, learning_rate=0.001)
model.fit(train_dataset, nb_epoch=30)

0.07016247113545736

In [45]:
# Predictions from the AttentiveFP model on both splits.
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

# Column 1 is taken as the class-1 probability here (indexed [:, 1], unlike the
# [:, 0, 1] indexing used for GraphConv); 0.5 is the decision threshold.
y_pred_test_labels = np.greater(y_pred_test[:, 1], 0.5).astype(int)
y_pred_train_labels = np.greater(y_pred_train[:, 1], 0.5).astype(int)

test_accuracy = accuracy_score(test_dataset.y, y_pred_test_labels)
print(f"Accuracy: {test_accuracy:.4f}")

Accuracy: 0.8750


In [46]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the AttentiveFP test predictions: true labels against thresholded predictions.
confusion_matrix(y_true=test_dataset.y, y_pred=y_pred_test_labels)

array([[55,  3],
       [13, 57]])

In [47]:
# ROC-AUC of the AttentiveFP model on both splits, via DeepChem's evaluation API.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for split_name, split_data in (("Training", train_dataset), ("Test", test_dataset)):
    print(f"{split_name} set score:", model.evaluate(split_data, [metric]))

Training set score: {'roc_auc_score': np.float64(0.9990961732896229)}
Test set score: {'roc_auc_score': np.float64(0.9746305418719212)}


## **Predictions of External Molecules**

In [48]:
df.iloc[45:48, :]

Unnamed: 0,iupac,smiles,expt,calc,task1
45,methyl butanoate,CCCC(=O)OC,-2.83,-3.552,0
46,2-hydroxybenzaldehyde,c1ccc(c(c1)C=O)O,-4.68,-8.809,1
47,azetidine,C1CNC1,-5.56,-3.861,1


In [49]:
# SMILES strings to score with the trained model.
# NOTE(review): these are the same three molecules as rows 45-47 of `df` (displayed in
# the previous cell), so they come from the modelling dataset and may have been part of
# the training split — they are not truly external compounds.
ex_smiles = ['CCCC(=O)OC', 'c1ccc(c(c1)C=O)O', 'C1CNC1']

In [50]:
# Featurize the SMILES with the same featurizer settings (use_edges=True) that were
# used to build the AttentiveFP dataset, then wrap the features in an in-memory
# dataset. No labels are supplied; the dataset is only used for prediction.
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
X = featurizer.featurize(ex_smiles)
ex_dataset = dc.data.NumpyDataset(X=X)

In [51]:
# Score the molecules with the most recently trained model (`model` is the
# AttentiveFP model at this point in the notebook).
ext_test_score = model.predict(ex_dataset)

In [52]:
# Hard 0/1 labels from column 1 of the scores (taken as the class-1 probability) at a 0.5 threshold.
predicted_labels = np.greater(ext_test_score[:, 1], 0.5).astype(int)

In [53]:
predicted_labels

array([0, 1, 1])