In [1]:
!python --version

Python 3.11.11


In [2]:
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"

In [3]:
%%capture
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0
!pip install  dgl -f https://data.dgl.ai/wheels/torch-2.4/repo.html
!pip install deepchem
!pip install dgllife

In [4]:
import numpy as np
import pandas as pd
import torch
import dgl
import dgllife
import deepchem as dc

from sklearn.model_selection import train_test_split

print("Numpy Version : ", np.__version__)
print("Pandas Version : ", pd.__version__)
print("Pytorch Version : ", torch.__version__)
print("Dgl Version : ", dgl.__version__)
print("Dgllife Version : ", dgllife.__version__)
print("DeepChem Version : ", dc.__version__)

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


Numpy Version :  2.0.2
Pandas Version :  2.2.2
Pytorch Version :  2.4.0+cu121
Dgl Version :  2.4.0
Dgllife Version :  0.3.2
DeepChem Version :  2.8.0


In [5]:
df = pd.read_csv('SAMPL.csv')

In [None]:
df.head()

Unnamed: 0,iupac,smiles,expt,calc
0,"4-methoxy-N,N-dimethyl-benzamide",CN(C)C(=O)c1ccc(cc1)OC,-11.01,-9.625
1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219
2,3-methylbut-1-ene,CC(C)C=C,1.83,2.452
3,2-ethylpyrazine,CCc1cnccn1,-5.45,-5.809
4,heptan-1-ol,CCCCCCCO,-4.21,-2.917


In [None]:
df.shape

(642, 4)

In [None]:
train, test = train_test_split(df, test_size=0.20, random_state=42)

In [None]:
train.shape

(513, 4)

In [None]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [None]:
df = pd.concat([train, test])
df.head()

Unnamed: 0,iupac,smiles,expt,calc
0,"N,N-dimethylformamide",CN(C)C=O,-7.81,-6.932
1,2-chloro-2-methyl-propane,CC(C)(C)Cl,1.09,0.826
2,"1,2-dinitroxyethane",C(CO[N+](=O)[O-])O[N+](=O)[O-],-5.73,-6.227
3,2-propoxyethanol,CCCOCCO,-6.4,-3.94
4,pentyl propanoate,CCCCCOC(=O)CC,-2.11,-2.176


In [None]:
df.rename(columns={'expt': 'task1'}, inplace=True)

## **Loading custom dataset in DeepChem and featurization**

In [None]:
with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
    df.to_csv(tmpfile.name)
    loader = dc.data.CSVLoader(["task1"], feature_field="smiles",
                             featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False))
    dataset = loader.create_dataset(tmpfile.name)
len(dataset)

642

In [None]:
type(dataset)

In [None]:
df.head()

Unnamed: 0,iupac,smiles,task1,calc
0,"N,N-dimethylformamide",CN(C)C=O,-7.81,-6.932
1,2-chloro-2-methyl-propane,CC(C)(C)Cl,1.09,0.826
2,"1,2-dinitroxyethane",C(CO[N+](=O)[O-])O[N+](=O)[O-],-5.73,-6.227
3,2-propoxyethanol,CCCOCCO,-6.4,-3.94
4,pentyl propanoate,CCCCCOC(=O)CC,-2.11,-2.176


In [None]:
dataset.ids[:10]


array(['CN(C)C=O', 'CC(C)(C)Cl', 'C(CO[N+](=O)[O-])O[N+](=O)[O-]',
       'CCCOCCO', 'CCCCCOC(=O)CC', 'COCC(OC)(OC)OC',
       'CC1=CC(=O)CC(C1)(C)C', 'CCCc1ccc(c(c1)OC)O', 'CC(=O)C1CCCCC1',
       'C'], dtype=object)

In [None]:
dataset.get_shape()

((642,), (642, 1), (642, 1), (642,))

In [None]:
# for X, y, w, id in dataset.itersamples():
#   print(y, id)

In [None]:
dataset.X[0].get_atom_features()
# [0].get_atom_features()

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0.

In [None]:
# Split by index numbers
splitter = dc.splits.SpecifiedSplitter(test_indices=range(513, 642))
train_dataset, test_dataset = splitter.train_test_split(dataset)

In [None]:
len(test_dataset)

129

In [None]:
model = dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2, batch_normalize = False)

In [None]:
model.fit(train_dataset, nb_epoch=100)

0.5245515441894532

In [None]:
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

In [None]:
# test_dataset.y

In [None]:
from sklearn.metrics import r2_score
print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.9458315273288176
Test r2 score:  0.8664042073137397


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9770957483929081}
Test set score: {'pearson_r2_score': 0.9444117159299883}


## **Random Train Test Split**

In [None]:
splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = splitter.train_test_split(
  dataset=dataset, frac_train=.80, seed=9
)
len(train_dataset)

513

In [None]:
model = dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)

In [None]:
model.fit(train_dataset, nb_epoch=100)

1.0316976165771485

In [None]:
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

In [None]:
from sklearn.metrics import r2_score
print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.7466138555458752
Test r2 score:  0.6373747814798145


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9505668177397952}
Test set score: {'pearson_r2_score': 0.8801111839870016}


## **Scaffold Split**

In [None]:
scaffoldsplitter = dc.splits.ScaffoldSplitter()
train_dataset,test_dataset = scaffoldsplitter.train_test_split(dataset, frac_train=.80)

In [None]:
model.fit(train_dataset, nb_epoch=100)

0.5620740127563476

In [None]:
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)
print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.786163381530879
Test r2 score:  0.6800920908583967


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.962124982686598}
Test set score: {'pearson_r2_score': 0.9153761781783717}


## **Molecular Weight Splitter**

In [None]:
molecularweightsplitter = dc.splits.MolecularWeightSplitter()
train_dataset, test_dataset = molecularweightsplitter.train_test_split(dataset,frac_train=.80)

In [None]:
model.fit(train_dataset, nb_epoch=100)

0.38336532592773437

In [None]:
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.8336765666983809
Test r2 score:  0.7391005731252127


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9748803185962122}
Test set score: {'pearson_r2_score': 0.893672553341316}


## **MinMaxSplitter**

In [None]:
maxminsplitter = dc.splits.MaxMinSplitter()
train_dataset, test_dataset = maxminsplitter.train_test_split(dataset,frac_train=.80)

In [None]:
model.fit(train_dataset, nb_epoch=200)

0.2550699424743652

In [None]:
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.8578755095232764
Test r2 score:  0.8316508919848555


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9816210254572666}
Test set score: {'pearson_r2_score': 0.9208069274052878}


## **ButinaSplit**

In [None]:
butinasplitter = dc.splits.ButinaSplitter()
train_dataset, test_dataset = butinasplitter.train_test_split(dataset, frac_train=.80, seed=2)

In [None]:
model.fit(train_dataset, nb_epoch=100)

0.2434164047241211

In [None]:
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.8694202966632
Test r2 score:  0.8884590865693047


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9827353366034832}
Test set score: {'pearson_r2_score': 0.9435764174922706}


## **FingerprintSplit**

In [None]:
fpsplitter = dc.splits.FingerprintSplitter()
train_dataset, test_dataset = fpsplitter.train_test_split(dataset, frac_train=.80, seed=2)

In [None]:
model.fit(train_dataset, nb_epoch=100)

0.2582437515258789

In [None]:
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.896628491126867
Test r2 score:  0.7313443720337369


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9843664698263709}
Test set score: {'pearson_r2_score': 0.9426476342803143}


## **Other Graph Models**

**GAT Model**

In [None]:
df.drop(labels = [9,113,131], axis=0, inplace=True)
df = df.reset_index(drop=True)

In [None]:
with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
    df.to_csv(tmpfile.name)
    loader = dc.data.CSVLoader(["task1"], feature_field="smiles",
                             featurizer=dc.feat.MolGraphConvFeaturizer())
    dataset = loader.create_dataset(tmpfile.name)
len(dataset)

637

In [None]:
df.iloc[131,:]

Unnamed: 0,131
iupac,hydrogen sulfide
smiles,S
task1,-0.7
calc,-1.135


In [None]:
# fingerprintsplitter = dc.splits.FingerprintSplitter()
# train_dataset, test_dataset = fingerprintsplitter.train_test_split(dataset,frac_train=.80, seed=6)

splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = splitter.train_test_split(dataset=dataset, frac_train=.80, seed=9)
len(train_dataset)

509

In [None]:
model = dc.models.GATModel(n_tasks=1, mode='regression', dropout=0.2,  graph_conv_layers = [64, 64], dense_layer_size = 128)
model.fit(train_dataset, nb_epoch=50)

2.671858825683594

In [None]:
from sklearn.metrics import r2_score
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.870859838318558
Test r2 score:  0.8540553268656076


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.8920184446224242}
Test set score: {'pearson_r2_score': 0.8885254989438204}


**DAG Model**

In [None]:
with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
    df.to_csv(tmpfile.name)
    loader = dc.data.CSVLoader(["task1"], feature_field="smiles",
                             featurizer=dc.feat.ConvMolFeaturizer())
    dataset = loader.create_dataset(tmpfile.name)
len(dataset)

637

In [None]:
trans = dc.trans.DAGTransformer(max_atoms=50)
dataset = trans.transform(dataset)

In [None]:
splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = splitter.train_test_split(dataset=dataset, frac_train=.80, seed=9)
len(train_dataset)

509

In [None]:
model = dc.models.DAGModel(n_tasks=1, mode='regression', dropout=0.2,  n_graph_feat=30, n_outputs=30, layer_sizes=[100], layer_sizes_gather=[100])
model.fit(train_dataset, nb_epoch=30)

3.1991458892822267

In [None]:
from sklearn.metrics import r2_score
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.8883538084732362
Test r2 score:  0.7883480406523975


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9172509966915043}
Test set score: {'pearson_r2_score': 0.8482615989868949}


**AttentiveFP**

In [None]:
with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
    df.to_csv(tmpfile.name)
    loader = dc.data.CSVLoader(["task1"], feature_field="smiles",
                             featurizer=dc.feat.MolGraphConvFeaturizer(use_edges=True))
    dataset = loader.create_dataset(tmpfile.name)
len(dataset)

637

In [None]:
splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = splitter.train_test_split(dataset=dataset, frac_train=.80, seed=9)
len(train_dataset)

509

In [None]:
model = dc.models.AttentiveFPModel(mode='regression', n_tasks=1, batch_size=16, learning_rate=0.001)
model.fit(train_dataset, nb_epoch=30)

0.5686037699381511

In [None]:
from sklearn.metrics import r2_score
y_pred_test = model.predict(test_dataset)
y_pred_train = model.predict(train_dataset)

print("Train r2 score: ", r2_score(y_pred_train, train_dataset.y))
print("Test r2 score: ", r2_score(y_pred_test, test_dataset.y))

Train r2 score:  0.9712329769119304
Test r2 score:  0.9268034047760122


In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

## **Predictions of External Molecules**

In [None]:
df.iloc[56:58, :]

Unnamed: 0,iupac,smiles,task1,calc
56,methylcyclopentane,CC1CCCC1,1.59,1.785
57,2-bromopropane,CC(C)Br,-0.48,0.448


In [None]:
ex_smiles = ['CC1CCCC1', 'CC(C)Br']

In [None]:
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
X = featurizer.featurize(ex_smiles)
ex_dataset = dc.data.NumpyDataset(X=X)

In [None]:
ext_test_score = model.predict(ex_dataset)

In [None]:
ext_test_score

array([[ 1.7229227 ],
       [-0.41806248]], dtype=float32)