In [2]:
!pip install --pre deepchem
import deepchem
deepchem.__version__

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem
  Downloading deepchem-2.7.2.dev20230218044322-py3-none-any.whl (709 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.5/709.5 KB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit
  Downloading rdkit-2022.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit, deepchem
Successfully installed deepchem-2.7.2.dev20230218044322 rdkit-2022.9.4




'2.7.2.dev'

In [3]:
import deepchem as dc

In [4]:
import numpy as np

In [5]:
# Load the Tox21 benchmark from MoleculeNet. The call returns the task names, the
# datasets (unpacked further down into train/valid/test) and the transformers used.
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()

In [6]:
# Show the names of the Tox21 tasks (one per assay).
tox21_tasks

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [7]:
len(tox21_tasks)  # 12 different assays, one per task

12

In [8]:
# Show the loaded datasets: a tuple of three DiskDataset objects.
tox21_datasets

(<DiskDataset X.shape: (6264, 1024), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>,
 <DiskDataset X.shape: (783, 1024), y.shape: (783, 12), w.shape: (783, 12), ids: ['N#C[C@@H]1CC(F)(F)CN1C(=O)CNC1CC2CCC(C1)N2c1ncccn1'
  'CN(C)C(=O)NC1(c2ccccc2)CCN(CCC[C@@]2(c3ccc(Cl)c(Cl)c3)CCCN(C(=O)c3ccccc3)C2)CC1'
  'CSc1nnc(C(C)(C)C)c(=O)n1N' ...
  'O=C(O[C@H]1CN2CCC1CC2)N1CCc2ccccc2[C@@H]1c1ccccc1'
  'C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@H]3C(=C)C[C@@]21CC'
  'NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3c(c2)CCO3)C1'], task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>,
 <DiskDataset X.shape: (784, 1024), y.shape: (784, 12), w.shape: (784, 12), ids: ['CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.c1ccc(CNCCNCc2ccccc2)cc1'
  'CC(C)(c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1)c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1'
  'Cc1cc(C(C)(C)C)c(O)c(C)c1

In [9]:
# Unpack the three datasets into the training, validation and test sets.
train_dataset, valid_dataset, test_dataset = tox21_datasets

In [12]:
# Shape of the training features X.
train_dataset.X.shape

(6264, 1024)

In [13]:
# Shape of the validation features X.
valid_dataset.X.shape

(783, 1024)

In [14]:
# Shape of the test features X.
test_dataset.X.shape

(784, 1024)

In [15]:
# Shape of the training labels y.
train_dataset.y.shape

(6264, 12)

In [16]:
np.shape(valid_dataset.y)  # the shape can also be checked with NumPy's np.shape

(783, 12)

In [17]:
# Each sample has 12 labels, one for each of the 12 tasks (assays) discussed earlier.
np.shape(test_dataset.y)

(784, 12)

In [18]:
# w holds one weight per label. A weight of 0 means the value was not measured
# (it is missing), so that label carries no weight.
train_dataset.w.shape

(6264, 12)

In [19]:
# Count the entries of the training weight matrix that are non-zero: 63,647 of the
# 6264 x 12 = 75,168 entries in total.
int((train_dataset.w != 0).sum())

63647

In [25]:
# How many entries of the training weight matrix have weight == 0 (in short, missing
# values)? That is every entry that is not counted as non-zero.
train_dataset.w.size - np.count_nonzero(train_dataset.w)

11521

In [24]:
# NOTE(review): this cell repeats the preceding cell's computation, and its execution
# count (In [24]) comes after In [25], so the cells were run out of order. It can be
# removed once the notebook is re-run top to bottom.
np.count_nonzero(train_dataset.w == 0)

11521

In [26]:
# Which transformer was used? A BalancingTransformer, which is used to correct
# unbalanced data.
transformers

[<deepchem.trans.transformers.BalancingTransformer at 0x7f75885d8c10>]

In [27]:
# Construct a multitask classifier. The number of tasks and the input width are read
# from the loaded data instead of being hard-coded, so this cell stays correct if the
# task list or the featurization changes (for this run they are 12 and 1024).
n_tasks = len(tox21_tasks)
n_features = train_dataset.X.shape[1]
model = dc.models.MultitaskClassifier(
    n_tasks=n_tasks,
    n_features=n_features,
    layer_sizes=[1000],  # a single hidden layer of width 1000
)

In [28]:
# Train the model with gradient descent for 10 epochs. One epoch is one complete
# pass through all the samples in the training set.
n_epochs = 10
model.fit(train_dataset, nb_epoch=n_epochs)

0.49407828648885094

In [29]:
# Score with the ROC AUC metric. Tox21 has 12 tasks, so the per-task scores are
# averaged with np.mean to give a single number.
metric = dc.metrics.Metric(metric=dc.metrics.roc_auc_score, task_averager=np.mean)

In [30]:
# Evaluate the fitted model on the training and test sets with the same metric list.
metrics = [metric]
train_scores = model.evaluate(train_dataset, metrics, transformers)
test_scores = model.evaluate(test_dataset, metrics, transformers)

In [31]:
# Print the training scores first, then the test scores.
for scores in (train_scores, test_scores):
    print(scores)

{'mean-roc_auc_score': 0.9585282697274358}
{'mean-roc_auc_score': 0.6856930400264968}


In [None]:
#The ROC AUC score is about 0.96 on the training set but only about 0.69 on the test set; this large gap suggests the model is overfitting the training data.