In [2]:
from tdc.single_pred import Tox
import deepchem
import xgboost
import rdkit
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem.EState import EState

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')
data = Tox(name="LD50_Zhu")
df = data.get_data()

Found local copy...
Found local copy...
Loading...
Done!


In [10]:
convMolFeat = deepchem.deepchem.feat.ConvMolFeaturizer()
trainMol = df.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
mols = []

for _ in range(2):
    mol = Chem.MolToSmiles(trainMol[0], doRandom=True)
    print(mol)
    for arr in convMolFeat.featurize(mol).view()[0].get_atom_features():
        print(arr)
    mols.append(mol)

[N+](=Nc1ccccc1)([O-])c1ccccc1
[ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0. -1.  0.  0.  1.  0.  0.  0.  0.  1.  0.
  0.  0.  0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1.
 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1.
 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.

In [3]:
predictions_list = []
metric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.mean_absolute_error)
featurizer = deepchem.deepchem.feat.ConvMolFeaturizer()

for seed in [1, 2]:
    benchmark = group.get('LD50_Zhu') 
    
    predictions = {}
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)  

    trainAugX = []
    trainAugY = []
    validAugX = []
    validAugY = []
    testAugX = []
    testAugY = []

    #augment SMILES
    for row in train.itertuples():
        mol = Chem.MolFromSmiles(row[2])
        for _ in range(30):
            newMol = Chem.MolToSmiles(mol, doRandom=True)
            trainAugX.append(featurizer.featurize(newMol)[0])
            trainAugY.append(row[3])

    for row in valid.itertuples():
        mol = Chem.MolFromSmiles(row[2])
        for _ in range(30):
            newMol = Chem.MolToSmiles(mol, doRandom=True)
            validAugX.append(featurizer.featurize(newMol)[0])
            validAugY.append(row[3])

    for row in test.itertuples():
        mol = Chem.MolFromSmiles(row[2])
        for _ in range(30):
            newMol = Chem.MolToSmiles(mol, doRandom=True)
            testAugX.append(featurizer.featurize(newMol)[0])
            testAugY.append(row[3])

    #convert training and validation data into a deepchem dataset
    train_data = deepchem.deepchem.data.NumpyDataset(X=trainAugX, y=np.array(trainAugY))
    valid_data = deepchem.deepchem.data.NumpyDataset(X=validAugX, y=np.array(validAugY))

    #fit data
    reg = deepchem.deepchem.models.GraphConvModel(
        n_tasks=1, 
        dropout=.0005,
        dense_layer_size=1063,
        graph_conv_layers=[128, 128, 128],
        mode="regression",)
    callback = deepchem.deepchem.models.ValidationCallback(valid_data, 1000, metric)
    reg.fit(train_data, nb_epoch=100, callbacks=callback)

    # store test predictions in y_pred_test
    y_pred_test = reg.predict_on_batch(X=np.array(testAugX))

    print("predictions for run #")
    print(seed)
    print(y_pred_test[0:5])
        
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

print("Prediction List:")
print(predictions_list)

generating training, validation splits...
100%|██████████| 5907/5907 [00:01<00:00, 4451.74it/s]


Step 1000 validation: mean_absolute_error=0.584649
Step 2000 validation: mean_absolute_error=0.563168
Step 3000 validation: mean_absolute_error=0.564694
Step 4000 validation: mean_absolute_error=0.557703
Step 5000 validation: mean_absolute_error=0.541447
Step 6000 validation: mean_absolute_error=0.5733
Step 7000 validation: mean_absolute_error=0.532991
Step 8000 validation: mean_absolute_error=0.546384
Step 9000 validation: mean_absolute_error=0.572427
Step 10000 validation: mean_absolute_error=0.539933
Step 11000 validation: mean_absolute_error=0.540419
Step 12000 validation: mean_absolute_error=0.53259
Step 13000 validation: mean_absolute_error=0.530342
Step 14000 validation: mean_absolute_error=0.536317
Step 15000 validation: mean_absolute_error=0.526393
Step 16000 validation: mean_absolute_error=0.528251
Step 17000 validation: mean_absolute_error=0.521351
Step 18000 validation: mean_absolute_error=0.530423
Step 19000 validation: mean_absolute_error=0.528663
Step 20000 validation: m

NameError: name 'f_test' is not defined

In [13]:
pred = reg.predict_on_batch(X=np.array(testAugX)).tolist()
print(pred)

[[2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [2.1263108253479004], [3.1123452186584473], [3.112344741821289], [3.1123452186584473], [3.112344741821289], [3.112344741821289], [3.1123452186584473], [3.112344741821289], [3.1123452186584473], [3.112344741821289], [3.1123452186584473], [3.1123452186584473], [3.112344741821289], [3.112344741821289], [3.112344741821289], [3.1123452186584473], [3.11234474182128

In [18]:
predictions[name] = np.mean(np.array(pred).reshape(-1, 30), axis=1)
group.evaluate(predictions)

{'ld50_zhu': {'mae': 0.597}}