In [1]:
from tdc.single_pred import Tox
import deepchem
import xgboost
import rdkit
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem.EState import EState

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')

Found local copy...


In [4]:
predictions_list = []
metric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.mean_absolute_error)
ecfpFeat = deepchem.deepchem.feat.CircularFingerprint(radius=4)
convMolFeat = deepchem.deepchem.feat.ConvMolFeaturizer()
xgb_reg_ecfp = xgboost.XGBRegressor()
xgb_reg_estate = xgboost.XGBRegressor()

for seed in [1, 2, 3, 4, 5]:
    benchmark = group.get('ppbr_az') 
    
    predictions = {}
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)  

    #trainMol = train.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    #validMol = valid.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    testMol = test.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    train_valMol = train_val.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))

    #featurize training, valid, and test data for xgboost
    #trainFeat = np.stack(np.array(trainMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    #validFeat = np.stack(np.array(validMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    testFeat = np.stack(np.array(testMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    train_valFeat = np.stack(np.array(train_valMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))

    #featurize training, valid, and test data for xgboost
    #ecfp_f_train = ecfpFeat.featurize(train.iloc[:,1].to_list())
    #ecfp_f_valid = ecfpFeat.featurize(valid.iloc[:,1].to_list())
    ecfp_f_test = ecfpFeat.featurize(test.iloc[:,1].to_list())
    train_val_f = ecfpFeat.featurize(train_val.iloc[:,1].to_list())

    #featurize training, valid, and test data for the GCN
    cv_f_train = convMolFeat.featurize(train.iloc[:,1].to_list())
    cv_f_valid = convMolFeat.featurize(valid.iloc[:,1].to_list())
    cv_f_test = convMolFeat.featurize(test.iloc[:,1].to_list())

    #convert training and validation data into a deepchem dataset for the gcn
    gcn_train_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_train, y=np.array(train.iloc[:,2]), ids=np.array(train.iloc[:,1].to_list()))
    gcn_valid_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_valid, y=np.array(valid.iloc[:,2]), ids=np.array(valid.iloc[:,1].to_list()))

    #fit data on GCN
    reg = deepchem.deepchem.models.GraphConvModel(
        n_tasks=1, 
        dropout=.0005,
        dense_layer_size=1063,
        graph_conv_layers=[128, 128, 128],
        mode="regression",)
    callback = deepchem.deepchem.models.ValidationCallback(gcn_valid_data, 1000, metric)
    reg.fit(gcn_train_data, nb_epoch=100, callbacks=callback)

    #predict values on gcn and reshape array
    gcn_pred = reg.predict_on_batch(X=np.array(cv_f_test)).reshape(559,)

    #fit xgboost model and store np ndarray in xgb_pred
    xgb_reg_estate.fit(X=train_valFeat, y=train_val.iloc[:,2], eval_metric="mae", verbose=True)
    xgb_reg_ecfp.fit(X=train_val_f, y=train_val.iloc[:,2], eval_metric="mae")
    pred_estate = xgb_reg_estate.predict(X=testFeat)
    pred_ecfp = xgb_reg_ecfp.predict(X=ecfp_f_test)
    
    #pred_ak = akReg.predict(rdkit_f_test)

    # store test predictions in y_pred_test
    y_pred_test = np.mean([ pred_estate, pred_ecfp, gcn_pred ], axis=0)

    print("predictions for run #")
    print(seed)
    print(y_pred_test[0:5])
        
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

print("Prediction List:")
print(predictions_list)

generating training, validation splits...
100%|██████████| 2231/2231 [00:01<00:00, 1303.03it/s]


Step 1000 validation: mean_absolute_error=13.0357
Step 2000 validation: mean_absolute_error=11.5734


generating training, validation splits...


predictions for run #
1
[69.58057  69.58057  94.43487  94.43487  65.822014]


100%|██████████| 2231/2231 [00:01<00:00, 1591.12it/s]


Step 1000 validation: mean_absolute_error=11.9347
Step 2000 validation: mean_absolute_error=11.4805


generating training, validation splits...


predictions for run #
2
[69.0965  69.0965  94.60116 94.60116 69.72582]


100%|██████████| 2231/2231 [00:01<00:00, 1698.73it/s]


Step 1000 validation: mean_absolute_error=11.3671
Step 2000 validation: mean_absolute_error=10.0847


generating training, validation splits...


predictions for run #
3
[67.4826  67.4826  94.52991 94.52991 70.13153]


100%|██████████| 2231/2231 [00:01<00:00, 1689.30it/s]


Step 1000 validation: mean_absolute_error=9.70799
Step 2000 validation: mean_absolute_error=9.83599


generating training, validation splits...


predictions for run #
4
[69.38898 69.38898 95.61266 95.61266 69.16992]


100%|██████████| 2231/2231 [00:01<00:00, 1515.96it/s]


Step 1000 validation: mean_absolute_error=11.1678
Step 2000 validation: mean_absolute_error=11.6069
predictions for run #
5
[70.88437  70.88437  95.334984 95.334984 69.24944 ]
Prediction List:
[{'ppbr_az': array([ 69.58057 ,  69.58057 ,  94.43487 ,  94.43487 ,  65.822014,
        64.07192 ,  65.822014,  64.07192 ,  94.65334 ,  91.02719 ,
        74.910706,  83.05482 ,  74.910706,  83.05482 ,  83.05482 ,
        71.56158 ,  74.910706,  86.59426 ,  94.115974,  56.38783 ,
        60.042217,  89.98261 ,  84.0701  ,  82.25527 ,  82.25527 ,
        82.25527 ,  94.441956,  95.605034,  95.605034,  94.441956,
       101.46575 ,  91.06371 ,  87.78983 ,  87.78983 ,  91.472626,
        91.472626,  71.14993 ,  71.14993 ,  71.14993 ,  71.14993 ,
        71.14993 ,  71.14993 ,  71.14993 ,  71.14993 ,  88.92742 ,
        97.961525,  97.165855,  84.26824 ,  84.26824 ,  84.26824 ,
        84.26824 ,  84.26824 ,  84.26824 ,  90.39915 ,  81.5772  ,
        85.07627 ,  86.87396 ,  81.5772  ,  86.907585,  9

In [5]:
for i in predictions_list:
    print(group.evaluate(i))

group.evaluate_many(predictions_list)

{'ppbr_az': {'mae': 8.911}}
{'ppbr_az': {'mae': 9.417}}
{'ppbr_az': {'mae': 8.881}}
{'ppbr_az': {'mae': 9.093}}
{'ppbr_az': {'mae': 9.092}}


{'ppbr_az': [9.079, 0.191]}