<a href="https://colab.research.google.com/github/gitconnoisseur/toxicity-model/blob/master/ecfp_xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#import autogluon as ag
from tdc.single_pred import Tox
import deepchem
import xgboost

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

from tdc.benchmark_group import admet_group
# Benchmark group handle used by every cell below (`group.get`,
# `group.get_train_valid_split`, `group.evaluate`, `group.evaluate_many`).
# Benchmark files are looked up under the relative 'data/' directory.
group = admet_group(path = 'data/')

Found local copy...


Run the benchmark with a two-model ensemble: the test predictions of a graph convolutional network (GCN, using deepchem's `ConvMolFeaturizer`) are averaged with those of an XGBoost regressor trained on ECFP fingerprints:

In [9]:
# Benchmark an ensemble on one TDC ADMET regression task: the test predictions
# of a deepchem GraphConvModel (ConvMol features) are averaged element-wise
# with those of an XGBoost regressor (circular/ECFP fingerprints). One GCN is
# trained per seed; each seed's predictions are collected in `predictions_list`
# as {benchmark_name: ndarray} for `group.evaluate` / `group.evaluate_many`.
BENCHMARK_NAME = 'LD50_Zhu'
SEEDS = [1, 2, 3, 4, 5]

predictions_list = []
metric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.mean_absolute_error)
convMolFeat = deepchem.deepchem.feat.ConvMolFeaturizer()
ecfpFeat = deepchem.deepchem.feat.CircularFingerprint(radius=4)
xgb_reg = xgboost.XGBRegressor()

# ---------------------------------------------------------------------------
# Seed-independent work: done once instead of once per seed.
# ---------------------------------------------------------------------------
benchmark = group.get(BENCHMARK_NAME)
name = benchmark['name']
train_val, test = benchmark['train_val'], benchmark['test']

# Column 1 of each frame is what gets featurized (presumably the SMILES
# string -- confirm against the TDC frame layout); column 2 is the target.
cv_f_test = convMolFeat.featurize(test.iloc[:,1].to_list())
ecfp_f_train_val = ecfpFeat.featurize(train_val.iloc[:,1].to_list())
ecfp_f_test = ecfpFeat.featurize(test.iloc[:,1].to_list())

# TODO: implement validation splits for xgboost
# Until then XGBoost trains on the full train_val set, which does not depend
# on the seed, so it is fitted and evaluated a single time. Move this back
# into the loop once it consumes the per-seed train/valid split.
xgb_reg.fit(X=ecfp_f_train_val, y=train_val.iloc[:,2], eval_metric="mae")
xgb_pred = xgb_reg.predict(X=ecfp_f_test)

# ---------------------------------------------------------------------------
# Per-seed work: new train/valid split and a freshly trained GCN.
# ---------------------------------------------------------------------------
for seed in SEEDS:
    predictions = {}
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)

    # Tie the GCN's weight initialisation and batch shuffling to the split
    # seed so a run can be repeated (GPU kernels may still add small
    # run-to-run differences).
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # featurize training and validation data for the GCN
    cv_f_train = convMolFeat.featurize(train.iloc[:,1].to_list())
    cv_f_valid = convMolFeat.featurize(valid.iloc[:,1].to_list())

    # wrap features, targets and molecule ids in deepchem datasets for the GCN
    gcn_train_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_train, y=np.array(train.iloc[:,2]), ids=np.array(train.iloc[:,1].to_list()))
    gcn_valid_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_valid, y=np.array(valid.iloc[:,2]), ids=np.array(valid.iloc[:,1].to_list()))

    # fit the GCN; dropout / dense_layer_size are presumably the outcome of an
    # earlier hyperparameter search -- confirm before changing them
    reg = deepchem.deepchem.models.GraphConvModel(
        n_tasks=1,
        dropout=.0005,
        dense_layer_size=1063,
        graph_conv_layers=[128, 128, 128],
        mode="regression",)
    # report validation MAE every 1000 training steps
    callback = deepchem.deepchem.models.ValidationCallback(gcn_valid_data, 1000, metric)
    reg.fit(gcn_train_data, nb_epoch=100, callbacks=callback)

    # predict on the test set and flatten to 1-D; reshape(-1) works for any
    # test-set size instead of hard-coding this benchmark's 1478 rows
    gcn_pred = reg.predict_on_batch(X=np.array(cv_f_test)).reshape(-1)

    # guard against silent broadcasting when the two models are averaged
    assert gcn_pred.shape == xgb_pred.shape, (gcn_pred.shape, xgb_pred.shape)

    # ensemble: element-wise mean of the two models' test predictions
    y_pred_test = np.mean([ gcn_pred, xgb_pred ], axis=0)

    print("predictions for run #")
    print(seed)
    print(y_pred_test[0:5])

    predictions[name] = y_pred_test
    predictions_list.append(predictions)

print("Prediction List:")
print(predictions_list)

generating training, validation splits...
100%|██████████| 5907/5907 [00:01<00:00, 4482.90it/s]


Step 1000 validation: mean_absolute_error=0.644428
Step 2000 validation: mean_absolute_error=0.562787
Step 3000 validation: mean_absolute_error=0.57614
Step 4000 validation: mean_absolute_error=0.61391
Step 5000 validation: mean_absolute_error=0.572617


generating training, validation splits...


[2.1937973 3.0208912 3.0537117 ... 2.5082133 2.045674  2.0345955]
(1478,)
<class 'numpy.ndarray'>
[2.1006067 2.2473924 3.3978975 ... 2.4514642 2.0766413 2.1703484]
(1478,)
<class 'numpy.ndarray'>
predictions for run #
1
[2.147202  2.634142  3.2258046 3.9417586 4.21091  ]


100%|██████████| 5907/5907 [00:01<00:00, 4286.31it/s]


Step 1000 validation: mean_absolute_error=0.558785
Step 2000 validation: mean_absolute_error=0.604676
Step 3000 validation: mean_absolute_error=0.548045
Step 4000 validation: mean_absolute_error=0.527643
Step 5000 validation: mean_absolute_error=0.510725


generating training, validation splits...


[2.5492542 3.2551484 2.5519679 ... 2.2081149 2.0613754 2.5779355]
(1478,)
<class 'numpy.ndarray'>
[2.1006067 2.2473924 3.3978975 ... 2.4514642 2.0766413 2.1703484]
(1478,)
<class 'numpy.ndarray'>
predictions for run #
2
[2.3249304 2.7512703 2.9749327 3.8157382 4.03194  ]


100%|██████████| 5907/5907 [00:01<00:00, 3881.10it/s]


Step 1000 validation: mean_absolute_error=0.689797
Step 2000 validation: mean_absolute_error=0.671639
Step 3000 validation: mean_absolute_error=0.660826
Step 4000 validation: mean_absolute_error=0.638563


generating training, validation splits...


[2.1824887 2.5029848 2.843361  ... 1.9643127 2.1949472 2.2514832]
(1478,)
<class 'numpy.ndarray'>
[2.1006067 2.2473924 3.3978975 ... 2.4514642 2.0766413 2.1703484]
(1478,)
<class 'numpy.ndarray'>
predictions for run #
3
[2.1415477 2.3751886 3.1206293 3.9648597 4.130946 ]


100%|██████████| 5907/5907 [00:01<00:00, 4479.93it/s]


Step 1000 validation: mean_absolute_error=0.605699
Step 2000 validation: mean_absolute_error=0.567268
Step 3000 validation: mean_absolute_error=0.56645
Step 4000 validation: mean_absolute_error=0.5562
Step 5000 validation: mean_absolute_error=0.544577


generating training, validation splits...


[2.786223  2.410971  2.6042209 ... 2.4439466 1.9954015 2.169179 ]
(1478,)
<class 'numpy.ndarray'>
[2.1006067 2.2473924 3.3978975 ... 2.4514642 2.0766413 2.1703484]
(1478,)
<class 'numpy.ndarray'>
predictions for run #
4
[2.4434147 2.3291817 3.001059  3.8187964 4.2881036]


100%|██████████| 5907/5907 [00:01<00:00, 3823.62it/s]


Step 1000 validation: mean_absolute_error=0.584592
Step 2000 validation: mean_absolute_error=0.52629
Step 3000 validation: mean_absolute_error=0.540046
Step 4000 validation: mean_absolute_error=0.555672
[1.9449487 3.0889635 3.526101  ... 2.4635382 2.1035762 2.9584775]
(1478,)
<class 'numpy.ndarray'>
[2.1006067 2.2473924 3.3978975 ... 2.4514642 2.0766413 2.1703484]
(1478,)
<class 'numpy.ndarray'>
predictions for run #
5
[2.0227776 2.668178  3.4619994 4.1749067 4.3649864]
Prediction List:
[{'ld50_zhu': array([2.147202 , 2.634142 , 3.2258046, ..., 2.4798388, 2.0611577,
       2.1024718], dtype=float32)}, {'ld50_zhu': array([2.3249304, 2.7512703, 2.9749327, ..., 2.3297896, 2.0690084,
       2.374142 ], dtype=float32)}, {'ld50_zhu': array([2.1415477, 2.3751886, 3.1206293, ..., 2.2078884, 2.1357942,
       2.2109158], dtype=float32)}, {'ld50_zhu': array([2.4434147, 2.3291817, 3.001059 , ..., 2.4477053, 2.0360215,
       2.1697636], dtype=float32)}, {'ld50_zhu': array([2.0227776, 2.668178 , 3

In [10]:
# Score every seed's prediction dict on its own and print the result ...
for seed_predictions in predictions_list:
    seed_scores = group.evaluate(seed_predictions)
    print(seed_scores)

# ... then aggregate over all seeds. As the cell's last expression, this value
# is what the notebook displays (presumably mean and standard deviation of the
# per-seed metric -- confirm against the TDC documentation).
group.evaluate_many(predictions_list)

{'ld50_zhu': {'mae': 0.622}}
{'ld50_zhu': {'mae': 0.641}}
{'ld50_zhu': {'mae': 0.595}}
{'ld50_zhu': {'mae': 0.628}}
{'ld50_zhu': {'mae': 0.603}}


{'ld50_zhu': [0.618, 0.017]}

In [14]:
#code below may be used for hyperparam opt as needed
# Gaussian-process search over the GCN's `dropout` and `dense_layer_size`,
# scored with R^2 on the validation dataset.

hyperOptMetric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.r2_score)

def model_builder(**model_params):
  """Build the GraphConvModel for one hyperparameter trial.

  Args:
    **model_params: keyword arguments supplied by the optimizer for this
      trial (here `dropout` and `dense_layer_size`, plus whatever else the
      optimizer passes, e.g. a per-trial model directory). They are forwarded
      to the model constructor; without forwarding, every trial would train
      the same default architecture and the search would be meaningless.

  Returns:
    A single-task regression GraphConvModel with three 128-unit graph
    convolution layers, configured with the trial's parameters.
  """
  # Fixed settings first; trial parameters are merged on top so a key that
  # appears in both does not raise a duplicate-keyword error.
  constructor_args = {
      "n_tasks": 1,
      "mode": "regression",
      "graph_conv_layers": [128, 128, 128],
  }
  constructor_args.update(model_params)
  return deepchem.deepchem.models.GraphConvModel(**constructor_args)

optimizer = deepchem.deepchem.hyper.GaussianProcessHyperparamOpt(model_builder)

# Centre of the search; with search_range=10 each value is explored from
# one tenth of to ten times the number given here.
params_dict = {"dropout": .01, "dense_layer_size": 200}

# NOTE(review): the original call used `train_data` / `valid_data`, which are
# not defined anywhere in this notebook. The datasets built by the benchmark
# cell are `gcn_train_data` / `gcn_valid_data` (those of the last seed), so
# they are used here -- confirm this is the intended split.
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict, gcn_train_data, gcn_valid_data, hyperOptMetric, search_range=10)

Evaluation 	 Proposed point 	  Current eval. 	 Best eval.
init   	 [9.54472202e-02 1.66300000e+03]. 	  0.05614210575885126 	 0.13022634235269537
init   	 [7.81778992e-02 1.95200000e+03]. 	  0.11803599063732828 	 0.13022634235269537
init   	 [6.78009843e-02 3.88000000e+02]. 	  0.13022634235269537 	 0.13022634235269537
1      	 [1.21594231e-02 4.02000000e+02]. 	  0.10297839213545745 	 0.13022634235269537
2      	 [5.63577392e-02 1.94996902e+03]. 	  0.07543380244710662 	 0.13022634235269537
3      	 [8.73771372e-02 3.99869355e+02]. 	  0.10647494762543674 	 0.13022634235269537
4      	 [5.73742248e-02 3.86042513e+02]. 	  -0.015072240034492701 	 0.13022634235269537
5      	 [1.44460922e-02 1.93100000e+03]. 	  0.04720652864226782 	 0.13022634235269537
6      	 [6.92807233e-02 1.93369230e+03]. 	  [92m0.21513004052653117[0m 	 0.21513004052653117
7      	 [1.00000000e-01 3.89993332e+02]. 	  0.08601514697821022 	 0.21513004052653117
8      	 [5.25671057e-03 1.94747276e+03]. 	  0.09740603500068

In [15]:
# Show the best hyperparameter combination returned by `hyperparam_search`.
print(best_hyperparams)

{'dropout': 0.0661305398871231, 'dense_layer_size': 397}
