<a href="https://colab.research.google.com/github/gitconnoisseur/toxicity-model/blob/master/ecfp_xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import autogluon as ag
from tdc.single_pred import Tox
import deepchem
import xgboost

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')

Found local copy...


Run benchmark combining the predictions of a GCN with deepchem's `ConvMolFeaturizer` and xgboost with an ECFP featurizer:

In [3]:
predictions_list = []
metric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.mean_absolute_error)
convMolFeat = deepchem.deepchem.feat.ConvMolFeaturizer()
ecfpFeat = deepchem.deepchem.feat.CircularFingerprint(radius=4)
xgb_reg = xgboost.XGBRegressor()

for seed in [1, 2, 3, 4, 5]:
    benchmark = group.get('LD50_Zhu') 
    
    predictions = {}
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)  

    #featurize training, valid, and test data for the GCN
    cv_f_train = convMolFeat.featurize(train.iloc[:,1].to_list())
    cv_f_valid = convMolFeat.featurize(valid.iloc[:,1].to_list())
    cv_f_test = convMolFeat.featurize(test.iloc[:,1].to_list())

    #featurize training, valid, and test data for xgboost
    ecfp_f_train_val = ecfpFeat.featurize(train_val.iloc[:,1].to_list())
    ecfp_f_test = ecfpFeat.featurize(test.iloc[:,1].to_list())

    # TODO: implement validation splits for xgboost

    #convert training and validation data into a deepchem dataset for the gcn
    gcn_train_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_train, y=np.array(train.iloc[:,2]), ids=np.array(train.iloc[:,1].to_list()))
    gcn_valid_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_valid, y=np.array(valid.iloc[:,2]), ids=np.array(valid.iloc[:,1].to_list()))

    #fit data on GCN
    reg = deepchem.deepchem.models.GraphConvModel(
        n_tasks=1, 
        dropout=.0005,
        dense_layer_size=1063,
        graph_conv_layers=[128, 128, 128],
        mode="regression",)
    callback = deepchem.deepchem.models.ValidationCallback(gcn_valid_data, 1000, metric)
    reg.fit(gcn_train_data, nb_epoch=100, callbacks=callback)

    #predict values on gcn and reshape array
    gcn_pred = reg.predict_on_batch(X=np.array(cv_f_test)).reshape(1478,)
    
    #fit xgboost model and store np ndarray in xgb_pred
    xgb_reg.fit(X=ecfp_f_train_val, y=train_val.iloc[:,2], eval_metric="mae")
    xgb_pred = xgb_reg.predict(X=ecfp_f_test)

    # store test predictions in y_pred_test
    y_pred_test = np.mean([ gcn_pred, xgb_pred ], axis=0)

    print("predictions for run #")
    print(seed)
    print(y_pred_test[0:5])
        
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

print("Prediction List:")
print(predictions_list)

generating training, validation splits...
100%|██████████| 5907/5907 [00:02<00:00, 2883.09it/s]


done featurizing


KeyboardInterrupt: 

In [10]:
for i in predictions_list:
    print(group.evaluate(i))

group.evaluate_many(predictions_list)

{'ld50_zhu': {'mae': 0.622}}
{'ld50_zhu': {'mae': 0.641}}
{'ld50_zhu': {'mae': 0.595}}
{'ld50_zhu': {'mae': 0.628}}
{'ld50_zhu': {'mae': 0.603}}


{'ld50_zhu': [0.618, 0.017]}

In [10]:
# multiply mae by negative one so it can be maximized
hyperOptMetric = deepchem.deepchem.metrics.Metric(lambda *args: -1.0 * deepchem.deepchem.metrics.mean_absolute_error(*args), mode="regression")


def model_builder(**model_params):
  return deepchem.deepchem.models.GraphConvModel(n_tasks=1, mode="regression", graph_conv_layers=[128, 128, 128])

#optimizer = deepchem.deepchem.hyper.GaussianProcessHyperparamOpt(model_builder)

optimizer = deepchem.deepchem.hyper.GaussianProcessHyperparamOpt(model_builder)

params_dict = {"dropout": .01, "dense_layer_size": 200}

best_model, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict, gcn_train_data, gcn_valid_data, hyperOptMetric, search_range=10)

Evaluation 	 Proposed point 	  Current eval. 	 Best eval.
init   	 [9.89697716e-02 3.53000000e+02]. 	  -0.6180186351910334 	 -0.6042042773573261
init   	 [7.96162767e-02 1.78900000e+03]. 	  -0.6188608646463799 	 -0.6042042773573261
init   	 [1.34372558e-02 2.04000000e+02]. 	  -0.6042042773573261 	 -0.6042042773573261


KeyboardInterrupt: 

In [9]:
print(best_hyperparams)
print(all_results)

{'dropout': 0.08443353692030406, 'dense_layer_size': 775}
{'_dense_layer_size_1928_dropout_0.026056': -0.6161189073270971, '_dense_layer_size_1748_dropout_0.016193': -0.6064113871286622, '_dense_layer_size_59_dropout_0.053056': -0.5984787022322215, '_dense_layer_size_1326_dropout_0.007285': -0.6412197289047448, '_dense_layer_size_1903_dropout_0.057144': -0.6024587194909907, '_dense_layer_size_853_dropout_0.041442': -0.6136069268677005, '_dense_layer_size_647_dropout_0.047537': -0.6199212998163071, '_dense_layer_size_1627_dropout_0.026829': -0.6103059768141202, '_dense_layer_size_902_dropout_0.045080': -0.6157950049466144, '_dense_layer_size_775_dropout_0.084434': -0.592903416345826, '_dense_layer_size_1208_dropout_0.080337': -0.5943297103757949, '_dense_layer_size_471_dropout_0.035145': -0.6101408396199527, '_dense_layer_size_1837_dropout_0.008306': -0.622728939081561, '_dense_layer_size_1871_dropout_0.077215': -0.6009905486577903, '_dense_layer_size_594_dropout_0.008264': -0.625564075