In [2]:
from tdc.single_pred import Tox
import deepchem
import xgboost
import rdkit
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem.EState import EState

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak
from sklearn.model_selection import GridSearchCV

from tdc.benchmark_group import admet_group
from tdc import utils
group = admet_group(path = 'data/')
from tdc import Evaluator

Found local copy...


In [5]:
maeMetric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.mean_absolute_error)
negMaeMetric = deepchem.deepchem.metrics.Metric(lambda *args: -1.0 * deepchem.deepchem.metrics.mean_absolute_error(*args), mode="regression")
ecfpFeat = deepchem.deepchem.feat.CircularFingerprint(radius=4)
convMolFeat = deepchem.deepchem.feat.ConvMolFeaturizer()
xgb_reg_ecfp = xgboost.XGBRegressor()
xgb_reg_estate = xgboost.XGBRegressor()

spearmanMetric = Evaluator(name = "Spearman")
spearmanMetricWrapped = deepchem.deepchem.metrics.Metric(spearmanMetric, name="spearman", mode="regression")

#see reg_tuning_condensed.ipynb for up-to-date code, may want to verify that ecfp matches ecfp and estate matches estate
"""params = {'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}"""

params = {'max_depth': [3,4]}
regressionDatasets = ["Caco2_Wang", 'Lipophilicity_AstraZeneca', 'Solubility_AqSolDB', 'PPBR_AZ', 'VDss_Lombardo', 'Half_Life_Obach', "Clearance_Hepatocyte_AZ", "Clearance_Microsome_AZ", 'LD50_Zhu']

for dataset in ["Half_Life_Obach"]:

    xgbScoring = None
    gcnTuneScoring = None
    gcnScoring = None

    if(dataset in ["VDss_Lombardo", "Half_Life_Obach", "Clearance_Hepatocyte_AZ", "Clearance_Microsome_AZ"]):
       gcnTuneScoring = spearmanMetricWrapped
       #higher spearman is better
       xgbScoring = spearmanMetric
       gcnScoring = spearmanMetricWrapped
    else:
           xgbScoring = deepchem.deepchem.metrics.mae_score
           gcnTuneScoring = negMaeMetric
           gcnScoring = maeMetric

    benchmark = group.get(dataset)
    name = benchmark["name"]
    train_val, test = benchmark['train_val'], benchmark['test']
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = 1)  
    results = {}
    results["name"] = name

    train_valMol = train_val.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    testMol = test.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))

    #featurize training, valid, and test data for xgboost (estate)
    trainValEstateFeat = np.stack(np.array(train_valMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    testEstateFeat = np.stack(np.array(testMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))

    #featurize training, valid, and test data for xgboost (ecfp)
    trainValEcfpFeat = ecfpFeat.featurize(train_val.iloc[:,1].to_list())
    testEcfpFeat = ecfpFeat.featurize(test.iloc[:,1].to_list())

    #featurize training, valid, and test data for the GCN
    trainGcnFeat = convMolFeat.featurize(train.iloc[:,1].to_list())
    validGcnFeat = convMolFeat.featurize(valid.iloc[:,1].to_list())
    testGcnFeat = convMolFeat.featurize(test.iloc[:,1].to_list())

    #convert training and validation data into a deepchem dataset for the gcn
    gcn_train_data = deepchem.deepchem.data.NumpyDataset(X=trainGcnFeat, y=np.array(train.iloc[:,2]), ids=np.array(train.iloc[:,1].to_list()))
    gcn_valid_data = deepchem.deepchem.data.NumpyDataset(X=validGcnFeat, y=np.array(valid.iloc[:,2]), ids=np.array(valid.iloc[:,1].to_list()))

    #tune estate regressor
    clfEstate = GridSearchCV(estimator=xgb_reg_estate, param_grid=params, scoring=xgbScoring, verbose=1, cv=10)
    clfEstate.fit(trainValEstateFeat, train_val.iloc[:,2])
    results["estate"] = clfEstate.best_params_

    #tune ecfp regressor
    clfEcfp = GridSearchCV(estimator=xgb_reg_ecfp, param_grid=params, scoring=xgbScoring, verbose=1, cv=10)
    clfEcfp.fit(trainValEcfpFeat, train_val.iloc[:,2])
    results["ecfp"] = clfEcfp.best_params_

    def model_builder(**model_params):
       return deepchem.deepchem.models.GraphConvModel(n_tasks=1, mode="regression", graph_conv_layers=[128, 128, 64])

    optimizer = deepchem.deepchem.hyper.GaussianProcessHyperparamOpt(model_builder)

    params_dict = {"dropout": .01,} #"dense_layer_size": 200}

    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict, gcn_train_data, gcn_valid_data, gcnTuneScoring, search_range=2)
    results["gcn"] = best_hyperparams

    print(results)

    predictions_list = []

    for seed in [1, 2, 3, 4, 5]:

      predictions = {}

      trainBench, validBench = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)
      ecfp_benchmark_reg = xgboost.XGBRegressor(max_depth=results["ecfp"]["max_depth"])
      estate_benchmark_reg = xgboost.XGBRegressor(max_depth=results["estate"]["max_depth"])

      #colsample_bytree=0.3, learning_rate=0.01, max_depth=10, n_estimators=5000

      trainGcnFeatBench = convMolFeat.featurize(trainBench.iloc[:,1].to_list())
      validGcnFeatBench = convMolFeat.featurize(validBench.iloc[:,1].to_list())

      gcn_bench_train_data = deepchem.deepchem.data.NumpyDataset(X=trainGcnFeatBench, y=np.array(trainBench.iloc[:,2]), ids=np.array(trainBench.iloc[:,1].to_list()))
      gcn_bench_valid_data = deepchem.deepchem.data.NumpyDataset(X=validGcnFeatBench, y=np.array(validBench.iloc[:,2]), ids=np.array(validBench.iloc[:,1].to_list()))

      reg = deepchem.deepchem.models.GraphConvModel(
         n_tasks=1, 
         dropout=results["gcn"]["dropout"],
         dense_layer_size=results["gcn"]["dense_layer_size"],
         graph_conv_layers=[128, 128, 64],
         mode="regression",)
      callback = deepchem.deepchem.models.ValidationCallback(gcn_bench_valid_data, 1000, gcnScoring)
      reg.fit(gcn_bench_train_data, nb_epoch=100, callbacks=callback)

      #I don't think eval_metric does anything if there's no eval_set specified,
      #but if there is one, I think you're supposed to pass a string like "mae"
      estate_benchmark_reg.fit(X=trainValEstateFeat, y=train_val.iloc[:,2], eval_metric=xgbScoring, verbose=True)
      ecfp_benchmark_reg.fit(X=trainValEcfpFeat, y=train_val.iloc[:,2], eval_metric=xgbScoring)
      
      pred_estate = estate_benchmark_reg.predict(X=testEstateFeat)
      pred_ecfp = ecfp_benchmark_reg.predict(X=testEcfpFeat)

      gcn_pred_raw = reg.predict_on_batch(X=np.array(testGcnFeat))
      gcn_pred = gcn_pred_raw.reshape(gcn_pred_raw.size,)
      #note: reshaping is necessary for averaging I believe
      print(gcn_pred[0:5])
      print(pred_ecfp[0:5])
      print(pred_estate[0:5])

      y_pred_test = np.mean([ pred_estate, pred_ecfp, gcn_pred ], axis=0)

      print("predictions for run #")
      print(seed)
      print(y_pred_test[0:5])

      predictions[name] = y_pred_test
      predictions_list.append(predictions)

    print(predictions_list)
    print(group.evaluate_many(predictions_list))


generating training, validation splits...
100%|██████████| 532/532 [00:00<00:00, 1889.68it/s]


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Evaluation 	 Proposed point 	  Current eval. 	 Best eval.
init   	 [0.0107581]. 	  -0.06729128666269002 	 -0.015565600117704097
init   	 [0.00777513]. 	  -0.08509194731011573 	 -0.015565600117704097
init   	 [0.00619442]. 	  -0.015565600117704097 	 -0.015565600117704097
1      	 [0.00509862]. 	  [92m0.05475898297817954[0m 	 0.05475898297817954
2      	 [0.00509465]. 	  -0.09085920171270095 	 0.05475898297817954
3      	 [0.01170728]. 	  -0.050987318334274306 	 0.05475898297817954
4      	 [0.00769253]. 	  0.037956117210093836 	 0.05475898297817954
5      	 [0.01667891]. 	  0.02921543406707538 	 0.05475898297817954
6      	 [0.01989735]. 	  [92m0.1316091445849468[0m 	 0.1316091445849468
7      	 [0.01980007]. 	  0.05104718602703471 	 0.1316091445849468
8      	 [0.0197807]. 	  0.04268566493816545 	 0.1316091445849468
9      	 [0.00855546]. 	  0.048931860882833904 	

generating training, validation splits...


20     	 [0.0093611]. 	  -0.024465930441416952 	 0.1316091445849468
half_life_obach
{'estate': {'max_depth': 3}, 'ecfp': {'max_depth': 3}, 'gcn': {'dropout': 0.019897350391822177}}


100%|██████████| 532/532 [00:00<00:00, 1857.01it/s]
generating training, validation splits...


[-15.481456   -8.344112  -10.045974   -1.4040323  12.368832 ]
[ 8.064317   6.1167135  6.3028345  9.737072  10.640959 ]
[  9.039435   17.740782   13.975753    9.2034025 175.7673   ]
predictions for run #
1
[ 0.54076546  5.1711273   3.4108713   5.8454814  66.25903   ]


100%|██████████| 532/532 [00:00<00:00, 1770.40it/s]
generating training, validation splits...


[ 1.7501738  6.052224   8.040025  18.19757   16.304451 ]
[ 8.064317   6.1167135  6.3028345  9.737072  10.640959 ]
[  9.039435   17.740782   13.975753    9.2034025 175.7673   ]
predictions for run #
2
[ 6.2846417  9.969906   9.439537  12.379349  67.5709   ]


100%|██████████| 532/532 [00:00<00:00, 2063.43it/s]
generating training, validation splits...


[-10.170633   -5.4849772  -4.070062   -6.2313833  12.9233675]
[ 8.064317   6.1167135  6.3028345  9.737072  10.640959 ]
[  9.039435   17.740782   13.975753    9.2034025 175.7673   ]
predictions for run #
3
[ 2.3110397  6.124172   5.4028416  4.236364  66.44388  ]


100%|██████████| 532/532 [00:00<00:00, 2128.93it/s]
generating training, validation splits...


[-11.581478   -1.1550381  -1.5624142   4.4190097   9.5583105]
[ 8.064317   6.1167135  6.3028345  9.737072  10.640959 ]
[  9.039435   17.740782   13.975753    9.2034025 175.7673   ]
predictions for run #
4
[ 1.840758   7.5674853  6.238724   7.786495  65.32219  ]


100%|██████████| 532/532 [00:00<00:00, 2014.58it/s]


[-8.030292   -0.64624554 -7.6894364  -5.872697   12.654764  ]
[ 8.064317   6.1167135  6.3028345  9.737072  10.640959 ]
[  9.039435   17.740782   13.975753    9.2034025 175.7673   ]
predictions for run #
5
[ 3.0244868  7.737083   4.196384   4.355926  66.35435  ]
[{'half_life_obach': array([ 3.02448678e+00,  7.73708296e+00,  4.19638395e+00,  4.35592604e+00,
        6.63543472e+01,  6.91317215e+01,  1.31030045e+01,  1.82153320e+01,
        1.10988960e+02,  7.50078726e+00,  9.55292702e+00,  1.11254776e+00,
        3.70574212e+00,  7.71888971e+00,  3.83668804e+00,  7.69552803e+00,
        1.93740128e+02,  1.88965664e+01, -2.54949498e+00, -3.48212743e+00,
       -3.48212743e+00,  1.29562654e+01,  2.06139469e+01,  1.66037731e+01,
        1.01120958e+01,  1.80028343e+00,  4.54975281e+01,  2.89217734e+00,
        7.09040213e+00,  7.57124405e+01,  5.23017120e+01,  1.36013803e+01,
        5.94477081e+00,  1.23568268e+01,  3.54687762e+00,  3.87125897e+00,
        3.09970856e+00,  6.53628397e+00,  

In [6]:
group.evaluate_many(predictions_list)

{'half_life_obach': [0.341, 0.0]}