# In [None]:
import deepchem
import xgboost
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
import numpy as np
from sklearn.model_selection import GridSearchCV
from tdc.benchmark_group import admet_group
from tdc import Evaluator

# Handle on the TDC ADMET benchmark group, rooted at the local data/ folder.
group = admet_group(path='data/')

# NOTE(review): every DeepChem symbol below is reached through
# `deepchem.deepchem`, which presumably reflects how the package is laid out
# in this environment -- confirm before simplifying to `deepchem.<module>`.
dc_metrics = deepchem.deepchem.metrics
dc_feat = deepchem.deepchem.feat

# --- Featurizers -----------------------------------------------------------
ecfpFeat = dc_feat.CircularFingerprint(radius=4)
convMolFeat = dc_feat.ConvMolFeaturizer()

# --- Base XGBoost estimators handed to the grid search ---------------------
xgb_reg_ecfp = xgboost.XGBRegressor()
xgb_reg_estate = xgboost.XGBRegressor()

# --- Metrics ---------------------------------------------------------------
# Plain MAE is used for reporting; the negated variant exists so that a
# maximising hyperparameter search can still minimise MAE.
maeMetric = dc_metrics.Metric(dc_metrics.mean_absolute_error)
negMaeMetric = dc_metrics.Metric(
    lambda *args: -1.0 * dc_metrics.mean_absolute_error(*args),
    mode="regression",
)
# TDC's Spearman evaluator, plus a DeepChem Metric wrapper around it.
spearmanMetric = Evaluator(name="Spearman")
spearmanMetricWrapped = dc_metrics.Metric(
    spearmanMetric,
    name="spearman",
    mode="regression",
)

# --- XGBoost hyperparameter grid -------------------------------------------
params = {
    'max_depth': [3, 4, 5, 6, 8, 10, 12],
    'learning_rate': [0.005, 0.01, 0.03, 0.05, 0.1, 0.15],
    'n_estimators': [100, 300, 500, 700, 900, 1100],
    'colsample_bytree': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

# --- Regression benchmarks to run ------------------------------------------
regressionDatasets = [
    "Caco2_Wang",
    'Lipophilicity_AstraZeneca',
    'Solubility_AqSolDB',
    'PPBR_AZ',
    'VDss_Lombardo',
    'Half_Life_Obach',
    "Clearance_Hepatocyte_AZ",
    "Clearance_Microsome_AZ",
    'LD50_Zhu',
]
# Benchmarks that are scored with Spearman correlation; every other dataset
# in `regressionDatasets` is scored with mean absolute error.
_SPEARMAN_DATASETS = frozenset([
    "VDss_Lombardo",
    "Half_Life_Obach",
    "Clearance_Hepatocyte_AZ",
    "Clearance_Microsome_AZ",
])


def _spearman_scorer(estimator, X, y):
    """Return the Spearman score of a fitted estimator on (X, y).

    scikit-learn calls callable scorers as ``scorer(estimator, X, y)``, so a
    ``(y_true, y_pred)`` metric such as the TDC evaluator must be adapted
    rather than passed to ``GridSearchCV`` directly.
    """
    return spearmanMetric(y, estimator.predict(X))


def _estate_features(smiles):
    """Stack the EState fingerprint (element [1] of FingerprintMol) per SMILES."""
    return np.stack([
        Fingerprinter.FingerprintMol(Chem.MolFromSmiles(smi))[1]
        for smi in smiles
    ])


def _gcn_dataset(frame):
    """Build a DeepChem NumpyDataset of ConvMol features from a benchmark frame.

    Column 1 is read as SMILES (used for both features and ids) and column 2
    as the regression target.
    """
    smiles = frame.iloc[:, 1].to_list()
    return deepchem.deepchem.data.NumpyDataset(
        X=convMolFeat.featurize(smiles),
        y=np.array(frame.iloc[:, 2]),
        ids=np.array(smiles),
    )


def model_builder(**model_params):
    """Build the GraphConv regressor for one hyperparameter-search trial.

    The search's proposed parameters must be forwarded to the model;
    otherwise every trial trains the same default network and the search
    result is meaningless.
    """
    return deepchem.deepchem.models.GraphConvModel(
        n_tasks=1,
        mode="regression",
        graph_conv_layers=[128, 128, 64],
        **model_params
    )


for dataset in regressionDatasets:
    # Pick the metric family for this benchmark.
    if dataset in _SPEARMAN_DATASETS:
        xgbScoring = _spearman_scorer
        gcnTuneScoring = spearmanMetricWrapped
        gcnScoring = spearmanMetricWrapped
    else:
        # GridSearchCV maximises its score, so MAE has to be negated.
        xgbScoring = "neg_mean_absolute_error"
        gcnTuneScoring = negMaeMetric
        gcnScoring = maeMetric

    benchmark = group.get(dataset)
    name = benchmark["name"]
    train_val, test = benchmark["train_val"], benchmark["test"]
    # This seed-1 split is only used for tuning the GCN hyperparameters.
    train, valid = group.get_train_valid_split(
        benchmark=name, split_type="default", seed=1)
    results = {"name": name}

    # --- Featurisation ----------------------------------------------------
    train_val_smiles = train_val.iloc[:, 1].to_list()
    test_smiles = test.iloc[:, 1].to_list()
    train_val_y = train_val.iloc[:, 2]

    trainValEstateFeat = _estate_features(train_val_smiles)
    testEstateFeat = _estate_features(test_smiles)
    trainValEcfpFeat = ecfpFeat.featurize(train_val_smiles)
    testEcfpFeat = ecfpFeat.featurize(test_smiles)
    testGcnFeat = convMolFeat.featurize(test_smiles)
    gcn_train_data = _gcn_dataset(train)
    gcn_valid_data = _gcn_dataset(valid)

    # --- XGBoost hyperparameter search (10-fold CV on train+valid) --------
    clfEstate = GridSearchCV(
        estimator=xgb_reg_estate, param_grid=params, scoring=xgbScoring, cv=10)
    clfEstate.fit(trainValEstateFeat, train_val_y)
    results["estate"] = clfEstate.best_params_

    clfEcfp = GridSearchCV(
        estimator=xgb_reg_ecfp, param_grid=params, scoring=xgbScoring, cv=10)
    clfEcfp.fit(trainValEcfpFeat, train_val_y)
    results["ecfp"] = clfEcfp.best_params_

    # --- GCN hyperparameter search ----------------------------------------
    optimizer = deepchem.deepchem.hyper.GaussianProcessHyperparamOpt(
        model_builder)
    params_dict = {"dropout": .01, "dense_layer_size": 500}
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict, gcn_train_data, gcn_valid_data, gcnTuneScoring,
        search_range=20)
    results["gcn"] = best_hyperparams
    print(results)

    # --- Benchmark: one ensemble prediction per seed ----------------------
    predictions_list = []
    for seed in [1, 2, 3, 4, 5]:
        trainBench, validBench = group.get_train_valid_split(
            benchmark=name, split_type='default', seed=seed)

        # best_params_ holds exactly the keys of the `params` grid, so it can
        # be splatted straight into the constructor. The run seed is passed
        # on so the tree ensembles actually differ between runs when column
        # subsampling is active.
        ecfp_benchmark_reg = xgboost.XGBRegressor(
            random_state=seed, **results["ecfp"])
        estate_benchmark_reg = xgboost.XGBRegressor(
            random_state=seed, **results["estate"])

        gcn_bench_train_data = _gcn_dataset(trainBench)
        gcn_bench_valid_data = _gcn_dataset(validBench)
        reg = deepchem.deepchem.models.GraphConvModel(
            n_tasks=1,
            dropout=results["gcn"]["dropout"],
            dense_layer_size=results["gcn"]["dense_layer_size"],
            graph_conv_layers=[128, 128, 64],
            mode="regression",
        )
        # Report the validation metric every 1000 training steps.
        callback = deepchem.deepchem.models.ValidationCallback(
            gcn_bench_valid_data, 1000, [gcnScoring])
        reg.fit(gcn_bench_train_data, nb_epoch=100, callbacks=callback)

        # No eval_set is supplied, so no eval_metric is passed to fit().
        estate_benchmark_reg.fit(X=trainValEstateFeat, y=train_val_y)
        ecfp_benchmark_reg.fit(X=trainValEcfpFeat, y=train_val_y)

        pred_estate = estate_benchmark_reg.predict(X=testEstateFeat)
        pred_ecfp = ecfp_benchmark_reg.predict(X=testEcfpFeat)
        gcn_pred_raw = reg.predict_on_batch(X=np.array(testGcnFeat))
        gcn_pred = gcn_pred_raw.reshape(gcn_pred_raw.size,)

        # Unweighted average of the three ensemble members.
        y_pred_test = np.mean([pred_estate, pred_ecfp, gcn_pred], axis=0)
        predictions_list.append({name: y_pred_test})

    print(group.evaluate_many(predictions_list))
