In [1]:
from tdc.single_pred import Tox
import deepchem
import xgboost
import rdkit
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem.EState import EState

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

from tdc.benchmark_group import admet_group
# Build the TDC ADMET benchmark group, passing 'data/' as its data path.
group = admet_group(path='data/')

Found local copy...


In [3]:
# Ensemble benchmark on the 'Solubility_AqSolDB' task.
# For every split seed a DeepChem graph-convolution regressor is trained on the
# seed-specific train split, and its test predictions are averaged with those
# of two XGBoost regressors (one on EState features, one on ECFP features)
# that are fitted on the full train_val set.
predictions_list = []
metric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.mean_absolute_error)
ecfpFeat = deepchem.deepchem.feat.CircularFingerprint(radius=4)
convMolFeat = deepchem.deepchem.feat.ConvMolFeaturizer()
xgb_reg_ecfp = xgboost.XGBRegressor()
xgb_reg_estate = xgboost.XGBRegressor()

# ---------------------------------------------------------------------------
# Seed-independent work. `seed` is only passed to get_train_valid_split, so
# the benchmark frames, every train_val/test featurization and both XGBoost
# fits are the same for each run and are computed once, outside the loop.
# ---------------------------------------------------------------------------
benchmark = group.get('Solubility_AqSolDB')
name = benchmark['name']
train_val, test = benchmark['train_val'], benchmark['test']

# Column 1 holds the SMILES strings, column 2 the regression target.
testMol = test.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
train_valMol = train_val.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))

# EState features for the first XGBoost model (second element of FingerprintMol's result).
testFeat = np.stack(np.array(testMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
train_valFeat = np.stack(np.array(train_valMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))

# ECFP (circular fingerprint) features for the second XGBoost model.
ecfp_f_test = ecfpFeat.featurize(test.iloc[:,1].to_list())
train_val_f = ecfpFeat.featurize(train_val.iloc[:,1].to_list())

# Graph-convolution features of the test set for the GCN.
cv_f_test = convMolFeat.featurize(test.iloc[:,1].to_list())

# Fit both XGBoost models on train_val and predict the test set.
# The former `eval_metric=` / `verbose=` arguments are dropped: no eval_set was
# supplied, so nothing was ever evaluated or logged, and passing `eval_metric`
# to fit() is deprecated in recent xgboost releases.
xgb_reg_estate.fit(X=train_valFeat, y=train_val.iloc[:,2])
xgb_reg_ecfp.fit(X=train_val_f, y=train_val.iloc[:,2])
pred_estate = xgb_reg_estate.predict(X=testFeat)
pred_ecfp = xgb_reg_ecfp.predict(X=ecfp_f_test)

# ---------------------------------------------------------------------------
# Seed-dependent work: train/valid split and GCN training.
# ---------------------------------------------------------------------------
for seed in [1, 2, 3, 4, 5]:
    predictions = {}
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)

    # Featurize this seed's train and valid splits for the GCN.
    cv_f_train = convMolFeat.featurize(train.iloc[:,1].to_list())
    cv_f_valid = convMolFeat.featurize(valid.iloc[:,1].to_list())

    # Wrap the splits as DeepChem datasets (SMILES strings are used as ids).
    gcn_train_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_train, y=np.array(train.iloc[:,2]), ids=np.array(train.iloc[:,1].to_list()))
    gcn_valid_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_valid, y=np.array(valid.iloc[:,2]), ids=np.array(valid.iloc[:,1].to_list()))

    # Fit a fresh GCN for this seed; the callback reports the validation
    # metric every 1000 training steps.
    reg = deepchem.deepchem.models.GraphConvModel(
        n_tasks=1,
        dropout=.0005,
        dense_layer_size=1063,
        graph_conv_layers=[128, 128, 128],
        mode="regression",)
    callback = deepchem.deepchem.models.ValidationCallback(gcn_valid_data, 1000, metric)
    reg.fit(gcn_train_data, nb_epoch=100, callbacks=callback)

    # Flatten the GCN output to 1-D. reshape(-1) takes the length from the data
    # instead of hard-coding the test-set size (previously the literal 1997).
    gcn_pred = reg.predict_on_batch(X=np.array(cv_f_test)).reshape(-1)
    assert gcn_pred.shape == pred_estate.shape == pred_ecfp.shape, \
        "ensemble members must produce one prediction per test molecule"

    # Ensemble: unweighted mean of the three models' test predictions.
    y_pred_test = np.mean([ pred_estate, pred_ecfp, gcn_pred ], axis=0)

    print("predictions for run #")
    print(seed)
    print(y_pred_test[0:5])

    predictions[name] = y_pred_test
    predictions_list.append(predictions)

print("Prediction List:")
print(predictions_list)

generating training, validation splits...
100%|██████████| 7985/7985 [00:02<00:00, 3930.94it/s]


Step 1000 validation: mean_absolute_error=1.01274
Step 2000 validation: mean_absolute_error=0.953542
Step 3000 validation: mean_absolute_error=0.941146
Step 4000 validation: mean_absolute_error=0.973134
Step 5000 validation: mean_absolute_error=0.973608
Step 6000 validation: mean_absolute_error=1.00188
Step 7000 validation: mean_absolute_error=0.969157


generating training, validation splits...


predictions for run #
1
[-3.829607  -4.6302495 -3.958729  -3.3809106 -4.5026965]


100%|██████████| 7985/7985 [00:06<00:00, 1181.20it/s]


Step 1000 validation: mean_absolute_error=0.920314
Step 2000 validation: mean_absolute_error=0.897902
Step 3000 validation: mean_absolute_error=1.1382
Step 4000 validation: mean_absolute_error=0.94247
Step 5000 validation: mean_absolute_error=1.00255
Step 6000 validation: mean_absolute_error=1.03912
Step 7000 validation: mean_absolute_error=0.991543


generating training, validation splits...


predictions for run #
2
[-3.9606915 -4.623056  -4.190508  -3.4892704 -4.5091395]


100%|██████████| 7985/7985 [00:02<00:00, 3105.62it/s]


Step 1000 validation: mean_absolute_error=0.983907
Step 2000 validation: mean_absolute_error=0.938863
Step 3000 validation: mean_absolute_error=0.985724
Step 4000 validation: mean_absolute_error=0.989961
Step 5000 validation: mean_absolute_error=0.92776
Step 6000 validation: mean_absolute_error=1.02594
Step 7000 validation: mean_absolute_error=0.964027


generating training, validation splits...


predictions for run #
3
[-4.0738425 -4.897732  -4.158022  -3.3460114 -4.6705265]


100%|██████████| 7985/7985 [00:03<00:00, 2211.67it/s]


Step 1000 validation: mean_absolute_error=0.938398
Step 2000 validation: mean_absolute_error=0.942947
Step 3000 validation: mean_absolute_error=1.06161
Step 4000 validation: mean_absolute_error=0.926266
Step 5000 validation: mean_absolute_error=0.915771
Step 6000 validation: mean_absolute_error=0.906887
Step 7000 validation: mean_absolute_error=0.917885


generating training, validation splits...


predictions for run #
4
[-4.125597  -4.6695223 -3.8521497 -3.5364335 -4.260222 ]


100%|██████████| 7985/7985 [00:01<00:00, 4066.95it/s]


Step 1000 validation: mean_absolute_error=1.29827
Step 2000 validation: mean_absolute_error=1.22617
Step 3000 validation: mean_absolute_error=1.22662
Step 4000 validation: mean_absolute_error=1.25488
Step 5000 validation: mean_absolute_error=1.21568
predictions for run #
5
[-4.055109  -5.0469365 -4.181121  -3.378898  -4.8059783]
Prediction List:
[{'solubility_aqsoldb': array([-3.829607 , -4.6302495, -3.958729 , ..., -3.7829888, -3.374567 ,
       -5.6744065], dtype=float32)}, {'solubility_aqsoldb': array([-3.9606915, -4.623056 , -4.190508 , ..., -3.8388834, -3.2878044,
       -5.364433 ], dtype=float32)}, {'solubility_aqsoldb': array([-4.0738425, -4.897732 , -4.158022 , ..., -3.6012478, -3.326412 ,
       -5.1451993], dtype=float32)}, {'solubility_aqsoldb': array([-4.125597 , -4.6695223, -3.8521497, ..., -3.807085 , -3.5838299,
       -5.545088 ], dtype=float32)}, {'solubility_aqsoldb': array([-4.055109 , -5.0469365, -4.181121 , ..., -3.5897882, -3.0642164,
       -5.6462617], dtype=fl

In [4]:
# Score each run's predictions on its own, then pass all runs to evaluate_many.
for run_predictions in predictions_list:
    run_scores = group.evaluate(run_predictions)
    print(run_scores)

group.evaluate_many(predictions_list)

{'solubility_aqsoldb': {'mae': 0.941}}
{'solubility_aqsoldb': {'mae': 0.932}}
{'solubility_aqsoldb': {'mae': 0.938}}
{'solubility_aqsoldb': {'mae': 0.927}}
{'solubility_aqsoldb': {'mae': 0.92}}


{'solubility_aqsoldb': [0.932, 0.008]}