In [1]:
from tdc.single_pred import Tox
import deepchem
import xgboost
import rdkit
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem.EState import EState

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')

Found local copy...


In [3]:
predictions_list = []
metric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.mean_absolute_error)
ecfpFeat = deepchem.deepchem.feat.CircularFingerprint(radius=4)
convMolFeat = deepchem.deepchem.feat.ConvMolFeaturizer()
xgb_reg_ecfp = xgboost.XGBRegressor()
xgb_reg_estate = xgboost.XGBRegressor()

for seed in [1, 2, 3, 4, 5]:
    benchmark = group.get('Lipophilicity_AstraZeneca') 
    
    predictions = {}
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)  

    #trainMol = train.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    #validMol = valid.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    testMol = test.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    train_valMol = train_val.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))

    #featurize training, valid, and test data for xgboost
    #trainFeat = np.stack(np.array(trainMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    #validFeat = np.stack(np.array(validMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    testFeat = np.stack(np.array(testMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    train_valFeat = np.stack(np.array(train_valMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))

    #featurize training, valid, and test data for xgboost
    #ecfp_f_train = ecfpFeat.featurize(train.iloc[:,1].to_list())
    #ecfp_f_valid = ecfpFeat.featurize(valid.iloc[:,1].to_list())
    ecfp_f_test = ecfpFeat.featurize(test.iloc[:,1].to_list())
    train_val_f = ecfpFeat.featurize(train_val.iloc[:,1].to_list())

    #featurize training, valid, and test data for the GCN
    cv_f_train = convMolFeat.featurize(train.iloc[:,1].to_list())
    cv_f_valid = convMolFeat.featurize(valid.iloc[:,1].to_list())
    cv_f_test = convMolFeat.featurize(test.iloc[:,1].to_list())

    #convert training and validation data into a deepchem dataset for the gcn
    gcn_train_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_train, y=np.array(train.iloc[:,2]), ids=np.array(train.iloc[:,1].to_list()))
    gcn_valid_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_valid, y=np.array(valid.iloc[:,2]), ids=np.array(valid.iloc[:,1].to_list()))

    #fit data on GCN
    reg = deepchem.deepchem.models.GraphConvModel(
        n_tasks=1, 
        dropout=.0005,
        dense_layer_size=1063,
        graph_conv_layers=[128, 128, 128],
        mode="regression",)
    callback = deepchem.deepchem.models.ValidationCallback(gcn_valid_data, 1000, metric)
    reg.fit(gcn_train_data, nb_epoch=100, callbacks=callback)

    #predict values on gcn and reshape array (reshape value is 20% of dataset size)
    gcn_pred = reg.predict_on_batch(X=np.array(cv_f_test)).reshape(840,)

    #fit xgboost model and store np ndarray in xgb_pred
    xgb_reg_estate.fit(X=train_valFeat, y=train_val.iloc[:,2], eval_metric="mae", verbose=True)
    xgb_reg_ecfp.fit(X=train_val_f, y=train_val.iloc[:,2], eval_metric="mae")
    pred_estate = xgb_reg_estate.predict(X=testFeat)
    pred_ecfp = xgb_reg_ecfp.predict(X=ecfp_f_test)
    
    #pred_ak = akReg.predict(rdkit_f_test)

    # store test predictions in y_pred_test
    y_pred_test = np.mean([ pred_estate, pred_ecfp, gcn_pred ], axis=0)

    print("predictions for run #")
    print(seed)
    print(y_pred_test[0:5])
        
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

print("Prediction List:")
print(predictions_list)

generating training, validation splits...
100%|██████████| 3360/3360 [00:01<00:00, 1797.44it/s]


Step 1000 validation: mean_absolute_error=0.627711
Step 2000 validation: mean_absolute_error=0.620116
Step 3000 validation: mean_absolute_error=0.595497


generating training, validation splits...


predictions for run #
1
[2.8500712 2.25688   2.8688192 2.0571826 3.3175933]


100%|██████████| 3360/3360 [00:01<00:00, 2027.98it/s]


Step 1000 validation: mean_absolute_error=0.633926
Step 2000 validation: mean_absolute_error=0.612353
Step 3000 validation: mean_absolute_error=0.617436


generating training, validation splits...


predictions for run #
2
[2.9627287 2.7085812 3.014145  1.9563475 3.296446 ]


100%|██████████| 3360/3360 [00:01<00:00, 2173.76it/s]


Step 1000 validation: mean_absolute_error=0.614865
Step 2000 validation: mean_absolute_error=0.557529
Step 3000 validation: mean_absolute_error=0.57496


generating training, validation splits...


predictions for run #
3
[2.8751686 2.5922852 2.9563057 1.7493271 3.30706  ]


100%|██████████| 3360/3360 [00:02<00:00, 1626.36it/s]


Step 1000 validation: mean_absolute_error=0.663867
Step 2000 validation: mean_absolute_error=0.626036
Step 3000 validation: mean_absolute_error=0.624427


generating training, validation splits...


predictions for run #
4
[2.7230968 2.3265705 2.8612127 1.9468008 3.372966 ]


100%|██████████| 3360/3360 [00:01<00:00, 1869.95it/s]


Step 1000 validation: mean_absolute_error=0.583211
Step 2000 validation: mean_absolute_error=0.553703
Step 3000 validation: mean_absolute_error=0.564678
predictions for run #
5
[2.884411  2.4452553 3.0005624 1.8837522 3.2093859]
Prediction List:
[{'lipophilicity_astrazeneca': array([ 2.8500712 ,  2.25688   ,  2.8688192 ,  2.0571826 ,  3.3175933 ,
        2.7425587 ,  2.4688604 ,  2.0433614 ,  1.090197  ,  0.66331667,
        2.015417  ,  2.517454  ,  2.2209432 ,  2.6824148 ,  2.1512868 ,
        2.3148367 ,  1.7319971 ,  2.0906017 ,  2.493233  ,  3.03597   ,
        2.5614393 ,  3.2990446 ,  1.9715677 ,  1.8880835 ,  2.179107  ,
        3.3843195 ,  0.77406365,  0.8207953 ,  2.1632125 ,  1.7402911 ,
        1.025362  ,  2.2426674 ,  2.2678285 ,  2.8058999 ,  2.8793418 ,
        2.3809135 ,  2.618644  ,  3.655969  ,  2.095978  ,  1.6588866 ,
        1.3795705 ,  2.1317384 ,  1.4749085 ,  1.9406189 ,  3.3362243 ,
        2.156199  ,  3.4352028 ,  2.0529861 ,  2.6835477 ,  1.5162363 ,
   

In [4]:
for i in predictions_list:
    print(group.evaluate(i))

group.evaluate_many(predictions_list)

{'lipophilicity_astrazeneca': {'mae': 0.59}}
{'lipophilicity_astrazeneca': {'mae': 0.591}}
{'lipophilicity_astrazeneca': {'mae': 0.593}}
{'lipophilicity_astrazeneca': {'mae': 0.599}}
{'lipophilicity_astrazeneca': {'mae': 0.605}}


{'lipophilicity_astrazeneca': [0.596, 0.006]}