In [1]:
from tdc.single_pred import Tox
import deepchem
import xgboost
import rdkit
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem.EState import EState

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')

Found local copy...


In [3]:
predictions_list = []
metric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.roc_auc_score)
ecfpFeat = deepchem.deepchem.feat.CircularFingerprint(radius=4)
convMolFeat = deepchem.deepchem.feat.ConvMolFeaturizer()
xgb_reg_ecfp = xgboost.XGBClassifier()
xgb_reg_estate = xgboost.XGBClassifier()

for seed in [1, 2, 3, 4, 5]:
    benchmark = group.get('ames') 
    
    predictions = {}
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)  

    #trainMol = train.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    #validMol = valid.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    testMol = test.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))
    train_valMol = train_val.iloc[:,1].map(lambda x: Chem.MolFromSmiles(x))

    #featurize training, valid, and test data for xgboost
    #trainFeat = np.stack(np.array(trainMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    #validFeat = np.stack(np.array(validMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    testFeat = np.stack(np.array(testMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))
    train_valFeat = np.stack(np.array(train_valMol.map(lambda x: Fingerprinter.FingerprintMol(x)[1])))

    #featurize training, valid, and test data for xgboost
    #ecfp_f_train = ecfpFeat.featurize(train.iloc[:,1].to_list())
    #ecfp_f_valid = ecfpFeat.featurize(valid.iloc[:,1].to_list())
    ecfp_f_test = ecfpFeat.featurize(test.iloc[:,1].to_list())
    train_val_f = ecfpFeat.featurize(train_val.iloc[:,1].to_list())

    #featurize training, valid, and test data for the GCN
    cv_f_train = convMolFeat.featurize(train.iloc[:,1].to_list())
    cv_f_valid = convMolFeat.featurize(valid.iloc[:,1].to_list())
    cv_f_test = convMolFeat.featurize(test.iloc[:,1].to_list())

    #convert training and validation data into a deepchem dataset for the gcn
    gcn_train_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_train, y=np.array(train.iloc[:,2]), ids=np.array(train.iloc[:,1].to_list()))
    gcn_valid_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_valid, y=np.array(valid.iloc[:,2]), ids=np.array(valid.iloc[:,1].to_list()))
    gcn_test_data = deepchem.deepchem.data.NumpyDataset(X=cv_f_test, y=np.array(test.iloc[:,2]), ids=np.array(test.iloc[:,1].to_list()))

    #fit data on GCN
    reg = deepchem.deepchem.models.GraphConvModel(
        n_tasks=1, 
        dropout=.0005,
        dense_layer_size=1063,
        graph_conv_layers=[128, 128, 128],
        mode="classification",)
    callback = deepchem.deepchem.models.ValidationCallback(gcn_valid_data, 1000, metric)
    reg.fit(gcn_train_data, nb_epoch=100, callbacks=callback)

    gcn_pred = reg.predict(dataset=gcn_test_data)
    gcn_pred_processed = []
    for prediction in gcn_pred:
        gcn_pred_processed.append(prediction[0][1])

    #fit xgboost model and store np ndarray in xgb_pred
    xgb_reg_estate.fit(X=train_valFeat, y=train_val.iloc[:,2], eval_metric="auc", verbose=True)
    xgb_reg_ecfp.fit(X=train_val_f, y=train_val.iloc[:,2], eval_metric="auc")
    pred_estate = xgb_reg_estate.predict(X=testFeat)
    pred_ecfp = xgb_reg_ecfp.predict(X=ecfp_f_test)

    # store test predictions in y_pred_test
    y_pred_test = np.round(np.mean([ pred_estate, pred_ecfp, gcn_pred_processed ], axis=0))

    print("predictions for run #")
    print(seed)
    print(y_pred_test[0:5])
        
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

print("Prediction List:")
print(predictions_list)

generating training, validation splits...
100%|██████████| 5821/5821 [00:02<00:00, 2283.49it/s]


Step 1000 validation: roc_auc_score=0.874117
Step 2000 validation: roc_auc_score=0.865843
Step 3000 validation: roc_auc_score=0.85053
Step 4000 validation: roc_auc_score=0.823223
Step 5000 validation: roc_auc_score=0.821407
1457


generating training, validation splits...


1457
1457
predictions for run #
1
[1. 1. 1. 1. 1.]


100%|██████████| 5821/5821 [00:01<00:00, 3480.85it/s]


Step 1000 validation: roc_auc_score=0.795735
Step 2000 validation: roc_auc_score=0.790728
Step 3000 validation: roc_auc_score=0.799542
Step 4000 validation: roc_auc_score=0.785934
1457


generating training, validation splits...


1457
1457
predictions for run #
2
[1. 1. 1. 1. 1.]


100%|██████████| 5821/5821 [00:01<00:00, 3218.45it/s]


Step 1000 validation: roc_auc_score=0.859539
Step 2000 validation: roc_auc_score=0.822343
Step 3000 validation: roc_auc_score=0.83282
Step 4000 validation: roc_auc_score=0.838085
Step 5000 validation: roc_auc_score=0.863053
1457


generating training, validation splits...


1457
1457
predictions for run #
3
[1. 1. 1. 1. 1.]


100%|██████████| 5821/5821 [00:01<00:00, 3730.24it/s]


Step 1000 validation: roc_auc_score=0.859446
Step 2000 validation: roc_auc_score=0.841365
Step 3000 validation: roc_auc_score=0.840198
Step 4000 validation: roc_auc_score=0.838345
Step 5000 validation: roc_auc_score=0.833481
1457


generating training, validation splits...


1457
1457
predictions for run #
4
[1. 1. 1. 1. 1.]


100%|██████████| 5821/5821 [00:01<00:00, 3708.19it/s]


Step 1000 validation: roc_auc_score=0.866028
Step 2000 validation: roc_auc_score=0.865939
Step 3000 validation: roc_auc_score=0.873723
Step 4000 validation: roc_auc_score=0.840261
Step 5000 validation: roc_auc_score=0.851494
1457
1457
1457
predictions for run #
5
[1. 1. 1. 1. 1.]
Prediction List:
[{'ames': array([1., 1., 1., ..., 0., 1., 1.])}, {'ames': array([1., 1., 1., ..., 0., 1., 1.])}, {'ames': array([1., 1., 1., ..., 0., 1., 1.])}, {'ames': array([1., 1., 1., ..., 0., 1., 1.])}, {'ames': array([1., 1., 1., ..., 0., 1., 1.])}]


In [4]:
for i in predictions_list:
    print(group.evaluate(i))

group.evaluate_many(predictions_list)

{'ames': {'roc-auc': 0.766}}
{'ames': {'roc-auc': 0.781}}
{'ames': {'roc-auc': 0.77}}
{'ames': {'roc-auc': 0.775}}
{'ames': {'roc-auc': 0.775}}


{'ames': [0.773, 0.005]}