In [1]:
from tdc.single_pred import Tox
import deepchem
import xgboost
import rdkit
from rdkit import Chem
from rdkit.Chem.EState import Fingerprinter
from rdkit.Chem.EState import EState

import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

from tdc.benchmark_group import admet_group
# Handle on the TDC ADMET benchmark group; benchmark files are read from / cached under data/.
group = admet_group(path="data/")

# Stand-alone copy of the LD50_Zhu toxicity dataset, fetched through the
# single-prediction Tox loader and materialised via get_data().
data = Tox(name="LD50_Zhu")
df = data.get_data()

Found local copy...
Found local copy...
Loading...
Done!


In [17]:
# Number of randomized SMILES strings generated for every molecule.
N_SMILES_VARIANTS = 5


def featurize_smiles_variants(frame, featurizer, n_variants=N_SMILES_VARIANTS):
    """Featurize ``n_variants`` randomized SMILES for every molecule in ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Benchmark split; the SMILES are read from the ``Drug`` column and the
        regression target from the ``Y`` column.
    featurizer : object
        Featurizer exposing ``featurize(list_of_smiles)``.
    n_variants : int
        How many randomized SMILES to generate per molecule.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature rows and matching labels. The rows of one molecule are
        contiguous and in the same order as ``frame``, so the result has
        ``n_variants * len(frame)`` entries.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    smiles_variants = []
    labels = []
    for smiles, label in zip(frame["Drug"], frame["Y"]):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolToSmiles(None) would fail with an opaque Boost error; name the culprit instead.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        for _ in range(n_variants):
            smiles_variants.append(Chem.MolToSmiles(mol, doRandom=True))
            labels.append(label)

    # One batched featurize call instead of one call per SMILES string.
    features = np.asarray(featurizer.featurize(smiles_variants))
    return features, np.asarray(labels)


predictions_list = []
metric = deepchem.deepchem.metrics.Metric(deepchem.deepchem.metrics.mean_absolute_error)
featurizer = deepchem.deepchem.feat.CircularFingerprint(radius=4)

for seed in [1, 2]:
    benchmark = group.get('LD50_Zhu')

    predictions = {}
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']
    # NOTE(review): this split is not used below (the model is fit on train_val);
    # it is kept so `train` / `valid` stay available to other cells.
    train, valid = group.get_train_valid_split(benchmark=name, split_type='default', seed=seed)

    # Augment SMILES.
    # NOTE(review): the recorded run printed five identical predictions for the
    # first test molecule, which suggests the circular fingerprint does not
    # change across randomized SMILES of the same molecule -- TODO confirm
    # whether this augmentation adds any information with this featurizer.
    trainAugX, trainAugY = featurize_smiles_variants(train_val, featurizer)
    testAugX, testAugY = featurize_smiles_variants(test, featurizer)

    # Fresh model per run, tied to the run's seed. `eval_metric` is configured
    # on the estimator because passing it to fit() is deprecated in recent
    # XGBoost releases.
    model = xgboost.XGBRegressor(eval_metric="mae", random_state=seed)
    model.fit(X=trainAugX, y=trainAugY, verbose=True)
    # Only publish a fully fitted model under the name later cells rely on.
    xgb_reg = model
    y_pred_test = xgb_reg.predict(X=testAugX)

    print("predictions for run #")
    print(seed)
    print(y_pred_test[0:5])

    # One prediction per SMILES variant, i.e. N_SMILES_VARIANTS entries per test molecule.
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

print("Prediction List:")
print(predictions_list)

generating training, validation splits...
100%|██████████| 5907/5907 [00:01<00:00, 3935.46it/s]
generating training, validation splits...


predictions for run #
1
[2.276333 2.276333 2.276333 2.276333 2.276333]


100%|██████████| 5907/5907 [00:01<00:00, 4141.51it/s]


KeyboardInterrupt: 

In [18]:
# Collapse the per-variant predictions of the first run to one value per test
# molecule and score them with the benchmark group.
n_variants = 5  # must match the number of randomized SMILES generated per molecule
run_predictions = predictions_list[0]
raw_predictions = np.asarray(run_predictions['ld50_zhu'])
n_test_molecules = len(group.get('LD50_Zhu')['test'])

if raw_predictions.shape[0] == n_variants * n_test_molecules:
    # Variants of one molecule are contiguous, so each reshaped row is one molecule.
    run_predictions['ld50_zhu'] = raw_predictions.reshape(-1, n_variants).mean(axis=1)
elif raw_predictions.shape[0] != n_test_molecules:
    raise ValueError(
        f"expected {n_variants * n_test_molecules} per-variant or "
        f"{n_test_molecules} averaged predictions, got {raw_predictions.shape[0]}"
    )
# Otherwise the predictions were already averaged by an earlier execution of
# this cell; averaging again would corrupt them, so they are left untouched.

group.evaluate(run_predictions)

{'ld50_zhu': {'mae': 0.663}}

In [16]:
# Inspect the predictions stored for the first run (NumPy elides the middle of long arrays).
predictions_list[0]["ld50_zhu"]

array([2.276333, 2.276333, 2.276333, ..., 2.30654 , 2.30654 , 2.30654 ],
      dtype=float32)

In [40]:
# Sanity check on a single hand-written molecule: featurize one SMILES string
# and ask the fitted model for its prediction.
bleach_smiles = "[O-]Cl.[Na+]"
bleachCanonical = featurizer.featurize(bleach_smiles)[0]

# predict() receives a one-row batch; .tolist() shows the result as a plain Python list.
bleach_prediction = xgb_reg.predict(X=[bleachCanonical])
bleach_prediction.tolist()

[2.2600696086883545]