In [4]:
import os
import pandas as pd
import numpy as np

from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/') #this will create a new folder under /notebooks. Please delete after use

DATAPATH = "../data"
PREDPATH = "../predictions"

Downloading Benchmark Group...
100%|██████████| 1.47M/1.47M [00:00<00:00, 1.51MiB/s]
Extracting zip file...
Done!


## TABPFN Classifier

TabPFN is prepared to deal with classifications only. We have used the classification tasks from the ADMET benchmark to validate the ensemble method.
The 5-fold train, validation and test splits are saved for reproducibility. One of the models is also saved and incorporated in the Ersilia Model Hub.

In [2]:
admet_datasets = [#"bioavailability_ma",
                  #"hia_hou",
                  #"pgp_broccatelli",
                  "bbb_martins",
                  "cyp2c9_veith",
                  "cyp2d6_veith",
                  "cyp3a4_veith",
                  "cyp2c9_substrate_carbonmangels",
                  "cyp2d6_substrate_carbonmangels",
                  "cyp3a4_substrate_carbonmangels",
                  "herg",
                  "ames",
                  "dili"]

In [16]:
for a in admet_datasets:
    for seed in [1, 2, 3, 4, 5]:
        benchmark = group.get(a) 
        predictions = {}
        name = benchmark['name']
        train_val, test = benchmark['train_val'], benchmark['test']
        train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)
        test.to_csv(os.path.join(DATAPATH, "tdc", "{}_test.csv".format(name)), index=False)
        valid.to_csv(os.path.join(DATAPATH, "tdc", "{}_valid_{}.csv".format(name, seed)), index=False)
        train.to_csv(os.path.join(DATAPATH, "tdc", "{}_train_{}.csv".format(name, seed)), index=False)

generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 1498.97it/s]
generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 1564.64it/s]
generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 1623.11it/s]
generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 1670.06it/s]
generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 1630.73it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 1652.64it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 1681.68it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 1738.50it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 1236.80it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 1667.70it/s]
generating training, validation splits...
100%|██████████| 9

In [6]:
from eosce.models import ErsiliaCompoundEmbeddings
from ensemble_tabpfn import EnsembleTabPFN
import joblib
MODELPATH = "../models"

descriptor = ErsiliaCompoundEmbeddings()
clf = EnsembleTabPFN()

for a in admet_datasets:
    print(a)
    for seed in [1, 2, 3, 4, 5]:
        benchmark = group.get(a)
        name = benchmark['name']
        predictions = {}
        test = pd.read_csv(os.path.join(DATAPATH, "tdc", "{}_test.csv".format(name)))
        valid = pd.read_csv(os.path.join(DATAPATH, "tdc", "{}_valid_{}.csv".format(name, seed)))
        train = pd.read_csv(os.path.join(DATAPATH, "tdc", "{}_train_{}.csv".format(name, seed)))
        #transform drugs to Ersilia Compound Embeddings
        train_transformed = descriptor.transform(train["Drug"])
        valid_transformed = descriptor.transform(valid["Drug"])
        test_transformed = descriptor.transform(test["Drug"])
        #fit ensemble tabpfn on the train set
        clf.fit(train_transformed, train["Y"])
        if seed == 1:
            joblib.dump(clf, os.path.join(MODELPATH, "{}_{}.joblib".format(name,seed))) #save one model
        #predict and save probabilities
        y_pred_valid = clf.predict_proba(valid_transformed)
        y_pred_test = clf.predict_proba(test_transformed)
        valid["proba1"] = y_pred_valid
        valid["bin_pred"]=[0 if x < 0.5 else 1 for x in y_pred_valid]
        valid.to_csv(os.path.join(DATAPATH, "tdc_preds", "{}_valid_{}.csv".format(name,seed)), index=False)
        test["proba1"]= y_pred_test
        test["bin_pred"]=[0 if x < 0.5 else 1 for x in y_pred_test]
        test.to_csv(os.path.join(DATAPATH, "tdc_preds", "{}_test_{}.csv".format(name,seed)), index=False)

bbb_martins




Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters




Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters




Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters




Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters




Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
cyp2c9_veith
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


KeyboardInterrupt: 

In [19]:
#append predictions to list for evaluation
predictions_list = []

for a in admet_datasets:
    for seed in [1, 2, 3, 4, 5]:
        test = pd.read_csv(os.path.join(DATAPATH, "tdc_preds", "{}_test_{}.csv".format(name,seed)))
        #append predictions to predictions list
        predictions[name] = test["proba1"]
        predictions_list.append(predictions)

results = group.evaluate_many(predictions_list)

[{'bioavailability_ma': array([0.836, 0.889, 0.662, 0.887, 0.872, 0.931, 0.978, 0.817, 0.806,
         0.589, 0.947, 0.772, 0.936, 0.88 , 0.944, 0.951, 0.978, 0.607,
         0.644, 0.835, 0.758, 0.658, 0.442, 0.55 , 0.654, 0.78 , 0.936,
         0.926, 0.963, 0.935, 0.937, 0.943, 0.925, 0.958, 0.944, 0.818,
         0.624, 0.954, 0.807, 0.797, 0.953, 0.869, 0.916, 0.939, 0.938,
         0.93 , 0.924, 0.796, 0.796, 0.414, 0.734, 0.835, 0.512, 0.621,
         0.587, 0.565, 0.621, 0.903, 0.907, 0.839, 0.937, 0.962, 0.752,
         0.476, 0.963, 0.9  , 0.73 , 0.846, 0.95 , 0.468, 0.935, 0.832,
         0.746, 0.94 , 0.736, 0.756, 0.855, 0.878, 0.886, 0.525, 0.833,
         0.889, 0.885, 0.9  , 0.916, 0.85 , 0.966, 0.419, 0.948, 0.969,
         0.928, 0.976, 0.964, 0.981, 0.852, 0.953, 0.585, 0.593, 0.585,
         0.865, 0.784, 0.471, 0.519, 0.772, 0.488, 0.608, 0.362, 0.626,
         0.947, 0.865, 0.827, 0.912, 0.827, 0.921, 0.938, 0.912, 0.963,
         0.811, 0.926, 0.979, 0.913, 0.407