In [1]:
import pandas as pd 
from autogluon.tabular import TabularPredictor

from superpac. base import get_split_mmp_indices

### Load and split data

In [4]:
# Read the full MACCS-encoded dataset; its row count is handed to the split helper.
full_df = pd.read_csv('./MACCS_keys/MACCS_encoding.csv')
ds_size = len(full_df)

# Positional row indices for the three subsets, produced by the project helper.
# NOTE(review): the exact meaning of zero/one/two "out" is defined in
# superpac.base and is not visible here - confirm before relying on it.
zero_out, one_out, two_out = get_split_mmp_indices("./index sets for train test split", ds_size)

zero_out_df = full_df.iloc[zero_out]
one_out_df = full_df.iloc[one_out]
two_out_df = full_df.iloc[two_out]

# 'pKi_diff' is removed from every subset so it is never offered as a feature.
# The zero-out subset is used for training and the two-out subset for testing.
train_data = zero_out_df.drop(columns='pKi_diff')
one_data = one_out_df.drop(columns='pKi_diff')
test_data = two_out_df.drop(columns='pKi_diff')

# Name of the target column.
label = 'is_AC'

# Targets for each subset.
y_train, y_test, y_one = train_data[label], test_data[label], one_data[label]

# Feature-only frames (target column removed).
X_train = train_data.drop(columns=label)
X_test = test_data.drop(columns=label)
X_one = one_data.drop(columns=label)


## Training classifiers

AutoGluon's `TabularPredictor` trains neural networks, random forests, extra-trees and CatBoost models, together with multi-layer stacked ensembles of them.

In [5]:
# Directory the trained models are written to. The pruning cell further down
# reloads the predictor from "./agModel_class_frag" (and the recorded training
# log reports that same directory), so training must save there; otherwise a
# fresh Restart & Run All would load a stale or missing predictor.
save_path = 'agModel_class_frag'
metric = 'roc_auc'
time_limit = 12000  # total training budget, in seconds

# train_data still contains the label column, which fit() requires.
predictor = TabularPredictor(label, eval_metric=metric, path=save_path)
predictor.fit(train_data, time_limit=time_limit, presets='best_quality')

Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 12000s
AutoGluon will save models to "agModel_class_frag/"
AutoGluon Version:  0.4.0
Python Version:     3.8.3
Operating System:   Darwin
Train Data Rows:    7089
Train Data Columns: 498
Label Column: is_AC
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    1567.4 MB
	Train Data (Original)  Memory Usage: 28.24 MB (1.8% of available memory)
	Inferring data type of each feature based on column values. Set feature_met

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.804382,0.946709,31.079878,55.180279,8659.492607,0.004669,0.002698,5.465445,3,True,16
1,ExtraTreesGini_BAG_L1,0.804141,0.930281,0.144838,5.134617,4.620638,0.144838,5.134617,4.620638,1,True,4
2,RandomForestGini_BAG_L1,0.804091,0.928731,0.145879,5.900046,5.553252,0.145879,5.900046,5.553252,1,True,1
3,CatBoost_BAG_L2,0.8038,0.942899,30.660631,48.156436,8644.423369,2.068826,3.083843,1088.203782,2,True,11
4,RandomForestGini_BAG_L2,0.802915,0.939651,28.722749,46.911212,7559.592459,0.130944,1.838619,3.372872,2,True,9
5,ExtraTreesGini_BAG_L2,0.802626,0.94287,28.733588,47.99888,7559.495111,0.141783,2.926286,3.275524,2,True,12
6,RandomForestEntr_BAG_L1,0.802501,0.930208,0.15259,4.488874,6.457156,0.15259,4.488874,6.457156,1,True,2
7,ExtraTreesEntr_BAG_L2,0.800814,0.944164,28.732929,47.189766,7559.440233,0.141124,2.117173,3.220646,2,True,13
8,ExtraTreesEntr_BAG_L1,0.800761,0.928983,0.19618,2.722427,3.976683,0.19618,2.722427,3.976683,1,True,5
9,RandomForestEntr_BAG_L2,0.799363,0.942796,28.723475,47.050279,7559.32721,0.13167,1.977686,3.107623,2,True,10


### Pruning the predictor of unwanted models:

In [9]:
# Reload the trained predictor from disk and permanently delete every model
# that is not listed below (dry_run=False actually removes the files).
predictor = TabularPredictor.load("./agModel_class_frag")
predictor.delete_models(
    models_to_keep=[
        "CatBoost_BAG_L1",
        "CatBoost_BAG_L2",
        "ExtraTreesGini_BAG_L1",
        "ExtraTreesEntr_BAG_L1",
        "ExtraTreesGini_BAG_L2",
        "ExtraTreesEntr_BAG_L2",
        "RandomForestEntr_BAG_L1",
        "RandomForestGini_BAG_L1",
        "RandomForestEntr_BAG_L2",
        "RandomForestGini_BAG_L2",
    ],
    dry_run=False,
)

Deleting model WeightedEnsemble_L2. All files under ./agModel_class_frag/models/WeightedEnsemble_L2/ will be removed.
Deleting model NeuralNetFastAI_BAG_L2. All files under ./agModel_class_frag/models/NeuralNetFastAI_BAG_L2/ will be removed.
Deleting model NeuralNetTorch_BAG_L2. All files under ./agModel_class_frag/models/NeuralNetTorch_BAG_L2/ will be removed.
Deleting model WeightedEnsemble_L3. All files under ./agModel_class_frag/models/WeightedEnsemble_L3/ will be removed.


In [10]:
# Summarise the models that remain in the predictor and render the summary plot.
results = predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     ExtraTreesEntr_BAG_L2   0.944164      47.189766  7559.440233                2.117173           3.220646            2       True         12
1           CatBoost_BAG_L2   0.942899      48.156436  8644.423369                3.083843        1088.203782            2       True         10
2     ExtraTreesGini_BAG_L2   0.942870      47.998880  7559.495111                2.926286           3.275524            2       True         11
3   RandomForestEntr_BAG_L2   0.942796      47.050279  7559.327210                1.977686           3.107623            2       True          9
4   RandomForestGini_BAG_L2   0.939651      46.911212  7559.592459                1.838619           3.372872            2       True          8
5           CatBoost_BAG_L1   0.934421       4.813488  2575.145343  

### Quick evaluation on the test set:

In [11]:

# Predicted class probabilities for the test features (label column excluded).
y_pred = predictor.predict_proba(X_test)

# Score the probabilities against the true test labels; auxiliary_metrics=True
# reports additional metrics alongside the predictor's eval metric.
performance = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

Evaluation: roc_auc on test data: 0.7579628137809655
Evaluations on test data:
{
    "roc_auc": 0.7579628137809655,
    "accuracy": 0.8850509626274066,
    "balanced_accuracy": 0.6068383949167403,
    "mcc": 0.2857391549155361,
    "f1": 0.3255813953488372,
    "precision": 0.47115384615384615,
    "recall": 0.24873096446700507
}
