# Development of the AutoML training workflow

In [29]:
import argparse
import logging
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.model_selection import KFold, StratifiedKFold

In [30]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# Make the directory two levels above this notebook importable.
# NOTE(review): presumably this is the repository root that contains the
# `refract` package imported in the next cell — confirm.
sys.path.append("../..")

In [32]:
from refract.trainers import AutoMLTrainer

In [33]:
# Input/output locations for this run.
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# run elsewhere without editing them.
# Pickled feature table; loaded below and indexed by its "ccle_name" column.
feature_path = "/scratch/users/nphill22/projects/corsello_lab/20240313_prism_final_reruns/new_baseline/processed_data/x-all.pkl"
# Response CSV; read below with pandas and expected to have "ccle_name" and "culture" columns.
response_path = "/scratch/users/nphill22/projects/corsello_lab/20240313_prism_final_reruns/data/features/responses/amg-232_2.5.csv"
# Output directory name; not referenced by any of the cells visible here.
output_dir = "automl_test"

In [34]:
# Configure the root logger to emit INFO and above, then create the logger
# used by the cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [35]:
# Load the pickled feature table, index it by cell-line name, and replace
# missing values with the sentinel -1.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
logger.info("Loading feature data...")
with open(feature_path, "rb") as f:
    feature_df = pickle.load(f).set_index("ccle_name").fillna(-1)


INFO:__main__:Loading feature data...


In [36]:
logger.info("Loading response data...")

# Cell lines that have a feature row; responses for any other line are unusable.
available_ccle_names = set(feature_df.index)

response_df = (
    pd.read_csv(response_path)
    # only keep cell lines we have features for
    .loc[lambda frame: frame["ccle_name"].isin(available_ccle_names)]
    # drop culture column
    .drop(columns=["culture"])
    # one row per cell line: keep the first occurrence of each ccle_name
    .drop_duplicates(subset=["ccle_name"], keep="first")
)


INFO:__main__:Loading response data...


In [37]:
# START CV TRAIN
# 4-fold cross-validation over the rows of response_df, shuffled with a fixed
# seed so the splits are repeatable.
# NOTE(review): the splitter is a plain KFold even though the variable is
# named `skf`; StratifiedKFold is imported but not used here.
skf = KFold(n_splits=4, shuffle=True, random_state=42)
# Fitted trainer for every fold, in fold order; inspected by later cells.
trainers = []
for i, (train_index, test_index) in enumerate(skf.split(response_df)):
    logger.info(f"Training fold {i}")
    # Select each split's rows by position and renumber them from 0.
    response_train = response_df.iloc[train_index, :].reset_index(drop=True).copy()
    response_test = response_df.iloc[test_index, :].reset_index(drop=True).copy()

    # train one fold
    # Every fold receives the full feature_df together with its own response split.
    trainer = AutoMLTrainer(
        response_train=response_train,
        response_test=response_test,
        feature_df=feature_df,
    )
    trainer.train()
    trainers.append(trainer)
    ### END CV TRAIN
# After the loop, `trainer` remains bound to the last fold's trainer.

INFO:__main__:Training fold 0


[flaml.automl.logger: 04-24 15:52:40] {1680} INFO - task = regression
[flaml.automl.logger: 04-24 15:52:40] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 04-24 15:52:40] {1789} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 04-24 15:52:40] {1901} INFO - List of ML learners in AutoML Run: ['xgboost', 'rf', 'lgbm']
[flaml.automl.logger: 04-24 15:52:40] {2219} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 04-24 15:52:40] {2345} INFO - Estimated sufficient time budget=171s. Estimated necessary time budget=0s.
[flaml.automl.logger: 04-24 15:52:40] {2392} INFO -  at 0.0s,	estimator xgboost's best error=0.8830,	best estimator xgboost's best error=0.8830
[flaml.automl.logger: 04-24 15:52:40] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-24 15:52:40] {2392} INFO -  at 0.0s,	estimator lgbm's best error=0.9129,	best estimator xgboost's best error=0.8830
[flaml.automl.logger: 04-24 15:52:40] {2219} INFO - iteration 2, curr

INFO:__main__:Training fold 1


Fold correlation: 0.7719606684089517
[flaml.automl.logger: 04-24 15:53:46] {1680} INFO - task = regression
[flaml.automl.logger: 04-24 15:53:46] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 04-24 15:53:46] {1789} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 04-24 15:53:46] {1901} INFO - List of ML learners in AutoML Run: ['xgboost', 'rf', 'lgbm']
[flaml.automl.logger: 04-24 15:53:46] {2219} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 04-24 15:53:46] {2345} INFO - Estimated sufficient time budget=242s. Estimated necessary time budget=0s.
[flaml.automl.logger: 04-24 15:53:46] {2392} INFO -  at 0.0s,	estimator xgboost's best error=0.7601,	best estimator xgboost's best error=0.7601
[flaml.automl.logger: 04-24 15:53:46] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-24 15:53:46] {2392} INFO -  at 0.1s,	estimator lgbm's best error=0.7665,	best estimator xgboost's best error=0.7601
[flaml.automl.logger: 04-24 15:5

INFO:__main__:Training fold 2


Fold correlation: 0.7699963720305474
[flaml.automl.logger: 04-24 15:54:51] {1680} INFO - task = regression
[flaml.automl.logger: 04-24 15:54:51] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 04-24 15:54:51] {1789} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 04-24 15:54:51] {1901} INFO - List of ML learners in AutoML Run: ['xgboost', 'rf', 'lgbm']
[flaml.automl.logger: 04-24 15:54:51] {2219} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 04-24 15:54:51] {2345} INFO - Estimated sufficient time budget=225s. Estimated necessary time budget=0s.
[flaml.automl.logger: 04-24 15:54:51] {2392} INFO -  at 0.0s,	estimator xgboost's best error=0.9674,	best estimator xgboost's best error=0.9674
[flaml.automl.logger: 04-24 15:54:51] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-24 15:54:51] {2392} INFO -  at 0.1s,	estimator lgbm's best error=0.9839,	best estimator xgboost's best error=0.9674
[flaml.automl.logger: 04-24 15:5

INFO:__main__:Training fold 3


Fold correlation: 0.6790206508445278
[flaml.automl.logger: 04-24 15:55:59] {1680} INFO - task = regression
[flaml.automl.logger: 04-24 15:55:59] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 04-24 15:55:59] {1789} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 04-24 15:55:59] {1901} INFO - List of ML learners in AutoML Run: ['xgboost', 'rf', 'lgbm']
[flaml.automl.logger: 04-24 15:55:59] {2219} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 04-24 15:55:59] {2345} INFO - Estimated sufficient time budget=176s. Estimated necessary time budget=0s.
[flaml.automl.logger: 04-24 15:55:59] {2392} INFO -  at 0.0s,	estimator xgboost's best error=0.9051,	best estimator xgboost's best error=0.9051
[flaml.automl.logger: 04-24 15:55:59] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-24 15:56:00] {2392} INFO -  at 0.1s,	estimator lgbm's best error=0.9128,	best estimator xgboost's best error=0.9051
[flaml.automl.logger: 04-24 15:5

In [39]:
# get the top model types
# Record the learner chosen as best in each fold, in fold order.
# NOTE(review): assumes "top" means each fold's `automl.best_estimator`.
top_model_types = []
for trainer in trainers:
    top_model_types.append(trainer.automl.best_estimator)
    

In [45]:
# Training time of the best configuration for a single trainer.
# `trainer` is the loop variable left over from the cells above, so when the
# notebook is run in order this is the last fold's trainer.
trainer.automl.best_config_train_time

0.1323850154876709

In [52]:
def get_training_details(trainers):
    """Summarise the best model found by each trainer in a single table.

    Parameters
    ----------
    trainers : iterable
        Trainer objects, each exposing an ``automl`` attribute with
        ``best_estimator``, ``best_config`` and ``best_config_train_time``.

    Returns
    -------
    pandas.DataFrame
        One row per trainer, in input order. The columns are ``model_name``
        and ``train_time`` followed by one column per hyperparameter found in
        any trainer's ``best_config``; hyperparameters missing from a given
        trainer's config are NaN in that row. An empty input yields an empty
        DataFrame.
    """
    records = []
    for trainer in trainers:
        automl = trainer.automl
        # A trainer without a best configuration (best_config is None) would
        # make the ``**`` unpacking below raise TypeError; treat it as having
        # no hyperparameters instead.
        config = automl.best_config or {}
        records.append(
            {
                "model_name": automl.best_estimator,
                "train_time": automl.best_config_train_time,
                **config,
            }
        )
    # Build the frame once from the list of records rather than growing it row by row.
    return pd.DataFrame(records)

In [53]:
# Collect the best-model summary of every fold's trainer into one table.
trainer_df = get_training_details(trainers)

In [54]:
# One row per fold. Hyperparameter columns that are not part of a fold's best
# configuration show up as NaN, because different learners expose different
# hyperparameters.
trainer_df

Unnamed: 0,model_name,train_time,n_estimators,max_leaves,min_child_weight,learning_rate,subsample,colsample_bylevel,colsample_bytree,reg_alpha,reg_lambda,num_leaves,min_child_samples,log_max_bin,max_features
0,xgboost,0.071234,9,9.0,15.406144,0.174321,0.719262,0.618166,0.998664,0.012227,0.053691,,,,
1,lgbm,0.031666,5,,,0.404427,,,0.88561,0.003348,0.16647,7.0,11.0,8.0,
2,xgboost,1.097086,4,10.0,0.554261,0.403417,0.790256,0.82626,0.90325,0.000977,0.263878,,,,
3,rf,0.132385,5,23.0,,,,,,,,,,,0.690975


In [None]:
# trial run