In [None]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from utils import modify_metadata, TARGETS
import torch
from sklearn.model_selection import GroupKFold
from sklearn.metrics import classification_report
import os
from ax.service.managed_loop import optimize
import cupy as cp
import xgboost as xgb

In [None]:
# Locations of the training metadata CSV and of the precomputed signature files.
TRAIN_METADATA_DIR = "../../data/train.csv"
TRAIN_SIGNATURES_DIR = "../../data/train_signatures/"

# Loss used to score each CV fold; CV_score feeds it log-probabilities.
criterion = torch.nn.KLDivLoss(reduction='batchmean')

# Read the raw metadata and pass it straight through the project helper.
# NOTE(review): what modify_metadata changes is not visible here — see utils.
metadata = modify_metadata(pd.read_csv(TRAIN_METADATA_DIR))

In [None]:
# Display the processed metadata frame for a quick visual check.
metadata

In [None]:
# Both values are interpolated into the signature file name that is loaded
# next, so together they select which precomputed feature file is used.
signature_level = 4  # presumably the signature truncation level — confirm against the generating script
scaler_type = "meanvarPerChannel_1.0"

In [None]:
# Path of the precomputed signature features for the chosen level and scaler.
TRAIN_SIGNATURES_FILE = f"{TRAIN_SIGNATURES_DIR}all_sigs_lvl_{signature_level}_scaler_{scaler_type}_experts.pt"
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
signature_features = torch.load(TRAIN_SIGNATURES_FILE)

In [None]:
# Select from the metadata frame whatever TARGETS (imported from utils) names.
targets = metadata[TARGETS]

In [None]:
# Convert the target selection to a NumPy array. It is built directly from
# `metadata` rather than from the previous value of `targets`, so re-running
# this cell does not fail on an already-converted array.
targets = metadata[TARGETS].to_numpy()

In [None]:
# Flatten each sample's signature into a single row of a 2-D NumPy array.
# detach()/cpu() leave a plain CPU tensor unchanged, but keep .numpy() from
# raising if the saved tensor lives on a GPU or tracks gradients.
features = signature_features.detach().cpu().reshape(signature_features.shape[0], -1).numpy()

In [None]:
# Search space handed to Ax: one "range" parameter per XGBoost hyperparameter,
# written as name -> [lower bound, upper bound]. Integer bounds stay integers
# and float bounds stay floats.
SEARCH_BOUNDS = {
    "n_estimators": [1, 1000],
    "max_depth": [3, 8],
    "min_child_weight": [1, 10],
    "gamma": [0., 0.5],
    "subsample": [0.5, 1.],
    "colsample_bytree": [0.5, 1.],
    "eta": [0.01, 0.3],
    "lambda": [0., 100.],
    "alpha": [0., 100.],
}
parameters = [
    {"name": name, "type": "range", "bounds": bounds}
    for name, bounds in SEARCH_BOUNDS.items()
]

In [None]:
def CV_score(parameters, save_models = False):
    """Run 5-fold grouped cross-validation of an XGBoost classifier.

    Folds come from ``GroupKFold`` with ``metadata.patient_id`` as the group
    key. In every fold the classifier is fitted on the argmax of the training
    target rows and scored on the validation rows with the module-level
    ``criterion``, applied to the log of the predicted probabilities.

    Args:
        parameters: dict of hyperparameters keyed by the search-space names
            (``n_estimators``, ``max_depth``, ``min_child_weight``, ``gamma``,
            ``subsample``, ``colsample_bytree``, ``eta``, ``lambda``,
            ``alpha``). A missing key is passed to the classifier as ``None``.
        save_models: when True, each fold's fitted model is saved as
            ``model_logs/xgboost_<parameter values>/model_<fold>.json``.

    Returns:
        list of float: the validation loss of each fold, in fold order.
    """
    print(parameters)

    # Search-space names -> XGBClassifier keyword names.
    model_kwargs = {
        "n_estimators": parameters.get("n_estimators"),
        "max_depth": parameters.get("max_depth"),
        "min_child_weight": parameters.get("min_child_weight"),
        "gamma": parameters.get("gamma"),
        "subsample": parameters.get("subsample"),
        "colsample_bytree": parameters.get("colsample_bytree"),
        "eta": parameters.get("eta"),
        "reg_lambda": parameters.get("lambda"),
        "reg_alpha": parameters.get("alpha"),
    }

    # The output directory depends only on the hyperparameters, so it is
    # resolved (and created) once instead of once per fold.
    dir_name = None
    if save_models:
        dir_name = f"model_logs/xgboost_{parameters.get('n_estimators')}_{parameters.get('max_depth')}_{parameters.get('min_child_weight')}_{parameters.get('gamma')}_{parameters.get('subsample')}_{parameters.get('colsample_bytree')}_{parameters.get('eta')}_{parameters.get('lambda')}_{parameters.get('alpha')}"
        os.makedirs(dir_name, exist_ok=True)

    # Smallest probability allowed into the log. An exact 0 would produce
    # -inf, which turns the KL loss into inf/NaN and poisons the fold mean.
    prob_floor = 1e-15

    gkf = GroupKFold(n_splits=5)
    scores = []
    for i, (train_index, valid_index) in enumerate(gkf.split(metadata, metadata.target, metadata.patient_id)):
        X_train, X_valid = np.take(features, train_index, axis=0), np.take(features, valid_index, axis=0)
        y_train, y_valid = np.take(targets, train_index, axis=0), np.take(targets, valid_index, axis=0)
        # The classifier is trained on hard labels: the index of the largest
        # entry in each target row.
        y_train_argmax = np.argmax(y_train, axis=1)

        clf = xgb.XGBClassifier(tree_method = "hist", device = "cuda", **model_kwargs)
        # Inputs are handed over as cupy (GPU) arrays to go with device="cuda".
        clf.fit(cp.array(X_train), cp.array(y_train_argmax))
        y_pred = clf.predict_proba(cp.array(X_valid))

        # criterion expects log-probabilities as its first argument.
        log_probs = torch.log(torch.tensor(y_pred).clamp(min=prob_floor))
        loss = criterion(log_probs, torch.tensor(y_valid))
        print(f"Fold {i} loss: {loss.item()}")
        scores.append(loss.item())

        if save_models:
            clf.save_model(f"{dir_name}/model_{i}.json")
    return scores

In [None]:
# CSV log that collects one row per evaluated hyperparameter set.
log_file = "hyperparameter_search_log.csv"
try:
    # Mode "x" creates the file only when it does not exist yet, so the header
    # is written once and an existing log is left untouched.
    with open(log_file, "x") as new_log:
        new_log.write("n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,eta,lambda,alpha,loss\n")
except FileExistsError:
    # The log already exists from an earlier run; keep appending to it.
    pass

In [None]:
def eval_function(parameters):
    """Objective passed to the Ax optimization loop.

    Cross-validates ``parameters`` with ``CV_score``, appends the parameter
    values and the mean fold loss as one CSV row to ``log_file``, and returns
    the negated mean loss.
    """
    mean_loss = np.mean(CV_score(parameters))
    # Append mode keeps the rows of earlier trials in the log.
    with open(log_file, "a") as log:
        log.write(f"{parameters.get('n_estimators')},{parameters.get('max_depth')},{parameters.get('min_child_weight')},{parameters.get('gamma')},{parameters.get('subsample')},{parameters.get('colsample_bytree')},{parameters.get('eta')},{parameters.get('lambda')},{parameters.get('alpha')},{mean_loss}\n")
    return -mean_loss

In [None]:
#eval_function({'n_estimators': 510, 'max_depth': 4, 'min_child_weight': 9, 'gamma': 0.06360600612445994, 'subsample': 0.7326052348306491, 'colsample_bytree': 0.7798921755137922, 'eta': 0.01, 'lambda': 0.5856295363545664, 'alpha': 0.4046439680543726})

In [None]:
# Run the Ax search over the hyperparameter space.
# optimize() maximizes the objective unless minimize=True, which is why
# eval_function returns the negated mean loss; minimize=False is spelled out
# here so that sign convention is visible.
# random_seed makes the search's quasi-random initial trials repeatable across
# runs (the later model-based trials may still vary).
best_parameters, values, experiment, model = optimize(
    parameters=parameters,
    evaluation_function=eval_function,
    objective_name='CV_score',
    minimize=False,
    total_trials=20,
    random_seed=0,
)

In [None]:
# Show the best hyperparameter set returned by the search.
print(best_parameters)

In [None]:
# Re-run the cross-validation with a fixed hyperparameter set and save each
# fold's model.
# NOTE(review): these values are hard-coded rather than read from
# `best_parameters` — confirm they are the intended final configuration.
final_parameters = {
    'n_estimators': 510,
    'max_depth': 4,
    'min_child_weight': 9,
    'gamma': 0.06360600612445994,
    'subsample': 0.7326052348306491,
    'colsample_bytree': 0.7798921755137922,
    'eta': 0.01,
    'lambda': 0.5856295363545664,
    'alpha': 0.4046439680543726,
}
CV_score(final_parameters, save_models = True)