In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
from joblib import dump

In [5]:
from my_krml_adv_mla_2023.data.sets import load_sets

In [6]:
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path="../data/processed/")

In [7]:
import xgboost as xgb

In [8]:
xgboost1 = xgb.XGBClassifier()

In [9]:
from my_krml_adv_mla_2023.models.performance import fit_assess_classifier

In [10]:
xgboost1 = fit_assess_classifier(xgboost1, X_train, y_train, X_val, y_val)

Accuracy Training: 0.9239609230015057
F1 Training: 0.9236854561126538
Accuracy Validation: 0.9070568709426668
F1 Validation: 0.9065289905847367


In [11]:
from joblib import dump
dump(xgboost1, "../models/xgboost_default.joblib")

['../models/xgboost_default.joblib']

In [12]:
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
rstate = np.random.default_rng(42)

In [13]:
space = {
    'max_depth' : hp.choice('max_depth', range(3, 15, 1)),
    'min_child_weight' : hp.choice('min_child_weight', range(3, 15, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.4, 0.05),
    'subsample' : hp.quniform('subsample', 0.7, 1, 0.05),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.2, 0.5, 0.05)
}

In [14]:
def objective(space):
    from sklearn.model_selection import cross_val_score

    xgboost = xgb.XGBClassifier(
        max_depth = int(space['max_depth']),
        learning_rate = space['learning_rate'],
        min_child_weight = space['min_child_weight'],
        subsample = space['subsample'],
        colsample_bytree = space['colsample_bytree']
    )

    acc = cross_val_score(xgboost, X_train, y_train, cv=5, scoring="accuracy").mean()

    return{'loss': 1-acc, 'status': STATUS_OK }

In [15]:
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=3,
    rstate=rstate
)

100%|██████████| 3/3 [02:21<00:00, 47.17s/trial, best loss: 0.09912111768619347]


In [16]:
print("Best: ", best)

Best:  {'colsample_bytree': 0.4, 'learning_rate': 0.35000000000000003, 'max_depth': 7, 'min_child_weight': 7, 'subsample': 0.9500000000000001}


In [17]:
xgboost2 = xgb.XGBClassifier(
    max_depth = best['max_depth'],
    learning_rate = best['learning_rate'],
    min_child_weight = best['min_child_weight'],
    subsample = best['subsample'],
    colsample_bytree = best['colsample_bytree']
)

In [18]:
xgboost2 = fit_assess_classifier(xgboost2, X_train, y_train, X_val, y_val)

Accuracy Training: 0.927770580202388
F1 Training: 0.9275307642134867
Accuracy Validation: 0.9053971722095002
F1 Validation: 0.9049096070290497


In [19]:
dump(xgboost2,  '../models/xgboost_best.joblib')

['../models/xgboost_best.joblib']