In [1]:
# Move the kernel's working directory up one level so the relative paths used
# below ("data", "trials/...", "info-test") resolve from there.
# NOTE: the path is relative, so re-running this cell moves up one more level;
# run it exactly once per kernel session.
%cd ../

/notebooks/mt-tabnet-xgboost-model-comparison


In [4]:
import argparse
from functools import partial
import json
import os
import pickle
from model_trainer.xgboost.xgboost_trainer import XGBoostTrainer
from model_trainer.data.process_only_data_box import ProccessOnlyDataBox
from model_trainer.data import data_loader
from tuning_config import spaces, base_class_weights_large
import data_config
from hyperopt import space_eval
import numpy as np
from xgboost.callback import EarlyStopping
# TODO: import this
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score, confusion_matrix
import pandas as pd

def process_params(params):
    """Split tuned hyperparameters into model parameters and class weights.

    The 'cw_modifier' entry is not a model parameter: it scales the weight of
    class index 1 in ``base_class_weights_large``. It is removed from the
    returned parameter dict and applied to a copy of the base weights.

    Args:
        params: mapping of hyperparameter names to values; must contain the
            key 'cw_modifier'. The mapping itself is left untouched.

    Returns:
        A ``(model_params, class_weights)`` tuple where ``model_params`` is a
        new dict holding every entry of ``params`` except 'cw_modifier', and
        ``class_weights`` maps each class index (0, 1, ...) to its weight.

    Raises:
        KeyError: if ``params`` has no 'cw_modifier' entry.
    """
    # Work on a shallow copy: popping from the caller's dict would make a
    # second call with the same hyperparameters fail with KeyError and would
    # silently strip 'cw_modifier' from the dict the caller still holds.
    model_params = dict(params)
    cw_modifier = model_params.pop('cw_modifier')

    # Copy the shared base weights so the module-level value is never modified.
    class_weights = base_class_weights_large.copy()
    class_weights[1] = class_weights[1] * cw_modifier

    # Re-key the weight sequence by class index.
    class_weights = dict(enumerate(class_weights))
    return model_params, class_weights


In [5]:
# Retrain XGBoost with the best hyperparameters found during tuning and
# validate it against the test split.

# --- Run configuration ---
min_delta=1e-3  # forwarded to EarlyStopping(min_delta=...) below
data_size="small"  # selects the dataset variant and the trials file to load
base_data_path="data"
info_output_path="info-test"
early_stop_rounds=2  # forwarded to EarlyStopping(rounds=...) below

# Load the 'train_val' split for fitting and the 'test' split for evaluation.
X_train, Y_train = data_loader.load(
    data_size, base_data_path, 'train_val')
X_test, Y_test = data_loader.load(
    data_size, base_data_path, 'test'
)

os.makedirs(info_output_path,exist_ok=True) # make sure the info output directory exists

print("Processing data")
po_db = ProccessOnlyDataBox(
    X_train, Y_train, X_test, Y_test, data_config.categorical_variables)
# get_processed_data() is consumed with next(), so only the first tuple it
# yields is used; the processed splits replace the raw ones under the same names.
X_train, Y_train, X_test, Y_test = next(po_db.get_processed_data())

trials_in_path = f"trials/xgboost-{data_size}.p"
print(f"Loading trial from path: {trials_in_path}")
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load trials files that were produced by this project.
with open(trials_in_path, 'rb') as in_file:
    trials = pickle.load(in_file)

# Resolve the best trial's assignment against the search space to obtain
# concrete hyperparameter values.
space = spaces['xgboost']
best_hyperparams = space_eval(space, trials.argmin)
# Overrides the tuned n_estimators with 5.
# NOTE(review): presumably a quick-run/debug setting — confirm before
# reporting results from this model.
best_hyperparams['n_estimators']=5

# Separate the class-weight modifier from the actual model parameters.
model_params, class_weights = process_params(best_hyperparams)
# Build a fresh EarlyStopping callback for this training run.
callbacks = [
    EarlyStopping(rounds=early_stop_rounds,
                save_best=True, min_delta=min_delta)
]

xt = XGBoostTrainer(model_params, class_weights, callbacks)
# NOTE(review): the test split is passed as the validation data here; if early
# stopping monitors that set, the stopping point is chosen using test data —
# confirm this is intended.
model, metric = xt.train_and_validate(
    X_train, Y_train, X_test, Y_test, verbosity=1)


Processing data
Loading trial from path: trials/xgboost-small.p
[0]	validation_0-mlogloss:1.25505	validation_1-mlogloss:1.26526
[1]	validation_0-mlogloss:1.15374	validation_1-mlogloss:1.17273
[2]	validation_0-mlogloss:1.07113	validation_1-mlogloss:1.09805
[3]	validation_0-mlogloss:1.00424	validation_1-mlogloss:1.03814
[4]	validation_0-mlogloss:0.94728	validation_1-mlogloss:0.98928


In [9]:
# Inspect the fitted model: per-feature importance scores followed by the
# feature names seen during fitting.
# NOTE(review): the two arrays are presumably index-aligned (importance i
# belongs to name i) — confirm before pairing them.
print(model.feature_importances_)
print(model.feature_names_in_)

[0.01822665 0.01944143 0.01408613 0.01897242 0.08385193 0.00527491
 0.0536919  0.00551897 0.00996188 0.00582987 0.00455864 0.07889622
 0.0055125  0.00739548 0.02938766 0.12292625 0.0394145  0.01249719
 0.00558608 0.01657898 0.13931744 0.01933381 0.01630985 0.08851612
 0.01753619 0.00407501 0.00986527 0.018304   0.00353602 0.00827515
 0.         0.00744774 0.         0.00352645 0.00452249 0.
 0.01170093 0.00583692 0.00758948 0.01176694 0.01033749 0.00772074
 0.00463861 0.00866617 0.00103731 0.00494321 0.00573031 0.00368649
 0.00811127 0.00318009 0.00246109 0.00441776]
['Start_Lat' 'Start_Lng' 'End_Lat' 'End_Lng' 'Distance(mi)'
 'Temperature(F)' 'Wind_Chill(F)' 'Humidity(%)' 'Pressure(in)'
 'Visibility(mi)' 'Wind_Speed(mph)' 'Precipitation(in)' 'Wind_SN'
 'Wind_EW' 'Side_R' 'State_FL' 'State_MN' 'State_NC' 'State_NY' 'State_OR'
 'State_Other' 'State_PA' 'State_SC' 'State_TX' 'State_VA' 'Amenity_True'
 'Bump_True' 'Crossing_True' 'Give_Way_True' 'Junction_True'
 'No_Exit_True' 'Railway_Tr

In [6]:
from matplotlib import pyplot as plt
# TODO: plot the learning curve
# Evaluate the trained model on the test split.
preds = model.predict(X_test)

mcc = matthews_corrcoef(Y_test,preds)
print(f"mcc: {mcc}")
f1_weighted = f1_score(Y_test,preds,average='weighted')
print(f"f1 weighted: {f1_weighted}")
accuracy = accuracy_score(Y_test,preds)
# accuracy_score returns a fraction in [0, 1]; the '%' format spec multiplies
# by 100 so the printed number matches its percent sign (previously the raw
# fraction was printed with a trailing '%').
print(f"accuracy: {accuracy:.2%}")
confusion_mat = confusion_matrix(Y_test,preds)
print(f"confusion mat:\n{confusion_mat}")

# Collected metrics for this run. 'acc' stays the raw fraction.
# NOTE(review): 'confusion_m' holds the confusion_matrix() result as returned;
# if this dict is later written with json, it will likely need converting to
# plain lists first — confirm at the point of serialization.
metrics = {
    "mcc": mcc,
    "f1": f1_weighted,
    "confusion_m":confusion_mat,
    "acc": accuracy
}

mcc: 0.35789091441111787
f1 weighted: 0.8057012519909181
accuracy: 0.7597659338921398%
confusion mat:
[[  458    30    26     7]
 [ 1999 39509  5685  3467]
 [  158   557  1994   393]
 [   40   741   568  1275]]


In [20]:
# Gather the per-iteration mlogloss recorded during training into one frame.
# NOTE(review): 'validation_0' is treated as the training set and
# 'validation_1' as the held-out set — confirm against the eval_set order
# used by the trainer.
results = model.evals_result()
train_loss = results['validation_0']['mlogloss']
val_loss = results['validation_1']['mlogloss']

# One row per boosting iteration, numbered from 0.
rng = np.arange(len(train_loss))
df_dict = dict(iter=rng, train_loss=train_loss, val_loss=val_loss)
df = pd.DataFrame(df_dict)

Unnamed: 0,iter,train_loss,val_loss
0,0,1.255051,1.265255
1,1,1.153745,1.172727
2,2,1.071132,1.098047
3,3,1.004238,1.038136
4,4,0.947283,0.989283


In [10]:
# Reload the tuning trials and recompute the best hyperparameters from
# scratch; this rebinds best_hyperparams to a freshly evaluated dict.
trials_in_path = f"trials/xgboost-{data_size}.p"
print(f"Loading trial from path: {trials_in_path}")
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load trials files that were produced by this project.
with open(trials_in_path, 'rb') as in_file:
    trials = pickle.load(in_file)

# Resolve the best trial's assignment against the search space to obtain
# concrete hyperparameter values.
space = spaces['xgboost']
best_hyperparams = space_eval(space, trials.argmin)


Loading trial from path: trials/xgboost-small.p


In [11]:
# Overrides the tuned n_estimators with 5 (mutates best_hyperparams in place).
# NOTE(review): presumably a quick-run/debug setting — confirm before
# reporting results.
best_hyperparams['n_estimators']=5

{'alpha': 2.5254086047116173,
 'cw_modifier': 1.5,
 'eta': 0.1448110011361952,
 'gamma': 2.8184397966710937,
 'lambda': 2.2422484118022297,
 'max_delta_step': 7.123125948002439,
 'max_depth': 10,
 'min_child_weight': 4.2808020966224,
 'n_estimators': 1000,
 'subsample': 0.6875835975467501,
 'tree_method': 'exact'}