Model Building - Classification Template

In [None]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
# Install a global filter that ignores every warning raised through the
# `warnings` module for the rest of the kernel session.
# NOTE(review): this hides all warning categories, including deprecation
# notices — consider narrowing the filter to specific categories/modules.
warnings.filterwarnings('ignore')

from config import modelling_file_path, model_output_path, prediction_output_path
from src.modelling_data_contract import ModellingDataContract
from src.modelling.hyperparameter_tuning import XGBHyperparameterTuner
from src.modelling.supermodel import SuperXGBClassifier
from src.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Raise pandas' display limits so frames show up to 100 rows and 999 columns
# before being truncated in cell output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

# Load IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to the project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Model Version

In [None]:
# Model identity. `model_name` is left blank in this template; set it (and bump
# `model_version` as needed) before running the export cells.
model_version = 1
model_name = ""
# File stem "<model_name>_v<model_version>", reused by the model export and
# the predictions export further down.
model_file_name = f"{model_name}_v{model_version}"

In [None]:
# Name of the response column, taken from the data contract; it is used below
# both to select the target and as the prefix of the split-flag columns.
RESPONSE = ModellingDataContract.RESPONSE

In [None]:
# Feature columns from the data contract; used below to subset every
# predictor frame before tuning, training and prediction.
FEATURES = ModellingDataContract.feature_list

In [None]:
# Monotonicity constraints from the data contract; passed to the tuner and
# also written into the final training parameters.
MONOTONE_CONSTRAINTS = ModellingDataContract.monotone_constraints

Load Data

In [None]:
# Read the modelling dataset from the configured CSV path and preview its
# last five rows.
modelling_data = pd.read_csv(filepath_or_buffer=modelling_file_path)
modelling_data.tail(n=5)

In [None]:
# Row subsets selected via the flag columns "<RESPONSE>TrainingSet",
# "<RESPONSE>TestSet" and "<RESPONSE>ValidationSet". The validation rows are
# the ones later passed to the classifier as its calibration data.
# NOTE(review): assumes read_csv parsed these flag columns as booleans — confirm.
training_data = modelling_data[modelling_data[f"{RESPONSE}TrainingSet"]]
test_data = modelling_data[modelling_data[f"{RESPONSE}TestSet"]]
cal_data = modelling_data[modelling_data[f"{RESPONSE}ValidationSet"]]

In [None]:
# Separate predictors (everything except the response column) from the
# response, for the full dataset and for each split.
X = modelling_data.drop(columns=RESPONSE)
y = modelling_data[RESPONSE]

X_train = training_data.drop(columns=RESPONSE)
y_train = training_data[RESPONSE]

X_test = test_data.drop(columns=RESPONSE)
y_test = test_data[RESPONSE]

X_cal = cal_data.drop(columns=RESPONSE)
y_cal = cal_data[RESPONSE]

In [None]:
# Restrict every predictor frame to the contract's feature columns, in the
# order: full data, training, test, calibration.
X_preproc, X_train_preproc, X_test_preproc, X_cal_preproc = (
    frame[FEATURES] for frame in (X, X_train, X_test, X_cal)
)

Optuna Hyperparameter Tuning Class - HyperParameterTuner & XGBHyperparameterTuner

In [None]:
# Build the tuner on the training split only, carrying the monotonicity
# constraints from the data contract.
xgb_tuner = XGBHyperparameterTuner(
    X_train_preproc,
    y_train,
    monotonicity_constraints=MONOTONE_CONSTRAINTS,
)

In [None]:
# Run the hyperparameter search (Optuna-based, per this section's title);
# the best parameters are read back from the tuner in the next cell.
xgb_tuner.tune_hyperparameters()

In [None]:
# Retrieve the best parameter set found by the tuner and display it.
# Note that the training section below adds further keys to this same object.
params = xgb_tuner.get_best_params()
params

Training Model - SuperXGBClassifier class for training and predictions

In [None]:
# Layer the fixed training settings on top of the tuned parameters.
# NOTE(review): the objective is read from `OptunaXGBParamGrid.error` — confirm
# that attribute holds the intended XGBoost objective.
params.update({
    'objective': OptunaXGBParamGrid.error,
    'num_rounds': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 1,
    'monotone_constraints': MONOTONE_CONSTRAINTS,
})

In [None]:
# Bundle the training, test and calibration splits together with the final
# parameters into the classifier wrapper.
super_xgb = SuperXGBClassifier(
    X_train=X_train_preproc,
    y_train=y_train,
    X_test=X_test_preproc,
    y_test=y_test,
    X_cal=X_cal_preproc,
    y_cal=y_cal,
    params=params,
)

In [None]:
# Train the model; fit() takes no arguments because the data and parameters
# were supplied when the wrapper was constructed.
super_xgb.fit()

In [None]:
# Display the underlying XGBoost model object held by the wrapper.
super_xgb.xgb_model

In [None]:
# Show the feature names recorded on the trained booster; useful for checking
# them against FEATURES.
super_xgb.xgb_model.get_booster().feature_names

In [None]:
# Class predictions for the full dataset, the training split and the test
# split, in that order.
preds, train_preds, test_preds = map(
    super_xgb.predict,
    (X_preproc, X_train_preproc, X_test_preproc),
)

In [None]:
# Uncalibrated probabilities; `[:, 1]` keeps the second column of the
# predict_proba output (presumably the positive class, as in the scikit-learn
# convention — confirm against SuperXGBClassifier).
probas = super_xgb.predict_proba(X_preproc)[:, 1]
train_probas = super_xgb.predict_proba(X_train_preproc)[:, 1]
test_probas = super_xgb.predict_proba(X_test_preproc)[:, 1]
# Stored under its own name rather than `cal_probas`: that name is used later
# in the notebook for the calibrated probabilities on the full dataset, and
# sharing it made re-running this cell silently replace those values with an
# array of different meaning and length.
cal_set_probas = super_xgb.predict_proba(X_cal_preproc)[:, 1]

In [None]:
# Fit the probability calibration step — presumably on the X_cal / y_cal data
# passed to the constructor; confirm in SuperXGBClassifier. Calibrated
# outputs are then requested with `calibrate=True`.
super_xgb.calibrate()

In [None]:
# Calibrated probabilities for the full dataset, the training split and the
# test split, in that order.
# NOTE(review): unlike the uncalibrated calls, no `[:, 1]` slice is applied
# here — confirm that `calibrate=True` already returns a 1-D array of
# probabilities, otherwise the mean checks and the export column will be off.
cal_probas, train_cal_probas, test_cal_probas = (
    super_xgb.predict_proba(frame, calibrate=True)
    for frame in (X_preproc, X_train_preproc, X_test_preproc)
)

Check Average Predictions

In [None]:
# Full dataset: (mean raw probability, observed response mean, mean calibrated
# probability). `y` is the response column of `modelling_data`.
probas.mean(), y.mean(), cal_probas.mean()

In [None]:
# Training split: (mean raw probability, observed response mean, mean
# calibrated probability). `y_train` is the response column of `training_data`.
train_probas.mean(), y_train.mean(), train_cal_probas.mean()

In [None]:
# Test split: (mean raw probability, observed response mean, mean calibrated
# probability). `y_test` is the response column of `test_data`.
test_probas.mean(), y_test.mean(), test_cal_probas.mean()

Export model

In [None]:
# Export the trained model to "<model_file_name>.joblib" inside
# `model_output_path`. os.path.join inserts the separator only when needed,
# so a configured directory that already ends with a slash does not produce
# a doubled separator.
super_xgb.export_model(os.path.join(model_output_path, model_file_name + ".joblib"))

Export data and predictions

In [None]:
# Identifier columns plus the response, re-indexed from 0 so they align
# positionally with the (also re-indexed) feature frame in the concat below.
modelling_info = modelling_data[['match_id', 'chain_number', 'order', RESPONSE]].reset_index(drop=True)
modelling_export = pd.concat([modelling_info, X_preproc.reset_index(drop=True)], axis=1)

# Attach the class predictions, raw probabilities and calibrated probabilities
# computed on the full dataset.
modelling_export['xgb_preds'] = preds
modelling_export['xgb_probas'] = probas
modelling_export['xgb_probas_cal'] = cal_probas

# Build the output path with os.path.join so the file is written inside
# `prediction_output_path` whether or not that setting ends with a path
# separator; plain string concatenation would fuse the directory name and
# the file name when the trailing separator is missing.
predictions_file = os.path.join(prediction_output_path, 'predictions_' + model_file_name + '.csv')
modelling_export.to_csv(predictions_file, index=False)
modelling_export.head()