Total Game Score Model - Model Tuning and Building - GBM

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('..')
from total_points_model.config import raw_data_file_path
from total_points_model.domain.preprocessing.data_preprocessor import DataPreprocessor
from total_points_model.domain.contracts.mappings import Mappings
from total_points_model.domain.contracts.rolling_columns import RollingColumns
from total_points_model.domain.contracts.modelling_data_contract import ModellingDataContract
from total_points_model.domain.modelling.hyperparameter_tuning import XGBHyperparameterTuner, XGBYearHyperparameterTuner
from total_points_model.domain.modelling.supermodel import SuperXGBRegressor
from total_points_model.domain.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Widen pandas' display limits so large frames can be inspected in cell output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Model Version

In [None]:
# Version tag shared by the exported model, its predictions and the preprocessor.
model_version = 8
model_name = "xgb_total_points"
model_file_name = f"{model_name}_v{model_version}"

# Output directories for the fitted model and for the prediction CSVs.
model_output_path = "/afl-total-points-model/total_points_model/models/"
prediction_output_path = "/afl-total-points-model/total_points_model/predictions/"

# The preprocessor is versioned with the same number as the model.
preprocessor_file_name = f"preprocessor_v{model_version}"
preprocessor_output_path = "/afl-total-points-model/total_points_model/preprocessors/"

Load Data

In [None]:
# Read the raw match data, then keep only the seasons used for modelling:
# strictly after 2004 and strictly before 2023, with 2020 left out.
afl_data = pd.read_csv(raw_data_file_path)
season = afl_data['Year']
afl_data = afl_data[season.gt(2004) & season.lt(2023) & season.ne(2020)]
afl_data.head()

In [None]:
# Rows flagged in the split column form the training set; the remaining rows
# are held out as the test set.
is_train_row = afl_data[ModellingDataContract.TRAIN_TEST_SPLIT_COL]
training_data = afl_data[is_train_row]
test_data = afl_data[~is_train_row]

In [None]:
# Separate the response column from the features for each split.
response_col = ModellingDataContract.RESPONSE

X_train = training_data.drop(columns=[response_col])
y_train = training_data[response_col]

X_test = test_data.drop(columns=[response_col])
y_test = test_data[response_col]

In [None]:
# Preview the first rows of the test-split features (response column removed).
X_test.head()

Preprocess Data

In [None]:
# Create the project DataPreprocessor, configured with the Mappings contract and
# the rolling_dict defined on RollingColumns.
preprocessor = DataPreprocessor(Mappings=Mappings, rolling_dict=RollingColumns.rolling_dict)

In [None]:
# Fit the preprocessor on the training features only; the test split is not passed to fit().
preprocessor.fit(X_train)

In [None]:
# Apply the preprocessor fitted above to both the training and test features.
X_train_preproc = preprocessor.transform(X_train)
X_test_preproc = preprocessor.transform(X_test)

In [None]:
# Show the shape of each preprocessed split side by side as a sanity check.
X_train_preproc.shape, X_test_preproc.shape

In [None]:
# Preview the first rows of the preprocessed training features.
X_train_preproc.head()

In [None]:
# Preview the first rows of the preprocessed test features.
X_test_preproc.head()

Optuna Hyperparameter Tuning Classes - HyperParameterTuner, XGBHyperparameterTuner & XGBYearHyperparameterTuner (used below)

In [None]:
# Every preprocessed column whose name contains "avg" is mapped to a
# monotonicity value of 1.
monotone_cols = [col for col in X_train_preproc.columns if "avg" in col]
monotone_increasing = [1 for _ in monotone_cols]
monotonicity_dict = {col: direction for col, direction in zip(monotone_cols, monotone_increasing)}

In [None]:
# Build the tuner on the preprocessed training features and response, with
# OptunaXGBParamGrid supplied as its optuna_grid.
# NOTE(review): monotonicity_constraints is an empty dict, so the monotonicity_dict
# built in the previous cell is not used here — confirm this is intentional.
xgb_tuner = XGBYearHyperparameterTuner(X_train_preproc, y_train, optuna_grid=OptunaXGBParamGrid, monotonicity_constraints={})

In [None]:
# Preview the first rows of the training data held by the tuner.
xgb_tuner.training_data.head()

In [None]:
# Run the tuner's hyperparameter search.
xgb_tuner.tune_hyperparameters()

In [None]:
# Fetch the best parameter set reported by the tuner and display it.
params = xgb_tuner.get_best_params()
params

Training Model - SuperXGBRegressor class for training and predictions

In [None]:
# Add the fixed training settings on top of the tuned parameters.
params.update(
    {
        'objective': 'reg:squarederror',
        'num_rounds': 1000,
        'early_stopping_rounds': 50,
        'verbosity': 1,
        'monotone_constraints': {},
    }
)

In [None]:
# Construct the project SuperXGBRegressor with both preprocessed splits and the
# final parameter dict.
# NOTE(review): params sets early_stopping_rounds and the test split is passed in
# here. If the class uses X_test/y_test as its early-stopping evaluation set, test
# metrics will be optimistically biased — confirm against SuperXGBRegressor.
super_xgb = SuperXGBRegressor(X_train = X_train_preproc, 
                              y_train = y_train, 
                              X_test = X_test_preproc, 
                              y_test = y_test, 
                              params = params)

In [None]:
# Train the model using the data and parameters supplied at construction.
super_xgb.fit()

In [None]:
# Display the xgb_model attribute of the wrapper for inspection.
super_xgb.xgb_model

In [None]:
# Score both splits; the Match_ID column is removed from the frame passed to predict().
non_feature_cols = ["Match_ID"]
train_preds = super_xgb.predict(X_train_preproc.drop(columns=non_feature_cols))
test_preds = super_xgb.predict(X_test_preproc.drop(columns=non_feature_cols))

Check Average Predictions

In [None]:
# Mean prediction next to mean actual response: train pair first, then test pair.
response_col = ModellingDataContract.RESPONSE
(
    train_preds.mean(),
    training_data[response_col].mean(),
    test_preds.mean(),
    test_data[response_col].mean(),
)

Check Distribution

In [None]:
# Overlay kernel density estimates of actual vs predicted response values for
# both splits: red = actual, blue = predicted; solid = train, dashed = test.
#
# `fill=True` replaces the deprecated `shade=True`; the deprecation warning is not
# visible in this notebook because warnings are globally ignored in the import cell.
# kdeplot returns a matplotlib Axes (not a Figure), so it is named `ax` and passed
# to the later calls to keep all four curves on one plot.
ax = sns.kdeplot(training_data[ModellingDataContract.RESPONSE], fill=True, color="r", label="Train actual")
sns.kdeplot(train_preds, fill=True, color="b", label="Train predicted", ax=ax)
sns.kdeplot(test_data[ModellingDataContract.RESPONSE], fill=True, color="r", linestyle="--", label="Test actual", ax=ax)
sns.kdeplot(test_preds, fill=True, color="b", linestyle="--", label="Test predicted", ax=ax)
ax.set_title("Actual vs predicted distribution")
# The labels above feed the legend; the trailing semicolon hides the Legend repr.
ax.legend();

Export model

In [None]:
# Write the fitted model to the versioned .joblib file in the model output directory.
model_export_file = f"{model_output_path}{model_file_name}.joblib"
super_xgb.export_model(model_export_file)

Export data and predictions

In [None]:
# Assemble the training export: match identifiers and actual score, the
# preprocessed features, and the model prediction, then write it to CSV.
#
# train_preds was predicted from X_train_preproc, so it is row-aligned with that
# frame. The predictions are attached there and carried through the Match_ID join.
# Assigning them positionally to the merged frame (ordered by training_data) would
# mislabel rows if the preprocessor ever changed row order.
train_features_with_preds = X_train_preproc.assign(xgb_preds=np.asarray(train_preds))
train_export = training_data[['Match_ID', 'Home_Team', 'Away_Team', 'Round_ID', 'Total_Game_Score']].reset_index(drop = True)
# validate= makes a duplicated Match_ID fail loudly instead of silently duplicating rows.
train_export = pd.merge(train_export, train_features_with_preds, how='left', on = "Match_ID", validate="one_to_one")
train_export.to_csv(prediction_output_path + 'train_predictions_' + model_file_name + '.csv', index = False)
train_export.head()

In [None]:
# Assemble the test export: match identifiers and actual score, the
# preprocessed features, and the model prediction, then write it to CSV.
#
# test_preds was predicted from X_test_preproc, so it is row-aligned with that
# frame. The predictions are attached there and carried through the Match_ID join.
# Assigning them positionally to the merged frame (ordered by test_data) would
# mislabel rows if the preprocessor ever changed row order.
test_features_with_preds = X_test_preproc.assign(xgb_preds=np.asarray(test_preds))
test_export = test_data[['Match_ID', 'Home_Team', 'Away_Team', 'Round_ID', 'Total_Game_Score']].reset_index(drop = True)
# validate= makes a duplicated Match_ID fail loudly instead of silently duplicating rows.
test_export = pd.merge(test_export, test_features_with_preds, how='left', on = "Match_ID", validate="one_to_one")
test_export.to_csv(prediction_output_path + 'test_predictions_' + model_file_name + '.csv', index = False)
test_export.head()

Save preprocessor

In [None]:
# Persist the preprocessor to the versioned .joblib file in the preprocessor directory.
preprocessor_export_file = f"{preprocessor_output_path}{preprocessor_file_name}.joblib"
joblib.dump(preprocessor, preprocessor_export_file)