Expected Behind Model - Model Tuning and Building - GBM

In [1]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): this silences every warning raised through the `warnings` module for the
# rest of the session; narrow the filter if only one specific warning is the target.
warnings.filterwarnings('ignore')

from expected_score_model.config import behind_set_shots_file_path
from expected_score_model.domain.contracts.modelling_data_contract import ModellingDataContract
from expected_score_model.domain.modelling.hyperparameter_tuning import XGBHyperparameterTuner
from expected_score_model.domain.modelling.supermodel import SuperXGBClassifier
from expected_score_model.domain.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Raise pandas' display limits so the wide modelling frames render without truncation.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

# IPython autoreload, mode 2: re-import edited modules before each cell executes, so
# changes to the expected_score_model package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Model Version

In [2]:
# Model identity. `model_version` is embedded in every exported file name below,
# so bump it when retraining to avoid overwriting earlier artefacts.
model_version = 7
model_name = 'expected_behind_set'
model_file_name = f"{model_name}_v{model_version}"

# Root folder for all exported artefacts, defined once instead of repeated per path.
# Defaults to the original local checkout; set EXPECTED_SCORE_MODEL_OUTPUT_DIR to run
# the notebook on another machine without editing this cell.
_DEFAULT_MODEL_OUTPUTS_DIR = "/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-score-model/model_outputs"
model_outputs_dir = (os.environ.get("EXPECTED_SCORE_MODEL_OUTPUT_DIR") or _DEFAULT_MODEL_OUTPUTS_DIR).rstrip("/")

# NOTE: `model_output_path` has no trailing slash (callers add "/" themselves), while the
# prediction and preprocessor paths end with "/" because callers concatenate file names directly.
model_output_path = model_outputs_dir + "/models"
prediction_output_path = model_outputs_dir + "/predictions/"

preprocessor_file_name = f"preprocessor_v{model_version}"
preprocessor_output_path = model_outputs_dir + "/preprocessors/"

In [3]:
# Response column name from the shared data contract. Used below both as the target
# column and as the prefix of the "<RESPONSE>TrainingSet/TestSet/ValidationSet" flag columns.
RESPONSE = ModellingDataContract.RESPONSE_BEHIND

In [4]:
# Contract-defined feature columns; every split is reduced to exactly these before modelling.
FEATURES = ModellingDataContract.feature_list_set_behind

In [5]:
# Contract-defined monotonicity constraints; passed to the tuner and to the final model params.
MONOTONE_CONSTRAINTS = ModellingDataContract.monotone_constraints_set_behind

Load Data

In [6]:
# Read the modelling table from the CSV path configured in expected_score_model.config.
df_modelling = pd.read_csv(behind_set_shots_file_path)
# Preview the last rows as a quick check that the load produced the expected columns.
df_modelling.tail()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Behind,Miss,Score,Event_Type1,Set_Shot,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,BehindTrainingSet,BehindTestSet,BehindValidationSet
16981,186,possGain,behind,1447.0,4,185,224.0,Collingwood,Collingwood,Bobby Hill,Bobby_Hill,Kick,32.0,0.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Uncontested Mark,True,32.0,0.0,48.0,0.0,0.133929,True,False,False
16982,196,possGain,behind,1527.0,4,425,476.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,50.0,-19.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Mark On Lead,True,50.0,-19.0,35.510562,0.564569,0.153507,False,False,True
16983,210,throwIn,behind,1610.0,4,804,841.0,Brisbane Lions,Brisbane Lions,Eric Hipwood,Eric_Hipwood,Kick,36.0,37.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Uncontested Mark,True,36.0,37.0,57.489129,0.699193,0.085469,True,False,False
16984,224,possGain,goal,1740.0,4,1290,1348.0,Collingwood,Collingwood,Steele Sidebottom,Steele_Sidebottom,Kick,37.0,34.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Mark On Lead,True,37.0,34.0,54.81788,0.669043,0.091894,True,False,False
16985,233,ballUp,goal,1811.0,4,1591,1629.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,54.0,2.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Uncontested Mark,True,54.0,2.0,26.07681,0.076772,0.248447,False,False,True


In [7]:
# One-hot encode `Initial_State` into one boolean column per known chain-start state.
#
# The dummy columns are aligned to INITIAL_STATES *by name*. Assigning the raw
# `get_dummies` frame to a hard-coded column list pairs columns by position, which
# mislabels them (or fails with an opaque length error) as soon as a state is missing
# from the data or a new one appears.
INITIAL_STATES = ['ballUp', 'centreBounce', 'kickIn', 'possGain', 'throwIn']

initial_state_dummies = pd.get_dummies(df_modelling['Initial_State'])

# Fail loudly on states this notebook does not know about rather than dropping them silently.
unexpected_states = sorted(set(initial_state_dummies.columns) - set(INITIAL_STATES))
if unexpected_states:
    raise ValueError(f"Unexpected Initial_State values: {unexpected_states}")

# States absent from this extract still get a column (all False) so the schema stays fixed.
df_modelling[INITIAL_STATES] = initial_state_dummies.reindex(columns=INITIAL_STATES, fill_value=False)

In [8]:
# Row membership of each split is stored in flag columns named "<RESPONSE>TrainingSet",
# "<RESPONSE>TestSet" and "<RESPONSE>ValidationSet"; the validation rows serve as the
# calibration split (`cal_data`) further down.
training_data = df_modelling.loc[df_modelling[f"{RESPONSE}TrainingSet"]]
test_data = df_modelling.loc[df_modelling[f"{RESPONSE}TestSet"]]
cal_data = df_modelling.loc[df_modelling[f"{RESPONSE}ValidationSet"]]

In [9]:
# Separate the response from everything else. The X_* frames still contain the other
# outcome columns (e.g. Goal, Miss, Score); only the later FEATURES selection removes them.
X_train = training_data.drop(columns=RESPONSE)
y_train = training_data[RESPONSE]

X_test = test_data.drop(columns=RESPONSE)
y_test = test_data[RESPONSE]

X_cal = cal_data.drop(columns=RESPONSE)
y_cal = cal_data[RESPONSE]

In [10]:
# Observed response rate per split, in (train, test, calibration) order.
tuple(target.mean() for target in (y_train, y_test, y_cal))

(0.33192272309107634, 0.3366686286050618, 0.31309786607799855)

In [11]:
# Preview the test split; the response column is gone but the one-hot Initial_State columns are present.
X_test.head()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Miss,Score,Event_Type1,Set_Shot,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,BehindTrainingSet,BehindTestSet,BehindValidationSet,ballUp,centreBounce,kickIn,possGain,throwIn
1,28,possGain,behind,295.0,1,859,911.0,Sydney,Sydney,Sam Reid,Sam_Reid,Kick,52.0,35.0,ineffective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,1,Contested Mark,True,52.0,35.0,43.600459,0.931882,0.088007,False,True,False,False,False,False,True,False
17,168,centreBounce,miss,1480.0,3,1850,1899.0,Brisbane Lions,Brisbane Lions,Eric Hipwood,Eric_Hipwood,Kick,52.0,-31.0,clanger,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,1,0,Mark On Lead,True,52.0,-31.0,40.459857,0.872894,0.102289,False,True,False,False,True,False,False,False
19,189,kickIn,goal,1639.0,4,390,439.0,Brisbane Lions,Brisbane Lions,Lincoln McCarthy,Lincoln_McCarthy,Kick,55.0,-2.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,1,0,6,Free For,True,55.0,-2.0,23.086793,0.086738,0.281582,False,True,False,False,False,True,False,False
25,11,centreBounce,goal,103.0,1,190,249.0,Western Bulldogs,Western Bulldogs,Aaron Naughton,Aaron_Naughton,Kick,54.0,-18.0,effective,True,,141,160,Collingwood,Western Bulldogs,right,202101_Collingwood_WesternBulldogs,202101,2021.0,,1,0,6,Mark On Lead,True,54.0,-18.0,31.622777,0.605545,0.168122,False,True,False,False,True,False,False,False
36,139,possGain,behind,1190.0,3,205,256.0,Collingwood,Collingwood,Brody Mihocek,Brody_Mihocek,Kick,50.0,-30.0,ineffective,True,,141,160,Collingwood,Western Bulldogs,right,202101_Collingwood_WesternBulldogs,202101,2021.0,,0,0,1,Uncontested Mark,True,50.0,-30.0,42.426407,0.785398,0.107277,False,True,False,False,False,False,True,False


Preprocess Data

In [12]:
# "Preprocessing" here is only column selection: each split is cut down to FEATURES.
# No fitted preprocessor object is applied in this cell.
X_train_preproc = X_train.loc[:, FEATURES]
X_test_preproc = X_test.loc[:, FEATURES]
X_cal_preproc = X_cal.loc[:, FEATURES]

In [13]:
# (rows, columns) of the train and test feature matrices, in that order.
tuple(features.shape for features in (X_train_preproc, X_test_preproc))

((10870, 5), (3398, 5))

In [14]:
# Spot-check the training feature matrix (original row index is preserved).
X_train_preproc.head()

Unnamed: 0,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle
0,46.0,-41.0,52.009614,0.908067,0.075999
2,45.0,16.0,36.674242,0.451453,0.158231
4,33.0,12.0,46.572524,0.260602,0.13341
5,52.0,36.0,44.407207,0.945311,0.084822
6,47.0,35.0,46.754679,0.84593,0.091187


In [15]:
# Spot-check the test feature matrix (original row index is preserved).
X_test_preproc.head()

Unnamed: 0,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle
1,52.0,35.0,43.600459,0.931882,0.088007
17,52.0,-31.0,40.459857,0.872894,0.102289
19,55.0,-2.0,23.086793,0.086738,0.281582
25,54.0,-18.0,31.622777,0.605545,0.168122
36,50.0,-30.0,42.426407,0.785398,0.107277


Optuna Hyperparameter Tuning Class - HyperParameterTuner & XGBHyperparameterTuner

In [16]:
# The tuner only sees the training split, and is given the same monotonicity
# constraints that are later set on the final model's params.
xgb_tuner = XGBHyperparameterTuner(
    X_train_preproc,
    y_train,
    monotonicity_constraints=MONOTONE_CONSTRAINTS,
)

In [17]:
# Run the Optuna search. The saved output reports 1000 finished trials and the call
# returns the optuna Study, whose repr is what the cell displays last.
xgb_tuner.tune_hyperparameters()

[I 2023-10-21 13:33:54,671] A new study created in memory with name: no-name-28c20660-cec4-457d-948f-4ec5543ba5df
[I 2023-10-21 13:33:54,713] Trial 0 finished with value: 0.6782016151974789 and parameters: {'max_depth': 7, 'min_child_weight': 51, 'eta': 0.011452114065365677, 'gamma': 0.0031295321750993833, 'lambda': 0.0009103338124972951, 'alpha': 0.42986262648047296, 'subsample': 0.8980441881488417, 'colsample_bytree': 0.5093370236903694}. Best is trial 0 with value: 0.6782016151974789.
[I 2023-10-21 13:33:54,741] Trial 1 finished with value: 0.6096415374929813 and parameters: {'max_depth': 8, 'min_child_weight': 63, 'eta': 0.6413699964621897, 'gamma': 0.767309234609037, 'lambda': 1.8756163446558864, 'alpha': 0.0029522173417297634, 'subsample': 0.5475849812162268, 'colsample_bytree': 0.8014625367858947}. Best is trial 1 with value: 0.6096415374929813.
[I 2023-10-21 13:33:54,775] Trial 2 finished with value: 0.6677619249752951 and parameters: {'max_depth': 11, 'min_child_weight': 13, '

Number of finished trials:  1000
Best trial:
  Value: 0.5853429571994184
  Params: 
    max_depth: 2
    min_child_weight: 18
    eta: 0.32215869925866353
    gamma: 1.0440771645183424
    lambda: 0.20452596479571178
    alpha: 0.00109122067613544
    subsample: 0.8188029336352194
    colsample_bytree: 0.8354729847950436


<optuna.study.study.Study at 0x13d815050>

In [18]:
# Best hyperparameters found by the study; this dict is extended with training settings in the next section.
params = xgb_tuner.get_best_params()
# Display them so the tuned values are recorded in the notebook output.
params

{'max_depth': 2,
 'min_child_weight': 18,
 'eta': 0.32215869925866353,
 'gamma': 1.0440771645183424,
 'lambda': 0.20452596479571178,
 'alpha': 0.00109122067613544,
 'subsample': 0.8188029336352194,
 'colsample_bytree': 0.8354729847950436}

Training Model - SuperXGBClassifier class for training and predictions

In [19]:
# Add the fixed training settings on top of the tuned hyperparameters.
# How each key is consumed is decided inside SuperXGBClassifier — presumably
# `num_rounds` caps boosting rounds and `early_stopping_rounds` is the patience; TODO confirm.
params.update({
    'objective': OptunaXGBParamGrid.error,
    'num_rounds': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 1,
    'monotone_constraints': MONOTONE_CONSTRAINTS,
})

In [20]:
# Collect the train / test / calibration splits once, then build the wrapper
# with them and the completed parameter dict.
model_inputs = {
    "X_train": X_train_preproc,
    "y_train": y_train,
    "X_test": X_test_preproc,
    "y_test": y_test,
    "X_cal": X_cal_preproc,
    "y_cal": y_cal,
}
super_xgb = SuperXGBClassifier(**model_inputs, params=params)

In [21]:
# Train the model. The log prints logloss per round for two evaluation sets
# (validation_0 / validation_1) — presumably train and test; confirm in SuperXGBClassifier.fit.
super_xgb.fit()

[0]	validation_0-logloss:0.65018	validation_1-logloss:0.65179
[1]	validation_0-logloss:0.62895	validation_1-logloss:0.63115
[2]	validation_0-logloss:0.61741	validation_1-logloss:0.62042
[3]	validation_0-logloss:0.61100	validation_1-logloss:0.61411
[4]	validation_0-logloss:0.60712	validation_1-logloss:0.61050
[5]	validation_0-logloss:0.60465	validation_1-logloss:0.60821
[6]	validation_0-logloss:0.60318	validation_1-logloss:0.60701
[7]	validation_0-logloss:0.60183	validation_1-logloss:0.60555
[8]	validation_0-logloss:0.60076	validation_1-logloss:0.60449
[9]	validation_0-logloss:0.60003	validation_1-logloss:0.60425
[10]	validation_0-logloss:0.59970	validation_1-logloss:0.60423
[11]	validation_0-logloss:0.59913	validation_1-logloss:0.60366
[12]	validation_0-logloss:0.59892	validation_1-logloss:0.60368
[13]	validation_0-logloss:0.59872	validation_1-logloss:0.60353
[14]	validation_0-logloss:0.59829	validation_1-logloss:0.60297
[15]	validation_0-logloss:0.59816	validation_1-logloss:0.60288
[1

In [22]:
# Display the underlying estimator held by the wrapper after fitting.
super_xgb.xgb_model

In [23]:
# Check which feature names the trained booster recorded, to compare against FEATURES.
super_xgb.xgb_model.get_booster().feature_names

['x0',
 'y0',
 'Distance_to_Middle_Goal',
 'Angle_to_Middle_Goal',
 'Visible_Goal_Angle']

In [24]:
# Hard class predictions for the train and test splits (exported alongside the probabilities).
train_preds, test_preds = (
    super_xgb.predict(split)
    for split in (X_train_preproc, X_test_preproc)
)

In [25]:
# Uncalibrated probabilities: keep only column 1 of the predict_proba output for each split.
train_probas, test_probas, cal_probas = (
    super_xgb.predict_proba(split)[:, 1]
    for split in (X_train_preproc, X_test_preproc, X_cal_preproc)
)

In [26]:
# Fit the wrapper's probability calibrator — presumably on the X_cal / y_cal split supplied
# at construction; confirm in SuperXGBClassifier.calibrate.
super_xgb.calibrate()



In [27]:
# Calibrated probabilities for every split. Unlike the uncalibrated call, no [:, 1] slice
# is taken: with calibrate=True the wrapper appears to return the positive-class
# probability directly (the result is later stored as a single column) — TODO confirm.
train_cal_probas, test_cal_probas, cal_cal_probas = (
    super_xgb.predict_proba(split, calibrate=True)
    for split in (X_train_preproc, X_test_preproc, X_cal_preproc)
)

Check Average Predictions

In [28]:
# Training split: (mean raw probability, observed response rate, mean calibrated probability).
train_probas.mean(), training_data[RESPONSE].mean(), train_cal_probas.mean()

(0.33307832, 0.33192272309107634, 0.3149201485826931)

In [29]:
# Test split: (mean raw probability, observed response rate, mean calibrated probability).
# NOTE(review): in the saved output the calibrated mean (~0.316) is further from the observed
# rate (~0.337) than the raw mean (~0.334); the calibration split's observed rate is lower (~0.313).
test_probas.mean(), test_data[RESPONSE].mean(), test_cal_probas.mean()

(0.3344802, 0.3366686286050618, 0.31628968876000235)

In [30]:
# Calibration split: (mean raw probability, observed response rate, mean calibrated probability).
cal_probas.mean(), cal_data[RESPONSE].mean(), cal_cal_probas.mean()

(0.33087063, 0.31309786607799855, 0.3130978849577084)

Export model

In [31]:
# Export to <model_output_path>/<model_file_name>.joblib.
model_export_file = f"{model_output_path}/{model_file_name}.joblib"
super_xgb.export_model(model_export_file)

Export data and predictions

In [32]:
# Identifier columns plus the observed response, re-indexed from 0 so the rows line up
# with the (also re-indexed) feature frame and with the positional prediction arrays.
train_info = training_data.loc[:, ['Match_ID', "Chain_Number", "Order", RESPONSE]].reset_index(drop=True)

# Identifiers + features + class prediction, raw probability and calibrated probability.
train_export = pd.concat(
    [train_info, X_train_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=train_preds,
    xgb_probas=train_probas,
    xgb_probas_cal=train_cal_probas,
)

# Write next to the other prediction files; the model file name carries the version.
train_export.to_csv(f"{prediction_output_path}train_predictions_{model_file_name}.csv", index=False)
train_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Behind,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,17,183.0,1,46.0,-41.0,52.009614,0.908067,0.075999,0,0.327786,0.322267
1,202101_BrisbaneLions_Sydney,34,354.0,1,45.0,16.0,36.674242,0.451453,0.158231,0,0.324858,0.320271
2,202101_BrisbaneLions_Sydney,59,578.0,0,33.0,12.0,46.572524,0.260602,0.13341,0,0.395459,0.36543
3,202101_BrisbaneLions_Sydney,62,621.0,0,52.0,36.0,44.407207,0.945311,0.084822,0,0.425701,0.383036
4,202101_BrisbaneLions_Sydney,72,696.0,0,47.0,35.0,46.754679,0.84593,0.091187,0,0.392903,0.363898


In [33]:
# Identifier columns plus the observed response, re-indexed from 0 so the rows line up
# with the (also re-indexed) feature frame and with the positional prediction arrays.
test_info = test_data.loc[:, ['Match_ID', "Chain_Number", "Order", RESPONSE]].reset_index(drop=True)

# Identifiers + features + class prediction, raw probability and calibrated probability.
test_export = pd.concat(
    [test_info, X_test_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=test_preds,
    xgb_probas=test_probas,
    xgb_probas_cal=test_cal_probas,
)

# Write next to the other prediction files; the model file name carries the version.
test_export.to_csv(f"{prediction_output_path}test_predictions_{model_file_name}.csv", index=False)
test_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Behind,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,28,295.0,1,52.0,35.0,43.600459,0.931882,0.088007,0,0.425701,0.383036
1,202101_BrisbaneLions_Sydney,168,1480.0,0,52.0,-31.0,40.459857,0.872894,0.102289,0,0.418312,0.378823
2,202101_BrisbaneLions_Sydney,189,1639.0,0,55.0,-2.0,23.086793,0.086738,0.281582,0,0.132706,0.158961
3,202101_Collingwood_WesternBulldogs,11,103.0,0,54.0,-18.0,31.622777,0.605545,0.168122,0,0.272565,0.282599
4,202101_Collingwood_WesternBulldogs,139,1190.0,1,50.0,-30.0,42.426407,0.785398,0.107277,0,0.4246,0.382412
