Open Play Shots - Expected Behind Model - Model Tuning and Building - GBM

In [1]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

from expected_score_model.config import behind_open_shots_file_path
from expected_score_model.domain.contracts.modelling_data_contract import ModellingDataContract
from expected_score_model.domain.modelling.hyperparameter_tuning import XGBHyperparameterTuner
from expected_score_model.domain.modelling.supermodel import SuperXGBClassifier
from expected_score_model.domain.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Widen pandas' display limits so the wide shot-level frames render in full.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

Model Version

In [2]:
# Version tag and base name shared by every artefact written by this notebook.
model_version = 8
model_name = 'expected_behind_open'
model_file_name = model_name + '_v' + str(model_version)

# Single root folder for all model artefacts.  The default is the original
# author's checkout; set EXPECTED_SCORE_MODEL_OUTPUTS to point the notebook at
# a different location without editing this cell.  A trailing slash is
# stripped so the sub-folder paths below are always formed the same way.
model_outputs_root = os.environ.get(
    "EXPECTED_SCORE_MODEL_OUTPUTS",
    "/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-score-model/model_outputs",
).rstrip("/")

# NOTE: `model_output_path` intentionally has no trailing slash (later cells
# join it with "/"), while the other two paths keep theirs (later cells append
# the file name directly).
model_output_path = model_outputs_root + "/models"
prediction_output_path = model_outputs_root + "/predictions/"

preprocessor_file_name = 'preprocessor_v' + str(model_version)
preprocessor_output_path = model_outputs_root + "/preprocessors/"

In [3]:
# Name of the target column.  It is also used as the prefix of the
# "<RESPONSE>TrainingSet" / "TestSet" / "ValidationSet" split-flag columns.
RESPONSE = ModellingDataContract.RESPONSE_BEHIND

In [4]:
# Columns selected from the modelling frame to form the model's feature matrix.
FEATURES = ModellingDataContract.feature_list_open_behind

In [5]:
# Monotonicity constraints handed to both the hyperparameter tuner and the
# final training parameters.
# NOTE(review): presumably keyed/ordered to match FEATURES — confirm in the contract.
MONOTONE_CONSTRAINTS = ModellingDataContract.monotone_constraints_open_behind

Load Data

In [6]:
# Read the shot-level modelling data from the CSV path defined in the project
# config, then preview the last rows as a quick check that the load worked.
df_modelling = pd.read_csv(behind_open_shots_file_path)
df_modelling.tail()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Behind,Miss,Score,Event_Type1,Set_Shot,x0,y0,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,BehindTrainingSet,BehindTestSet,BehindValidationSet
14170,166,ballUp,behind,1260.0,3,1299,1303.0,Collingwood,Collingwood,Mason Cox,Mason_Cox,Kick,73.0,-9.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Handball Received,False,73.0,-9.0,0.0,11.401754,0.909753,0.374081,False,False,True
14171,184,possGain,behind,1430.0,4,122,156.0,Collingwood,Collingwood,Nick Daicos,Nick_Daicos,Kick,72.0,24.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Gather,False,72.0,24.0,2.0,25.298221,1.249046,0.081301,True,False,False
14172,207,possGain,behind,1587.0,4,744,746.0,Brisbane Lions,Brisbane Lions,Keidean Coleman,Keidean_Coleman,Kick,26.0,-1.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Handball Received,False,26.0,-1.0,7.071068,54.009258,0.018516,0.118895,True,False,False
14173,221,ballUp,goal,1718.0,4,1109,1128.0,Brisbane Lions,Brisbane Lions,Charlie Cameron,Charlie_Cameron,Kick,73.0,6.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Hard Ball Get,False,73.0,6.0,2.0,9.219544,0.708626,0.599251,True,False,False
14174,222,centreBounce,goal,1732.0,4,1202,1219.0,Collingwood,Collingwood,Jordan De Goey,Jordan_De_Goey,Kick,32.0,8.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Handball Received,False,32.0,8.0,7.211103,48.6621,0.165149,0.130293,True,False,False


In [7]:
# One-hot encode `Initial_State` into a fixed, explicitly named set of
# indicator columns.
#
# Assigning the raw `pd.get_dummies` output to a hard-coded list of names binds
# columns by position: it raises if a state is absent from the data and would
# mislabel columns if the observed states differed from the expected ones.
# Declaring the categories up front guarantees one column per expected state
# (all-False when unobserved), and each column is assigned by name.
INITIAL_STATES = ['ballUp', 'centreBounce', 'kickIn', 'possGain', 'throwIn']

# Fail loudly on states this notebook does not know about rather than
# silently dropping them from the encoding.
unexpected_states = set(df_modelling['Initial_State'].dropna().unique()) - set(INITIAL_STATES)
if unexpected_states:
    raise ValueError(
        "Unexpected Initial_State values: " + ", ".join(sorted(map(str, unexpected_states)))
    )

initial_state_dummies = pd.get_dummies(
    df_modelling['Initial_State'].astype(pd.CategoricalDtype(categories=INITIAL_STATES))
)
for state in INITIAL_STATES:
    df_modelling[state] = initial_state_dummies[state]

In [8]:
# Each split is flagged by a column named "<RESPONSE><suffix>"; pull the three
# flag columns out first, then filter the modelling frame with each of them.
is_training_row = df_modelling[RESPONSE + "TrainingSet"]
is_test_row = df_modelling[RESPONSE + "TestSet"]
is_calibration_row = df_modelling[RESPONSE + "ValidationSet"]

training_data = df_modelling[is_training_row]
test_data = df_modelling[is_test_row]
# The "validation" split is the one used for probability calibration below.
cal_data = df_modelling[is_calibration_row]

In [9]:
def _split_features_and_response(frame):
    """Return `frame` without the response column, and the response column itself."""
    return frame.drop(columns=[RESPONSE]), frame[RESPONSE]


X_train, y_train = _split_features_and_response(training_data)
X_test, y_test = _split_features_and_response(test_data)
X_cal, y_cal = _split_features_and_response(cal_data)

In [10]:
# Preview the test split after the response column has been removed.
X_test.head()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Miss,Score,Event_Type1,Set_Shot,x0,y0,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,BehindTrainingSet,BehindTestSet,BehindValidationSet,ballUp,centreBounce,kickIn,possGain,throwIn
1,6,possGain,goal,61.0,1,149,168.0,Brisbane Lions,Brisbane Lions,Zac Bailey,Zac_Bailey,Kick,35.0,19.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,1,0,6,Handball Received,False,35.0,19.0,7.0,47.010637,0.416065,0.125105,False,True,False,False,False,False,True,False
5,43,possGain,behind,444.0,1,1272,1317.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,27.0,31.0,ineffective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,1,Handball Received,False,27.0,31.0,3.162278,59.682493,0.546167,0.091898,False,True,False,False,False,False,True,False
9,83,throwIn,goal,781.0,2,848,861.0,Brisbane Lions,Brisbane Lions,Jarryd Lyons,Jarryd_Lyons,Kick,46.0,2.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,1,0,6,Gather,False,46.0,2.0,5.0,32.062439,0.062419,0.201226,False,True,False,False,False,False,False,True
11,91,possGain,miss,863.0,2,1223,1226.0,Brisbane Lions,Brisbane Lions,Zac Bailey,Zac_Bailey,Kick,54.0,-39.0,clanger,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,1,0,Hard Ball Get Crumb,False,54.0,-39.0,7.81025,45.793013,1.019141,0.073607,False,True,False,False,False,False,True,False
19,161,possGain,goal,1422.0,3,1570,1575.0,Brisbane Lions,Brisbane Lions,Jarrod Berry,Jarrod_Berry,Kick,38.0,-37.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,1,0,6,Handball Received,False,38.0,-37.0,9.219544,54.488531,0.746457,0.086523,False,True,False,False,False,False,True,False


Preprocess Data

In [11]:
# NOTE(review): disabled cell. `DataPreprocessor` is not imported in this
# notebook, so these lines cannot run as written; the feature matrices are
# instead built by selecting FEATURES directly in the following cell.
# Either restore the import and this step, or delete the cell.
# preprocessor = DataPreprocessor()
# preprocessor.fit(X_train)

# X_train_preproc = preprocessor.transform(X_train)
# X_test_preproc = preprocessor.transform(X_test)

In [12]:
# Restrict every split to the model's feature columns, in the same order
# (train, test, calibration) as the targets on the left-hand side.
X_train_preproc, X_test_preproc, X_cal_preproc = (
    split_frame[FEATURES] for split_frame in (X_train, X_test, X_cal)
)

In [13]:
# Show (rows, columns) for the train and test feature matrices; the column
# counts should match.
X_train_preproc.shape, X_test_preproc.shape

((9072, 6), (2835, 6))

In [14]:
# Preview the training feature matrix (FEATURES columns only).
X_train_preproc.head()

Unnamed: 0,x0,y0,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle
0,26.0,-21.0,8.246211,56.0803,0.383818,0.106164
2,50.0,24.0,4.0,36.878178,0.708626,0.132764
3,34.0,15.0,15.132746,46.486557,0.328553,0.13093
4,40.0,-11.0,4.123106,39.560081,0.281772,0.156423
6,56.0,6.0,15.132746,22.803509,0.266252,0.276208


In [15]:
# Preview the test feature matrix (FEATURES columns only).
X_test_preproc.head()

Unnamed: 0,x0,y0,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle
1,35.0,19.0,7.0,47.010637,0.416065,0.125105
5,27.0,31.0,3.162278,59.682493,0.546167,0.091898
9,46.0,2.0,5.0,32.062439,0.062419,0.201226
11,54.0,-39.0,7.81025,45.793013,1.019141,0.073607
19,38.0,-37.0,9.219544,54.488531,0.746457,0.086523


Optuna Hyperparameter Tuning Class - HyperParameterTuner & XGBHyperparameterTuner

In [16]:
# Build the tuner on the training split only, passing the same monotonicity
# constraints that are later added to the final training parameters.
xgb_tuner = XGBHyperparameterTuner(X_train_preproc, y_train, monotonicity_constraints=MONOTONE_CONSTRAINTS)

In [17]:
# Check that the tuner holds the expected training feature matrix.
xgb_tuner.training_data.head()

Unnamed: 0,x0,y0,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle
0,26.0,-21.0,8.246211,56.0803,0.383818,0.106164
2,50.0,24.0,4.0,36.878178,0.708626,0.132764
3,34.0,15.0,15.132746,46.486557,0.328553,0.13093
4,40.0,-11.0,4.123106,39.560081,0.281772,0.156423
6,56.0,6.0,15.132746,22.803509,0.266252,0.276208


In [18]:
# Run the hyperparameter search; each trial is logged and the cell's displayed
# result is an Optuna study object.
# NOTE(review): no random seed is set anywhere in this notebook — confirm
# whether XGBHyperparameterTuner seeds its sampler; if not, the best
# parameters below will differ between runs.
xgb_tuner.tune_hyperparameters()

[I 2023-10-21 14:03:54,352] A new study created in memory with name: no-name-abdfb04e-2241-4af8-b367-6c43e2e78183
[I 2023-10-21 14:03:54,385] Trial 0 finished with value: 0.654629824011258 and parameters: {'max_depth': 3, 'min_child_weight': 72, 'eta': 0.032152682977203306, 'gamma': 0.0011232458320316868, 'lambda': 0.0385756337951647, 'alpha': 0.01028032150847008, 'subsample': 0.215368993238393, 'colsample_bytree': 0.7314403954064685}. Best is trial 0 with value: 0.654629824011258.
[I 2023-10-21 14:03:54,403] Trial 1 finished with value: 0.6484961693402147 and parameters: {'max_depth': 4, 'min_child_weight': 40, 'eta': 0.3791951397051201, 'gamma': 0.3102319746708409, 'lambda': 1.8346043206602882, 'alpha': 0.1947119874339484, 'subsample': 0.2401440183911411, 'colsample_bytree': 0.3993962382600858}. Best is trial 1 with value: 0.6484961693402147.
[I 2023-10-21 14:03:54,423] Trial 2 finished with value: 0.6483183112896453 and parameters: {'max_depth': 14, 'min_child_weight': 94, 'eta': 0.

Number of finished trials:  1000
Best trial:
  Value: 0.6249692973223142
  Params: 
    max_depth: 8
    min_child_weight: 5
    eta: 0.45167799059210706
    gamma: 0.0030406096470583983
    lambda: 3.0079512673143083
    alpha: 1.2007909209885528
    subsample: 0.8776680686942998
    colsample_bytree: 0.5701354897031586


<optuna.study.study.Study at 0x14d99e5d0>

In [19]:
# Fetch the best trial's parameters as a dict and display them; the dict is
# extended with training settings in the next code cell.
params = xgb_tuner.get_best_params()
params

{'max_depth': 8,
 'min_child_weight': 5,
 'eta': 0.45167799059210706,
 'gamma': 0.0030406096470583983,
 'lambda': 3.0079512673143083,
 'alpha': 1.2007909209885528,
 'subsample': 0.8776680686942998,
 'colsample_bytree': 0.5701354897031586}

Training Model - SuperXGBClassifier class for training and predictions

In [20]:
# Add the fixed training settings to the tuned hyperparameters in one step.
# Keys are inserted in the same order as before.
# NOTE(review): the objective is read from `OptunaXGBParamGrid.error` —
# presumably the objective identifier used during tuning; confirm.
params.update(
    {
        'objective': OptunaXGBParamGrid.error,
        'num_rounds': 1000,
        'early_stopping_rounds': 50,
        'verbosity': 1,
        'monotone_constraints': MONOTONE_CONSTRAINTS,
    }
)

In [21]:
# Gather the three data splits and the parameter dict, then hand them to the
# classifier wrapper as keyword arguments.
super_xgb_inputs = dict(
    X_train=X_train_preproc,
    y_train=y_train,
    X_test=X_test_preproc,
    y_test=y_test,
    X_cal=X_cal_preproc,
    y_cal=y_cal,
    params=params,
)
super_xgb = SuperXGBClassifier(**super_xgb_inputs)

In [22]:
# Train the model; per-round log-loss is printed for two evaluation sets.
# NOTE(review): presumably validation_0 is train and validation_1 is test — confirm in SuperXGBClassifier.
super_xgb.fit()

[0]	validation_0-logloss:0.64551	validation_1-logloss:0.64289
[1]	validation_0-logloss:0.64183	validation_1-logloss:0.63958
[2]	validation_0-logloss:0.63804	validation_1-logloss:0.63671
[3]	validation_0-logloss:0.63704	validation_1-logloss:0.63547
[4]	validation_0-logloss:0.63420	validation_1-logloss:0.63389
[5]	validation_0-logloss:0.63290	validation_1-logloss:0.63337
[6]	validation_0-logloss:0.63146	validation_1-logloss:0.63353
[7]	validation_0-logloss:0.62964	validation_1-logloss:0.63400
[8]	validation_0-logloss:0.62768	validation_1-logloss:0.63330
[9]	validation_0-logloss:0.62649	validation_1-logloss:0.63426
[10]	validation_0-logloss:0.62638	validation_1-logloss:0.63429
[11]	validation_0-logloss:0.62536	validation_1-logloss:0.63382
[12]	validation_0-logloss:0.62498	validation_1-logloss:0.63364
[13]	validation_0-logloss:0.62289	validation_1-logloss:0.63421
[14]	validation_0-logloss:0.62249	validation_1-logloss:0.63386
[15]	validation_0-logloss:0.62224	validation_1-logloss:0.63374
[1

In [23]:
# Display the fitted underlying model object held by the wrapper.
super_xgb.xgb_model

In [24]:
# Confirm the booster was trained on exactly the expected feature columns.
super_xgb.xgb_model.get_booster().feature_names

['x0',
 'y0',
 'Distance_Since_Last_Action',
 'Distance_to_Middle_Goal',
 'Angle_to_Middle_Goal',
 'Visible_Goal_Angle']

In [25]:
# Predicted class labels for the train and test splits (train is scored first).
train_preds, test_preds = (
    super_xgb.predict(feature_matrix)
    for feature_matrix in (X_train_preproc, X_test_preproc)
)

In [26]:
def _second_column_proba(feature_matrix):
    """Return column index 1 of the model's uncalibrated `predict_proba` output."""
    return super_xgb.predict_proba(feature_matrix)[:, 1]


train_probas = _second_column_proba(X_train_preproc)
test_probas = _second_column_proba(X_test_preproc)
cal_probas = _second_column_proba(X_cal_preproc)

In [27]:
# Fit the probability calibration step on the wrapper.
# NOTE(review): presumably uses the X_cal / y_cal passed at construction — confirm in SuperXGBClassifier.
super_xgb.calibrate()

In [28]:
# Calibrated probabilities for train and test.  Unlike the raw probabilities,
# the result is used as returned (no column selection).
# NOTE(review): presumably `calibrate=True` yields a 1-D array of
# positive-class probabilities — confirm in SuperXGBClassifier.
train_cal_probas, test_cal_probas = (
    super_xgb.predict_proba(feature_matrix, calibrate=True)
    for feature_matrix in (X_train_preproc, X_test_preproc)
)

Check Average Predictions

In [29]:
# Training split: mean raw probability, observed response rate, and mean
# calibrated probability.  The three values should be close to each other.
train_probas.mean(), training_data[RESPONSE].mean(), train_cal_probas.mean()

(0.36394462, 0.36408730158730157, 0.36599968725184856)

In [30]:
# Test split: mean raw probability, observed response rate, and mean
# calibrated probability.  The three values should be close to each other.
test_probas.mean(), test_data[RESPONSE].mean(), test_cal_probas.mean()

(0.36590478, 0.3576719576719577, 0.36788082262693556)

Export model

In [31]:
# Persist the fitted wrapper under the versioned model file name.
super_xgb.export_model(f"{model_output_path}/{model_file_name}.joblib")

In [34]:
# Show the full path the model was written to.
f"{model_output_path}/{model_file_name}.joblib"

'/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-score-model/model_outputs/models/expected_behind_open_v8.joblib'

In [35]:
# Round-trip check: the exported file can be loaded back.
# joblib files are pickle-based, so only load artefacts from trusted sources.
joblib.load(f"{model_output_path}/{model_file_name}.joblib")

<expected_score_model.domain.modelling.supermodel.SuperXGBClassifier at 0x14dca3510>

Export data and predictions

In [32]:
# Identifier columns plus the observed response, re-indexed from 0 so they line
# up row-for-row with the re-indexed feature matrix.
train_info = training_data[['Match_ID', "Chain_Number", "Order", RESPONSE]].reset_index(drop = True)

# Identifiers | features | class prediction | raw and calibrated probability.
train_export = pd.concat(
    [train_info, X_train_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=train_preds,
    xgb_probas=train_probas,
    xgb_probas_cal=train_cal_probas,
)

train_predictions_csv = prediction_output_path + 'train_predictions_' + model_file_name + '.csv'
train_export.to_csv(train_predictions_csv, index = False)
train_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Behind,x0,y0,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,1,11.0,0,26.0,-21.0,8.246211,56.0803,0.383818,0.106164,0,0.42105,0.412663
1,202101_BrisbaneLions_Sydney,10,110.0,0,50.0,24.0,4.0,36.878178,0.708626,0.132764,0,0.365892,0.373314
2,202101_BrisbaneLions_Sydney,21,222.0,0,34.0,15.0,15.132746,46.486557,0.328553,0.13093,0,0.375845,0.380724
3,202101_BrisbaneLions_Sydney,38,392.0,0,40.0,-11.0,4.123106,39.560081,0.281772,0.156423,0,0.372426,0.378194
4,202101_BrisbaneLions_Sydney,46,476.0,0,56.0,6.0,15.132746,22.803509,0.266252,0.276208,0,0.285732,0.308165


In [33]:
# Identifier columns plus the observed response, re-indexed from 0 so they line
# up row-for-row with the re-indexed feature matrix.
test_info = test_data[['Match_ID', "Chain_Number", "Order", RESPONSE]].reset_index(drop = True)

# Identifiers | features | class prediction | raw and calibrated probability.
test_export = pd.concat(
    [test_info, X_test_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=test_preds,
    xgb_probas=test_probas,
    xgb_probas_cal=test_cal_probas,
)

test_predictions_csv = prediction_output_path + 'test_predictions_' + model_file_name + '.csv'
test_export.to_csv(test_predictions_csv, index = False)
test_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Behind,x0,y0,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,6,61.0,0,35.0,19.0,7.0,47.010637,0.416065,0.125105,0,0.349093,0.360482
1,202101_BrisbaneLions_Sydney,43,444.0,1,27.0,31.0,3.162278,59.682493,0.546167,0.091898,0,0.498217,0.46129
2,202101_BrisbaneLions_Sydney,83,781.0,0,46.0,2.0,5.0,32.062439,0.062419,0.201226,0,0.331838,0.346862
3,202101_BrisbaneLions_Sydney,91,863.0,0,54.0,-39.0,7.81025,45.793013,1.019141,0.073607,0,0.379578,0.383467
4,202101_BrisbaneLions_Sydney,161,1422.0,0,38.0,-37.0,9.219544,54.488531,0.746457,0.086523,0,0.301577,0.321854
