Open Play Shots - Expected Goal Model - Model Tuning and Building - GBM

In [1]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides warnings that may matter while modelling —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

from expected_score_model.config import goal_open_shots_file_path
from expected_score_model.domain.contracts.modelling_data_contract import ModellingDataContract
from expected_score_model.domain.modelling.hyperparameter_tuning import XGBHyperparameterTuner
from expected_score_model.domain.modelling.supermodel import SuperXGBClassifier
from expected_score_model.domain.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Raise pandas' display limits so wide modelling frames are shown in full.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

# Reload imported modules automatically before each cell executes, so edits to
# the local expected_score_model package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

Model Version

In [2]:
# Model identity: the version number and base name are combined into the
# artefact file stem, e.g. "expected_goal_open_v9".
model_version = 9
model_name = 'expected_goal_open'
model_file_name = f"{model_name}_v{model_version}"

# Root of the local repository checkout. Defaults to the original hard-coded
# location; set EXPECTED_SCORE_MODEL_ROOT to run the notebook on another machine
# without editing this cell.
project_root = os.environ.get(
    "EXPECTED_SCORE_MODEL_ROOT",
    "/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-score-model",
).rstrip("/")
model_outputs_root = project_root + "/model_outputs"

# NOTE: model_output_path deliberately has no trailing slash (the export cell
# inserts "/" itself), whereas prediction_output_path ends in "/" because file
# names are appended to it directly.
model_output_path = model_outputs_root + "/models"
prediction_output_path = model_outputs_root + "/predictions/"

preprocessor_file_name = f"preprocessor_v{model_version}"
preprocessor_output_path = model_outputs_root + "/preprocessors/"

In [3]:
# Name of the target column. It is also used as the prefix of the split-flag
# columns (RESPONSE + "TrainingSet" / "TestSet" / "ValidationSet").
RESPONSE = ModellingDataContract.RESPONSE_GOAL

In [5]:
# Column names used to subset the train / calibration / test frames into model inputs.
FEATURES = ModellingDataContract.feature_list_open_goal

In [6]:
# Monotonicity constraints, passed to both the hyperparameter tuner and the
# final training parameters.
MONOTONE_CONSTRAINTS = ModellingDataContract.monotone_constraints_open_goal

Load Data

In [7]:
# Load the open-play shots modelling table from the path configured in
# expected_score_model.config.
df_modelling = pd.read_csv(goal_open_shots_file_path)
# Preview the last rows as a quick check on the load.
df_modelling.tail()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Behind,Miss,Score,Event_Type1,Set_Shot,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action,GoalTrainingSet,GoalTestSet,GoalValidationSet
14170,166,ballUp,behind,1260.0,3,1299,1303.0,Collingwood,Collingwood,Mason Cox,Mason_Cox,Kick,73.0,-9.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Handball Received,False,73.0,73.0,63.0,60.0,0.0,11.401754,0.909753,0.374081,True,False,False,False,False,7.0,9.0,4.0,1.0,False,False,True
14171,184,possGain,behind,1430.0,4,122,156.0,Collingwood,Collingwood,Nick Daicos,Nick_Daicos,Kick,72.0,24.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Gather,False,72.0,70.0,52.0,50.0,2.0,25.298221,1.249046,0.081301,False,False,False,True,False,8.0,24.0,34.0,2.0,True,False,False
14172,207,possGain,behind,1587.0,4,744,746.0,Brisbane Lions,Brisbane Lions,Keidean Coleman,Keidean_Coleman,Kick,26.0,-1.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Handball Received,False,26.0,21.0,19.0,19.0,7.071068,54.009258,0.018516,0.118895,False,False,False,True,False,54.0,1.0,2.0,0.0,True,False,False
14173,221,ballUp,goal,1718.0,4,1109,1128.0,Brisbane Lions,Brisbane Lions,Charlie Cameron,Charlie_Cameron,Kick,73.0,6.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Hard Ball Get,False,73.0,73.0,67.0,67.0,2.0,9.219544,0.708626,0.599251,True,False,False,False,False,7.0,6.0,19.0,1.0,True,False,False
14174,222,centreBounce,goal,1732.0,4,1202,1219.0,Collingwood,Collingwood,Jordan De Goey,Jordan_De_Goey,Kick,32.0,8.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Handball Received,False,32.0,26.0,26.0,27.0,7.211103,48.6621,0.165149,0.130293,False,True,False,False,False,48.0,8.0,17.0,4.0,True,False,False


In [8]:
# One-hot encode the chain's Initial_State into a fixed set of boolean columns.
# The column list is explicit so the feature schema does not depend on which
# states happen to occur in the loaded data: a state that is absent becomes an
# all-False column, and a state outside the list fails with a clear message
# instead of an opaque column-length mismatch.
INITIAL_STATE_COLUMNS = ['ballUp', 'centreBounce', 'kickIn', 'possGain', 'throwIn']

initial_state_dummies = pd.get_dummies(df_modelling['Initial_State'], dtype=bool)
unexpected_states = [
    state for state in initial_state_dummies.columns
    if state not in INITIAL_STATE_COLUMNS
]
if unexpected_states:
    raise ValueError(f"Unexpected Initial_State values: {unexpected_states}")

# Reindex so the dummy columns line up with INITIAL_STATE_COLUMNS by name
# before the (positional) multi-column assignment.
df_modelling[INITIAL_STATE_COLUMNS] = initial_state_dummies.reindex(
    columns=INITIAL_STATE_COLUMNS, fill_value=False
)

In [9]:
# Partition the rows with the precomputed boolean split-flag columns for this
# response. The "ValidationSet" rows are the ones used as calibration data.
training_data, test_data, cal_data = (
    df_modelling.loc[df_modelling[RESPONSE + flag_suffix]]
    for flag_suffix in ("TrainingSet", "TestSet", "ValidationSet")
)

In [10]:
# Separate the response column from the remaining columns, split by split.
X_train = training_data.drop(columns=[RESPONSE])
y_train = training_data[RESPONSE]

X_test = test_data.drop(columns=[RESPONSE])
y_test = test_data[RESPONSE]

X_cal = cal_data.drop(columns=[RESPONSE])
y_cal = cal_data[RESPONSE]

In [11]:
# Mean of the response in the train, test and calibration splits (in that order).
tuple(target.mean() for target in (y_train, y_test, y_cal))

(0.4289021164021164, 0.4264550264550265, 0.414021164021164)

In [12]:
# Preview the test rows (all columns except the response).
X_test.head()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Behind,Miss,Score,Event_Type1,Set_Shot,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action,GoalTrainingSet,GoalTestSet,GoalValidationSet
1,6,possGain,goal,61.0,1,149,168.0,Brisbane Lions,Brisbane Lions,Zac Bailey,Zac_Bailey,Kick,35.0,19.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,6,Handball Received,False,35.0,28.0,25.0,22.0,7.0,47.010637,0.416065,0.125105,False,False,False,True,False,43.0,19.0,19.0,1.0,False,True,False
5,43,possGain,behind,444.0,1,1272,1317.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,27.0,31.0,ineffective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,1,0,1,Handball Received,False,27.0,28.0,34.0,35.0,3.162278,59.682493,0.546167,0.091898,False,False,False,True,False,51.0,31.0,45.0,1.0,False,True,False
9,83,throwIn,goal,781.0,2,848,861.0,Brisbane Lions,Brisbane Lions,Jarryd Lyons,Jarryd_Lyons,Kick,46.0,2.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,6,Gather,False,46.0,41.0,35.0,30.0,5.0,32.062439,0.062419,0.201226,False,False,False,False,True,32.0,2.0,13.0,0.0,False,True,False
11,91,possGain,miss,863.0,2,1223,1226.0,Brisbane Lions,Brisbane Lions,Zac Bailey,Zac_Bailey,Kick,54.0,-39.0,clanger,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,1,0,Hard Ball Get Crumb,False,54.0,49.0,-49.0,-26.0,7.81025,45.793013,1.019141,0.073607,False,False,False,True,False,24.0,39.0,3.0,3.0,False,True,False
19,161,possGain,goal,1422.0,3,1570,1575.0,Brisbane Lions,Brisbane Lions,Jarrod Berry,Jarrod_Berry,Kick,38.0,-37.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,6,Handball Received,False,38.0,36.0,33.0,30.0,9.219544,54.488531,0.746457,0.086523,False,False,False,True,False,40.0,37.0,5.0,2.0,False,True,False


Preprocess Data

In [13]:
# NOTE: this preprocessing step is currently disabled — the next cell selects
# FEATURES directly from the raw frames. DataPreprocessor is not among the
# imports at the top of this notebook, so it would need importing (and X_cal
# would need transforming too) before this cell is re-enabled.
# preprocessor = DataPreprocessor()
# preprocessor.fit(X_train)

# X_train_preproc = preprocessor.transform(X_train)
# X_test_preproc = preprocessor.transform(X_test)

In [14]:
# Keep only the model feature columns for each split (train, calibration, test).
X_train_preproc, X_cal_preproc, X_test_preproc = (
    frame[FEATURES] for frame in (X_train, X_cal, X_test)
)

In [15]:
# (rows, columns) of the train, calibration and test feature frames.
X_train_preproc.shape, X_cal_preproc.shape, X_test_preproc.shape

((9072, 17), (2268, 17), (2835, 17))

In [16]:
# Preview the training feature frame.
X_train_preproc.head()

Unnamed: 0,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action
0,26.0,18.0,11.0,6.0,8.246211,56.0803,0.383818,0.106164,False,True,False,False,False,52.0,21.0,26.0,1.0
2,50.0,50.0,50.0,-8.0,4.0,36.878178,0.708626,0.132764,False,False,False,True,False,28.0,24.0,21.0,1.0
3,34.0,19.0,10.0,7.0,15.132746,46.486557,0.328553,0.13093,False,False,False,True,False,44.0,15.0,13.0,2.0
4,40.0,36.0,34.0,32.0,4.123106,39.560081,0.281772,0.156423,False,False,False,True,False,38.0,11.0,5.0,3.0
6,56.0,58.0,58.0,58.0,15.132746,22.803509,0.266252,0.276208,False,False,False,True,False,22.0,6.0,6.0,0.0


In [17]:
# Preview the test feature frame.
X_test_preproc.head()

Unnamed: 0,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action
1,35.0,28.0,25.0,22.0,7.0,47.010637,0.416065,0.125105,False,False,False,True,False,43.0,19.0,19.0,1.0
5,27.0,28.0,34.0,35.0,3.162278,59.682493,0.546167,0.091898,False,False,False,True,False,51.0,31.0,45.0,1.0
9,46.0,41.0,35.0,30.0,5.0,32.062439,0.062419,0.201226,False,False,False,False,True,32.0,2.0,13.0,0.0
11,54.0,49.0,-49.0,-26.0,7.81025,45.793013,1.019141,0.073607,False,False,False,True,False,24.0,39.0,3.0,3.0
19,38.0,36.0,33.0,30.0,9.219544,54.488531,0.746457,0.086523,False,False,False,True,False,40.0,37.0,5.0,2.0


Optuna Hyperparameter Tuning Class - HyperParameterTuner & XGBHyperparameterTuner

In [18]:
# Set up the tuner on the training split, passing the monotonicity constraints through.
# NOTE(review): no random seed is passed here — confirm whether the tuner seeds
# its sampler; otherwise the selected hyperparameters will differ between runs.
xgb_tuner = XGBHyperparameterTuner(X_train_preproc, y_train, monotonicity_constraints=MONOTONE_CONSTRAINTS)

In [19]:
# Check that the tuner holds the expected training features.
xgb_tuner.training_data.head()

Unnamed: 0,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action
0,26.0,18.0,11.0,6.0,8.246211,56.0803,0.383818,0.106164,False,True,False,False,False,52.0,21.0,26.0,1.0
2,50.0,50.0,50.0,-8.0,4.0,36.878178,0.708626,0.132764,False,False,False,True,False,28.0,24.0,21.0,1.0
3,34.0,19.0,10.0,7.0,15.132746,46.486557,0.328553,0.13093,False,False,False,True,False,44.0,15.0,13.0,2.0
4,40.0,36.0,34.0,32.0,4.123106,39.560081,0.281772,0.156423,False,False,False,True,False,38.0,11.0,5.0,3.0
6,56.0,58.0,58.0,58.0,15.132746,22.803509,0.266252,0.276208,False,False,False,True,False,22.0,6.0,6.0,0.0


In [20]:
# Run the Optuna study. Per the saved output this finished 1000 trials and the
# call returns the optuna Study object, which is echoed as the cell result.
xgb_tuner.tune_hyperparameters()

[I 2023-10-21 13:21:55,018] A new study created in memory with name: no-name-45a5b44f-7fab-4108-82ec-c0fb363288ae
[I 2023-10-21 13:21:55,097] Trial 0 finished with value: 0.6517230023421061 and parameters: {'max_depth': 8, 'min_child_weight': 31, 'eta': 0.06515578539221199, 'gamma': 0.032465421664986824, 'lambda': 1.1284728574167582, 'alpha': 0.19292630862677493, 'subsample': 0.855277152280477, 'colsample_bytree': 0.7573791081444361}. Best is trial 0 with value: 0.6517230023421061.
[I 2023-10-21 13:21:55,130] Trial 1 finished with value: 0.6323993969332977 and parameters: {'max_depth': 4, 'min_child_weight': 50, 'eta': 0.22846317665243876, 'gamma': 0.00016419746268592607, 'lambda': 0.007702990169377534, 'alpha': 0.00016204811448761422, 'subsample': 0.736869034944625, 'colsample_bytree': 0.4889353637141221}. Best is trial 1 with value: 0.6323993969332977.
[I 2023-10-21 13:21:55,164] Trial 2 finished with value: 0.6810224181308127 and parameters: {'max_depth': 15, 'min_child_weight': 82,

Number of finished trials:  1000
Best trial:
  Value: 0.605096265757997
  Params: 
    max_depth: 6
    min_child_weight: 29
    eta: 0.32354218461894785
    gamma: 0.2808185453014687
    lambda: 0.006600220301190592
    alpha: 0.027044331526577953
    subsample: 0.5790549143868143
    colsample_bytree: 0.6669317029720887


<optuna.study.study.Study at 0x10f733b50>

In [21]:
# Best hyperparameters found by the study; this dict is extended in place with
# the training settings before the model is built.
params = xgb_tuner.get_best_params()
params

{'max_depth': 6,
 'min_child_weight': 29,
 'eta': 0.32354218461894785,
 'gamma': 0.2808185453014687,
 'lambda': 0.006600220301190592,
 'alpha': 0.027044331526577953,
 'subsample': 0.5790549143868143,
 'colsample_bytree': 0.6669317029720887}

Training Model - SuperXGBClassifier class for training and predictions

In [22]:
# Add the fixed training settings on top of the tuned hyperparameters.
# NOTE(review): 'num_rounds' and 'early_stopping_rounds' are presumably read by
# SuperXGBClassifier rather than passed straight to xgboost — confirm.
params.update({
    'objective': OptunaXGBParamGrid.error,
    'num_rounds': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 1,
    'monotone_constraints': MONOTONE_CONSTRAINTS,
})

In [23]:
# Build the classifier wrapper with all three splits and the final parameters.
# NOTE(review): the fit log reports two evaluation sets; if the test split is
# one of them and drives early stopping, test metrics will be optimistic — confirm.
super_xgb = SuperXGBClassifier(
    X_train=X_train_preproc,
    y_train=y_train,
    X_test=X_test_preproc,
    y_test=y_test,
    X_cal=X_cal_preproc,
    y_cal=y_cal,
    params=params,
)

In [24]:
# Train the model; the saved log reports logloss on two evaluation sets per boosting round.
super_xgb.fit()

[0]	validation_0-logloss:0.65974	validation_1-logloss:0.66012
[1]	validation_0-logloss:0.64352	validation_1-logloss:0.64352
[2]	validation_0-logloss:0.63184	validation_1-logloss:0.63243
[3]	validation_0-logloss:0.62438	validation_1-logloss:0.62516
[4]	validation_0-logloss:0.61915	validation_1-logloss:0.62082
[5]	validation_0-logloss:0.61550	validation_1-logloss:0.61779
[6]	validation_0-logloss:0.61227	validation_1-logloss:0.61606
[7]	validation_0-logloss:0.60994	validation_1-logloss:0.61382
[8]	validation_0-logloss:0.60876	validation_1-logloss:0.61344
[9]	validation_0-logloss:0.60782	validation_1-logloss:0.61252
[10]	validation_0-logloss:0.60624	validation_1-logloss:0.61217
[11]	validation_0-logloss:0.60399	validation_1-logloss:0.61222
[12]	validation_0-logloss:0.60128	validation_1-logloss:0.61014
[13]	validation_0-logloss:0.60041	validation_1-logloss:0.61094
[14]	validation_0-logloss:0.59930	validation_1-logloss:0.61160
[15]	validation_0-logloss:0.59879	validation_1-logloss:0.61180
[1

In [25]:
# Display the wrapped model object.
super_xgb.xgb_model

In [26]:
# List the feature names stored on the trained booster, to compare against FEATURES.
super_xgb.xgb_model.get_booster().feature_names

['x0',
 'x1',
 'x2',
 'x3',
 'Distance_Since_Last_Action',
 'Distance_to_Middle_Goal',
 'Angle_to_Middle_Goal',
 'Visible_Goal_Angle',
 'ballUp',
 'centreBounce',
 'kickIn',
 'possGain',
 'throwIn',
 'Distance_to_Right_Goal_x',
 'Distance_to_Middle_y',
 'Chain_Duration',
 'Time_Since_Last_Action']

In [27]:
# Class predictions for the train and test splits (exported below as 'xgb_preds').
train_preds = super_xgb.predict(X_train_preproc)
test_preds = super_xgb.predict(X_test_preproc)

In [28]:
# Uncalibrated probabilities: column index 1 of the predict_proba output for each split.
# NOTE(review): cal_probas is not referenced again in the visible cells.
train_probas = super_xgb.predict_proba(X_train_preproc)[:, 1]
test_probas = super_xgb.predict_proba(X_test_preproc)[:, 1]
cal_probas = super_xgb.predict_proba(X_cal_preproc)[:, 1]

In [29]:
# Fit the calibration step.
# NOTE(review): presumably uses the X_cal / y_cal passed at construction — confirm in SuperXGBClassifier.
super_xgb.calibrate()



In [30]:
# Calibrated probabilities for the train and test splits.
# NOTE(review): unlike the uncalibrated call, the result is not sliced with
# [:, 1]; the saved outputs suggest calibrate=True already returns one value per row — confirm.
train_cal_probas = super_xgb.predict_proba(X_train_preproc, calibrate=True)
test_cal_probas = super_xgb.predict_proba(X_test_preproc, calibrate=True)

Check Average Predictions

In [31]:
# Train split: mean uncalibrated probability, mean of the response, mean calibrated probability.
train_probas.mean(), training_data[RESPONSE].mean(), train_cal_probas.mean()

(0.43395114, 0.4289021164021164, 0.41547466255123044)

In [32]:
# Test split: mean uncalibrated probability, mean of the response, mean calibrated probability.
test_probas.mean(), test_data[RESPONSE].mean(), test_cal_probas.mean()

(0.43153793, 0.4264550264550265, 0.4125820339511535)

Export model

In [33]:
# Write the trained model to <model_output_path>/<model_file_name>.joblib.
super_xgb.export_model(f"{model_output_path}/{model_file_name}.joblib")

Export data and predictions

In [34]:
# Assemble the training export: shot identifiers and response, then the model
# features, then the class predictions and raw / calibrated probabilities.
# Both frames are re-indexed from 0 so the column-wise concat lines up row by row.
train_info = training_data[['Match_ID', "Chain_Number", "Order", RESPONSE]].reset_index(drop=True)
train_export = pd.concat(
    [train_info, X_train_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=train_preds,
    xgb_probas=train_probas,
    xgb_probas_cal=train_cal_probas,
)
train_export.to_csv(f"{prediction_output_path}train_predictions_{model_file_name}.csv", index=False)
train_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Goal,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,1,11.0,1,26.0,18.0,11.0,6.0,8.246211,56.0803,0.383818,0.106164,False,True,False,False,False,52.0,21.0,26.0,1.0,0,0.404758,0.374345
1,202101_BrisbaneLions_Sydney,10,110.0,1,50.0,50.0,50.0,-8.0,4.0,36.878178,0.708626,0.132764,False,False,False,True,False,28.0,24.0,21.0,1.0,0,0.374302,0.347603
2,202101_BrisbaneLions_Sydney,21,222.0,1,34.0,19.0,10.0,7.0,15.132746,46.486557,0.328553,0.13093,False,False,False,True,False,44.0,15.0,13.0,2.0,0,0.477122,0.44152
3,202101_BrisbaneLions_Sydney,38,392.0,0,40.0,36.0,34.0,32.0,4.123106,39.560081,0.281772,0.156423,False,False,False,True,False,38.0,11.0,5.0,3.0,0,0.356175,0.332075
4,202101_BrisbaneLions_Sydney,46,476.0,1,56.0,58.0,58.0,58.0,15.132746,22.803509,0.266252,0.276208,False,False,False,True,False,22.0,6.0,6.0,0.0,1,0.699364,0.683023


In [35]:
# Assemble the test export: shot identifiers and response, then the model
# features, then the class predictions and raw / calibrated probabilities.
# Both frames are re-indexed from 0 so the column-wise concat lines up row by row.
test_info = test_data[['Match_ID', "Chain_Number", "Order", RESPONSE]].reset_index(drop=True)
test_export = pd.concat(
    [test_info, X_test_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=test_preds,
    xgb_probas=test_probas,
    xgb_probas_cal=test_cal_probas,
)
test_export.to_csv(f"{prediction_output_path}test_predictions_{model_file_name}.csv", index=False)
test_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Goal,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,6,61.0,1,35.0,28.0,25.0,22.0,7.0,47.010637,0.416065,0.125105,False,False,False,True,False,43.0,19.0,19.0,1.0,0,0.421267,0.389205
1,202101_BrisbaneLions_Sydney,43,444.0,0,27.0,28.0,34.0,35.0,3.162278,59.682493,0.546167,0.091898,False,False,False,True,False,51.0,31.0,45.0,1.0,0,0.280717,0.270125
2,202101_BrisbaneLions_Sydney,83,781.0,1,46.0,41.0,35.0,30.0,5.0,32.062439,0.062419,0.201226,False,False,False,False,True,32.0,2.0,13.0,0.0,1,0.602336,0.571165
3,202101_BrisbaneLions_Sydney,91,863.0,0,54.0,49.0,-49.0,-26.0,7.81025,45.793013,1.019141,0.073607,False,False,False,True,False,24.0,39.0,3.0,3.0,0,0.270196,0.261776
4,202101_BrisbaneLions_Sydney,161,1422.0,1,38.0,36.0,33.0,30.0,9.219544,54.488531,0.746457,0.086523,False,False,False,True,False,40.0,37.0,5.0,2.0,0,0.333455,0.312993
