Open Play Shots - Expected Goal Model - Model Tuning and Building - GBM

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

from expected_score_model.config import goal_open_shots_file_path
from expected_score_model.domain.contracts.modelling_data_contract import ModellingDataContract
from expected_score_model.domain.modelling.hyperparameter_tuning import XGBHyperparameterTuner
from expected_score_model.domain.modelling.supermodel import SuperXGBClassifier
from expected_score_model.domain.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Raise pandas' display limits so the wide modelling frames below render without truncation.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

# Enable IPython's autoreload extension so edits to the imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Model Version

In [2]:
import os

# Identity of the model artefact produced by this notebook. Bump `model_version`
# whenever the features, data or tuning change so earlier artefacts are kept.
model_version = 9
model_name = 'expected_goal_open'
model_file_name = f"{model_name}_v{model_version}"

# Root of the expected-score-model checkout. The default is the original
# author's local path; set EXPECTED_SCORE_MODEL_ROOT to run the notebook from
# another machine without editing this cell.
project_root = os.environ.get(
    "EXPECTED_SCORE_MODEL_ROOT",
    "/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-score-model",
).rstrip("/")

# Output locations. `model_output_path` has no trailing slash (the export cell
# adds its own "/"), while the other two keep theirs because later cells
# concatenate file names directly onto them.
model_output_path = f"{project_root}/model_outputs/models"
prediction_output_path = f"{project_root}/model_outputs/predictions/"

preprocessor_file_name = f"preprocessor_v{model_version}"
preprocessor_output_path = f"{project_root}/model_outputs/preprocessors/"

In [3]:
# Name of the response (target) column. It is also used as the prefix of the
# split-flag columns read below (RESPONSE + "TrainingSet" / "TestSet" / "ValidationSet").
RESPONSE = ModellingDataContract.RESPONSE_GOAL

In [4]:
# Columns used as model inputs; every split is subset to exactly these columns before training.
FEATURES = ModellingDataContract.feature_list_open_goal

In [5]:
# Monotonicity constraints from the data contract. The same object is passed to
# the hyperparameter tuner and to the final model's params.
MONOTONE_CONSTRAINTS = ModellingDataContract.monotone_constraints_open_goal

Load Data

In [6]:
# Read the open-play shot modelling table from the path configured in
# expected_score_model.config, then show the last rows as a quick sanity check.
df_modelling = pd.read_csv(goal_open_shots_file_path)
df_modelling.tail()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Behind,Miss,Score,Event_Type1,Set_Shot,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action,GoalTrainingSet,GoalTestSet,GoalValidationSet
14170,166,ballUp,behind,1260.0,3,1299,1303.0,Collingwood,Collingwood,Mason Cox,Mason_Cox,Kick,73.0,-9.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Handball Received,False,73.0,73.0,63.0,60.0,0.0,11.401754,0.909753,0.374081,True,False,False,False,False,7.0,9.0,4.0,1.0,False,False,True
14171,184,possGain,behind,1430.0,4,122,156.0,Collingwood,Collingwood,Nick Daicos,Nick_Daicos,Kick,72.0,24.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Gather,False,72.0,70.0,52.0,50.0,2.0,25.298221,1.249046,0.081301,False,False,False,True,False,8.0,24.0,34.0,2.0,True,False,False
14172,207,possGain,behind,1587.0,4,744,746.0,Brisbane Lions,Brisbane Lions,Keidean Coleman,Keidean_Coleman,Kick,26.0,-1.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Handball Received,False,26.0,21.0,19.0,19.0,7.071068,54.009258,0.018516,0.118895,False,False,False,True,False,54.0,1.0,2.0,0.0,True,False,False
14173,221,ballUp,goal,1718.0,4,1109,1128.0,Brisbane Lions,Brisbane Lions,Charlie Cameron,Charlie_Cameron,Kick,73.0,6.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Hard Ball Get,False,73.0,73.0,67.0,67.0,2.0,9.219544,0.708626,0.599251,True,False,False,False,False,7.0,6.0,19.0,1.0,True,False,False
14174,222,centreBounce,goal,1732.0,4,1202,1219.0,Collingwood,Collingwood,Jordan De Goey,Jordan_De_Goey,Kick,32.0,8.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Handball Received,False,32.0,26.0,26.0,27.0,7.211103,48.6621,0.165149,0.130293,False,True,False,False,False,48.0,8.0,17.0,4.0,True,False,False


In [7]:
# One-hot encode `Initial_State` into a fixed, explicitly named set of flag columns.
INITIAL_STATE_COLUMNS = ['ballUp', 'centreBounce', 'kickIn', 'possGain', 'throwIn']

initial_state_dummies = pd.get_dummies(df_modelling['Initial_State'])

# Assigning a DataFrame to a list of columns pairs them up by POSITION, so the
# dummies must be validated and put in the expected order by NAME first.
# Otherwise a missing or new state would raise an opaque length error or,
# worse, silently write a state's flags into the wrong column.
unexpected_states = [
    state for state in initial_state_dummies.columns
    if state not in INITIAL_STATE_COLUMNS
]
if unexpected_states:
    raise ValueError(
        f"Unexpected Initial_State values {unexpected_states}; "
        f"expected a subset of {INITIAL_STATE_COLUMNS}"
    )

# States absent from this extract still get a column (all False), so the
# feature set stays stable.
df_modelling[INITIAL_STATE_COLUMNS] = initial_state_dummies.reindex(
    columns=INITIAL_STATE_COLUMNS, fill_value=False
)

In [8]:
# Partition the rows with the boolean split-flag columns stored in the data
# (<RESPONSE>TrainingSet / <RESPONSE>TestSet / <RESPONSE>ValidationSet).
is_training_row = df_modelling[RESPONSE + "TrainingSet"]
is_test_row = df_modelling[RESPONSE + "TestSet"]
is_calibration_row = df_modelling[RESPONSE + "ValidationSet"]

training_data = df_modelling.loc[is_training_row]
test_data = df_modelling.loc[is_test_row]
cal_data = df_modelling.loc[is_calibration_row]

In [9]:
# Separate the response column from the remaining columns for each split.
y_train = training_data[RESPONSE]
X_train = training_data.drop(columns=[RESPONSE])

y_test = test_data[RESPONSE]
X_test = test_data.drop(columns=[RESPONSE])

y_cal = cal_data[RESPONSE]
X_cal = cal_data.drop(columns=[RESPONSE])

In [10]:
# Mean of the response in each split (train, test, calibration), to check the splits are comparable.
tuple(response.mean() for response in (y_train, y_test, y_cal))

(0.4289021164021164, 0.4264550264550265, 0.414021164021164)

In [11]:
# Peek at the test split after the response column has been dropped.
X_test.head()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Behind,Miss,Score,Event_Type1,Set_Shot,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action,GoalTrainingSet,GoalTestSet,GoalValidationSet
1,6,possGain,goal,61.0,1,149,168.0,Brisbane Lions,Brisbane Lions,Zac Bailey,Zac_Bailey,Kick,35.0,19.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,6,Handball Received,False,35.0,28.0,25.0,22.0,7.0,47.010637,0.416065,0.125105,False,False,False,True,False,43.0,19.0,19.0,1.0,False,True,False
5,43,possGain,behind,444.0,1,1272,1317.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,27.0,31.0,ineffective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,1,0,1,Handball Received,False,27.0,28.0,34.0,35.0,3.162278,59.682493,0.546167,0.091898,False,False,False,True,False,51.0,31.0,45.0,1.0,False,True,False
9,83,throwIn,goal,781.0,2,848,861.0,Brisbane Lions,Brisbane Lions,Jarryd Lyons,Jarryd_Lyons,Kick,46.0,2.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,6,Gather,False,46.0,41.0,35.0,30.0,5.0,32.062439,0.062419,0.201226,False,False,False,False,True,32.0,2.0,13.0,0.0,False,True,False
11,91,possGain,miss,863.0,2,1223,1226.0,Brisbane Lions,Brisbane Lions,Zac Bailey,Zac_Bailey,Kick,54.0,-39.0,clanger,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,1,0,Hard Ball Get Crumb,False,54.0,49.0,-49.0,-26.0,7.81025,45.793013,1.019141,0.073607,False,False,False,True,False,24.0,39.0,3.0,3.0,False,True,False
19,161,possGain,goal,1422.0,3,1570,1575.0,Brisbane Lions,Brisbane Lions,Jarrod Berry,Jarrod_Berry,Kick,38.0,-37.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,6,Handball Received,False,38.0,36.0,33.0,30.0,9.219544,54.488531,0.746457,0.086523,False,False,False,True,False,40.0,37.0,5.0,2.0,False,True,False


Preprocess Data

In [12]:
# NOTE: this preprocessing step is disabled. `DataPreprocessor` is not imported
# in this notebook, and the next cell selects the FEATURES columns directly
# from the raw splits instead. The disabled code is kept for reference; it does
# not transform the calibration split.
# preprocessor = DataPreprocessor()
# preprocessor.fit(X_train)

# X_train_preproc = preprocessor.transform(X_train)
# X_test_preproc = preprocessor.transform(X_test)

In [13]:
# "Preprocessing" is currently just column selection: keep only the contract's FEATURES in each split.
X_train_preproc, X_cal_preproc, X_test_preproc = (
    split[FEATURES] for split in (X_train, X_cal, X_test)
)

In [14]:
# (rows, columns) of the train, calibration and test feature frames, in that order.
tuple(split.shape for split in (X_train_preproc, X_cal_preproc, X_test_preproc))

((9072, 17), (2268, 17), (2835, 17))

In [15]:
# First rows of the training features after column selection.
X_train_preproc.head()

Unnamed: 0,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action
0,26.0,18.0,11.0,6.0,8.246211,56.0803,0.383818,0.106164,False,True,False,False,False,52.0,21.0,26.0,1.0
2,50.0,50.0,50.0,-8.0,4.0,36.878178,0.708626,0.132764,False,False,False,True,False,28.0,24.0,21.0,1.0
3,34.0,19.0,10.0,7.0,15.132746,46.486557,0.328553,0.13093,False,False,False,True,False,44.0,15.0,13.0,2.0
4,40.0,36.0,34.0,32.0,4.123106,39.560081,0.281772,0.156423,False,False,False,True,False,38.0,11.0,5.0,3.0
6,56.0,58.0,58.0,58.0,15.132746,22.803509,0.266252,0.276208,False,False,False,True,False,22.0,6.0,6.0,0.0


In [16]:
# First rows of the test features after column selection.
X_test_preproc.head()

Unnamed: 0,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action
1,35.0,28.0,25.0,22.0,7.0,47.010637,0.416065,0.125105,False,False,False,True,False,43.0,19.0,19.0,1.0
5,27.0,28.0,34.0,35.0,3.162278,59.682493,0.546167,0.091898,False,False,False,True,False,51.0,31.0,45.0,1.0
9,46.0,41.0,35.0,30.0,5.0,32.062439,0.062419,0.201226,False,False,False,False,True,32.0,2.0,13.0,0.0
11,54.0,49.0,-49.0,-26.0,7.81025,45.793013,1.019141,0.073607,False,False,False,True,False,24.0,39.0,3.0,3.0
19,38.0,36.0,33.0,30.0,9.219544,54.488531,0.746457,0.086523,False,False,False,True,False,40.0,37.0,5.0,2.0


Optuna Hyperparameter Tuning Class - HyperParameterTuner & XGBHyperparameterTuner

In [17]:
# Build the tuner from the training split only, passing the same monotone
# constraints that are later applied to the final model.
xgb_tuner = XGBHyperparameterTuner(X_train_preproc, y_train, monotonicity_constraints=MONOTONE_CONSTRAINTS)

In [18]:
# Check the feature frame the tuner is holding.
xgb_tuner.training_data.head()

Unnamed: 0,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action
0,26.0,18.0,11.0,6.0,8.246211,56.0803,0.383818,0.106164,False,True,False,False,False,52.0,21.0,26.0,1.0
2,50.0,50.0,50.0,-8.0,4.0,36.878178,0.708626,0.132764,False,False,False,True,False,28.0,24.0,21.0,1.0
3,34.0,19.0,10.0,7.0,15.132746,46.486557,0.328553,0.13093,False,False,False,True,False,44.0,15.0,13.0,2.0
4,40.0,36.0,34.0,32.0,4.123106,39.560081,0.281772,0.156423,False,False,False,True,False,38.0,11.0,5.0,3.0
6,56.0,58.0,58.0,58.0,15.132746,22.803509,0.266252,0.276208,False,False,False,True,False,22.0,6.0,6.0,0.0


In [19]:
# Run the Optuna search. The return value (an optuna Study) is echoed as the cell output.
# NOTE(review): no random seed is set anywhere in this notebook, so the best
# params may differ between runs unless the tuner seeds its sampler internally — confirm.
xgb_tuner.tune_hyperparameters()

[I 2023-10-21 14:32:41,542] A new study created in memory with name: no-name-9c972ff1-9657-41ac-b78b-60a751cf3f7c
[I 2023-10-21 14:32:41,603] Trial 0 finished with value: 0.6734503246821664 and parameters: {'max_depth': 10, 'min_child_weight': 33, 'eta': 0.011543587316964681, 'gamma': 0.008651012068461247, 'lambda': 0.17748453880110593, 'alpha': 0.15251209627151163, 'subsample': 0.6442561361244927, 'colsample_bytree': 0.2546823505320681}. Best is trial 0 with value: 0.6734503246821664.
[I 2023-10-21 14:32:41,642] Trial 1 finished with value: 0.6353812980033079 and parameters: {'max_depth': 16, 'min_child_weight': 40, 'eta': 0.17399856329323762, 'gamma': 0.3444598826995078, 'lambda': 2.1193558504564405, 'alpha': 1.0884255435240684, 'subsample': 0.39616433215202485, 'colsample_bytree': 0.8428297020312716}. Best is trial 1 with value: 0.6353812980033079.
[I 2023-10-21 14:32:41,670] Trial 2 finished with value: 0.6226176333652137 and parameters: {'max_depth': 7, 'min_child_weight': 71, 'et

Number of finished trials:  1000
Best trial:
  Value: 0.6006376640847529
  Params: 
    max_depth: 20
    min_child_weight: 62
    eta: 0.39188914113492584
    gamma: 0.08158495872453035
    lambda: 0.03798260286986489
    alpha: 0.14077783642820885
    subsample: 0.8723151141243798
    colsample_bytree: 0.7025338715571116


<optuna.study.study.Study at 0x15204d910>

In [20]:
# Fetch the best hyperparameters found by the search; the training section
# below adds the fixed settings to this dict.
params = xgb_tuner.get_best_params()
params

{'max_depth': 20,
 'min_child_weight': 62,
 'eta': 0.39188914113492584,
 'gamma': 0.08158495872453035,
 'lambda': 0.03798260286986489,
 'alpha': 0.14077783642820885,
 'subsample': 0.8723151141243798,
 'colsample_bytree': 0.7025338715571116}

Training Model - SuperXGBClassifier class for training and predictions

In [21]:
# Add the fixed (non-tuned) training settings on top of the tuned hyperparameters.
params.update({
    'objective': OptunaXGBParamGrid.error,
    'num_rounds': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 1,
    'monotone_constraints': MONOTONE_CONSTRAINTS,
})

In [22]:
# Bundle the three feature/response splits with the final params and hand them
# to the classifier wrapper.
classifier_inputs = dict(
    X_train=X_train_preproc,
    y_train=y_train,
    X_test=X_test_preproc,
    y_test=y_test,
    X_cal=X_cal_preproc,
    y_cal=y_cal,
    params=params,
)
super_xgb = SuperXGBClassifier(**classifier_inputs)

In [23]:
# Train the model. The log prints logloss for two evaluation sets
# (validation_0 and validation_1).
# NOTE(review): which splits those are, and which one drives early stopping, is
# decided inside SuperXGBClassifier.fit. Confirm the test split is not used
# for early stopping.
super_xgb.fit()

[0]	validation_0-logloss:0.64926	validation_1-logloss:0.64853
[1]	validation_0-logloss:0.63320	validation_1-logloss:0.63381
[2]	validation_0-logloss:0.62566	validation_1-logloss:0.62580
[3]	validation_0-logloss:0.61964	validation_1-logloss:0.61905
[4]	validation_0-logloss:0.61569	validation_1-logloss:0.61707
[5]	validation_0-logloss:0.61145	validation_1-logloss:0.61479
[6]	validation_0-logloss:0.60829	validation_1-logloss:0.61353
[7]	validation_0-logloss:0.60633	validation_1-logloss:0.61181
[8]	validation_0-logloss:0.60461	validation_1-logloss:0.61249
[9]	validation_0-logloss:0.60372	validation_1-logloss:0.61165
[10]	validation_0-logloss:0.60163	validation_1-logloss:0.61081
[11]	validation_0-logloss:0.60067	validation_1-logloss:0.61143
[12]	validation_0-logloss:0.59723	validation_1-logloss:0.61091
[13]	validation_0-logloss:0.59554	validation_1-logloss:0.61195
[14]	validation_0-logloss:0.59276	validation_1-logloss:0.61171
[15]	validation_0-logloss:0.59158	validation_1-logloss:0.61143
[1

In [24]:
# Display the fitted underlying XGBoost model object.
super_xgb.xgb_model

In [25]:
# Feature names recorded by the booster, to check against the columns passed in for training.
super_xgb.xgb_model.get_booster().feature_names

['x0',
 'x1',
 'x2',
 'x3',
 'Distance_Since_Last_Action',
 'Distance_to_Middle_Goal',
 'Angle_to_Middle_Goal',
 'Visible_Goal_Angle',
 'ballUp',
 'centreBounce',
 'kickIn',
 'possGain',
 'throwIn',
 'Distance_to_Right_Goal_x',
 'Distance_to_Middle_y',
 'Chain_Duration',
 'Time_Since_Last_Action']

In [26]:
# Class predictions for the train and test splits (in that order).
train_preds, test_preds = (
    super_xgb.predict(split) for split in (X_train_preproc, X_test_preproc)
)

In [27]:
# Uncalibrated probabilities: take column 1 of predict_proba for the train,
# test and calibration splits (in that order).
train_probas, test_probas, cal_probas = (
    super_xgb.predict_proba(split)[:, 1]
    for split in (X_train_preproc, X_test_preproc, X_cal_preproc)
)

In [28]:
# Fit the probability calibrator.
# NOTE(review): presumably this uses the X_cal / y_cal split supplied at
# construction — confirm in SuperXGBClassifier.calibrate.
super_xgb.calibrate()

In [29]:
# Calibrated probabilities for the train and test splits. Unlike the raw
# probabilities, no column is selected from the result here.
train_cal_probas, test_cal_probas = (
    super_xgb.predict_proba(split, calibrate=True)
    for split in (X_train_preproc, X_test_preproc)
)

Check Average Predictions

In [30]:
# Training split: mean raw probability, observed response rate, mean calibrated probability.
# NOTE(review): in the saved run the calibrated mean (~0.413) sits below the
# observed rate (~0.429) and close to the calibration split's rate (~0.414).
# Check whether calibration is pulling predictions toward the calibration-set base rate.
train_probas.mean(), training_data[RESPONSE].mean(), train_cal_probas.mean()

(0.4313068, 0.4289021164021164, 0.41319621083170827)

In [31]:
# Test split: mean raw probability, observed response rate, mean calibrated probability.
# NOTE(review): in the saved run the calibrated mean (~0.409) is further from
# the observed rate (~0.426) than the raw mean (~0.427) is.
test_probas.mean(), test_data[RESPONSE].mean(), test_cal_probas.mean()

(0.42717725, 0.4264550264550265, 0.4085943366568256)

Export model

In [32]:
# Persist the fitted model as <model_output_path>/<model_file_name>.joblib.
model_artifact_path = f"{model_output_path}/{model_file_name}.joblib"
super_xgb.export_model(model_artifact_path)

Export data and predictions

In [33]:
# Build the training export: identifying columns + response, the model
# features, then the class predictions and raw/calibrated probabilities.
# Both frames are re-indexed from 0 so they line up row-for-row when joined.
train_info = training_data[['Match_ID', "Chain_Number", "Order", RESPONSE]].reset_index(drop=True)
train_export = pd.concat(
    [train_info, X_train_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=train_preds,
    xgb_probas=train_probas,
    xgb_probas_cal=train_cal_probas,
)
train_export.to_csv(
    f"{prediction_output_path}train_predictions_{model_file_name}.csv",
    index=False,
)
train_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Goal,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,1,11.0,1,26.0,18.0,11.0,6.0,8.246211,56.0803,0.383818,0.106164,False,True,False,False,False,52.0,21.0,26.0,1.0,0,0.348942,0.327113
1,202101_BrisbaneLions_Sydney,10,110.0,1,50.0,50.0,50.0,-8.0,4.0,36.878178,0.708626,0.132764,False,False,False,True,False,28.0,24.0,21.0,1.0,0,0.374002,0.348274
2,202101_BrisbaneLions_Sydney,21,222.0,1,34.0,19.0,10.0,7.0,15.132746,46.486557,0.328553,0.13093,False,False,False,True,False,44.0,15.0,13.0,2.0,0,0.376332,0.350268
3,202101_BrisbaneLions_Sydney,38,392.0,0,40.0,36.0,34.0,32.0,4.123106,39.560081,0.281772,0.156423,False,False,False,True,False,38.0,11.0,5.0,3.0,0,0.381175,0.354425
4,202101_BrisbaneLions_Sydney,46,476.0,1,56.0,58.0,58.0,58.0,15.132746,22.803509,0.266252,0.276208,False,False,False,True,False,22.0,6.0,6.0,0.0,1,0.783832,0.780589


In [34]:
# Build the test export: identifying columns + response, the model features,
# then the class predictions and raw/calibrated probabilities.
# Both frames are re-indexed from 0 so they line up row-for-row when joined.
test_info = test_data[['Match_ID', "Chain_Number", "Order", RESPONSE]].reset_index(drop=True)
test_export = pd.concat(
    [test_info, X_test_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=test_preds,
    xgb_probas=test_probas,
    xgb_probas_cal=test_cal_probas,
)
test_export.to_csv(
    f"{prediction_output_path}test_predictions_{model_file_name}.csv",
    index=False,
)
test_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Goal,x0,x1,x2,x3,Distance_Since_Last_Action,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,Distance_to_Right_Goal_x,Distance_to_Middle_y,Chain_Duration,Time_Since_Last_Action,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,6,61.0,1,35.0,28.0,25.0,22.0,7.0,47.010637,0.416065,0.125105,False,False,False,True,False,43.0,19.0,19.0,1.0,0,0.412656,0.38194
1,202101_BrisbaneLions_Sydney,43,444.0,0,27.0,28.0,34.0,35.0,3.162278,59.682493,0.546167,0.091898,False,False,False,True,False,51.0,31.0,45.0,1.0,0,0.22217,0.226069
2,202101_BrisbaneLions_Sydney,83,781.0,1,46.0,41.0,35.0,30.0,5.0,32.062439,0.062419,0.201226,False,False,False,False,True,32.0,2.0,13.0,0.0,0,0.472046,0.436369
3,202101_BrisbaneLions_Sydney,91,863.0,0,54.0,49.0,-49.0,-26.0,7.81025,45.793013,1.019141,0.073607,False,False,False,True,False,24.0,39.0,3.0,3.0,0,0.280513,0.271566
4,202101_BrisbaneLions_Sydney,161,1422.0,1,38.0,36.0,33.0,30.0,9.219544,54.488531,0.746457,0.086523,False,False,False,True,False,40.0,37.0,5.0,2.0,0,0.381376,0.354597
