Set Shots - Expected Miss Model - Model Tuning and Building - GBM

In [1]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

from expected_score_model.config import miss_set_shots_file_path
from expected_score_model.domain.contracts.modelling_data_contract import ModellingDataContract
from expected_score_model.domain.modelling.hyperparameter_tuning import XGBHyperparameterTuner
from expected_score_model.domain.modelling.supermodel import SuperXGBClassifier
from expected_score_model.domain.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Widen pandas' display limits so the wide modelling frames are shown in full.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

%load_ext autoreload
%autoreload 2

Model Version

In [2]:
# Version tag shared by the model artefact, the preprocessor and the exported predictions.
model_version = 6
model_name = 'expected_miss_set'
model_file_name = f"{model_name}_v{model_version}"

# Root folder for every artefact this notebook writes. It defaults to the
# original author's checkout; set EXPECTED_SCORE_MODEL_OUTPUT_DIR to run the
# notebook on another machine without editing this cell.
model_outputs_root = os.environ.get(
    "EXPECTED_SCORE_MODEL_OUTPUT_DIR",
    "/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-score-model/model_outputs",
).rstrip("/")

# NOTE: model_output_path deliberately has no trailing slash (the model export
# cell inserts "/" itself), whereas the other two paths end in "/" because file
# names are appended to them directly.
model_output_path = model_outputs_root + "/models"
prediction_output_path = model_outputs_root + "/predictions/"

preprocessor_file_name = f"preprocessor_v{model_version}"
preprocessor_output_path = model_outputs_root + "/preprocessors/"

In [3]:
# Name of the response (target) column; it is also the prefix of the
# "<RESPONSE>TrainingSet" / "TestSet" / "ValidationSet" split-flag columns used below.
RESPONSE = ModellingDataContract.RESPONSE_MISS

In [4]:
# Feature columns fed to the model (selected from the raw frames in the preprocessing section).
FEATURES = ModellingDataContract.feature_list_set_miss

In [5]:
# Monotonicity constraints handed to both the hyperparameter tuner and the final model params.
MONOTONE_CONSTRAINTS = ModellingDataContract.monotone_constraints_set_miss

Load Data

In [6]:
# Read the set-shot modelling table from the path configured in expected_score_model.config,
# then show the last rows as a quick sanity check.
df_modelling = pd.read_csv(miss_set_shots_file_path)
df_modelling.tail()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Behind,Miss,Score,Event_Type1,Set_Shot,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,MissTrainingSet,MissTestSet,MissValidationSet
16981,186,possGain,behind,1447.0,4,185,224.0,Collingwood,Collingwood,Bobby Hill,Bobby_Hill,Kick,32.0,0.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Uncontested Mark,True,32.0,0.0,48.0,0.0,0.133929,True,False,False
16982,196,possGain,behind,1527.0,4,425,476.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,50.0,-19.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Mark On Lead,True,50.0,-19.0,35.510562,0.564569,0.153507,False,False,True
16983,210,throwIn,behind,1610.0,4,804,841.0,Brisbane Lions,Brisbane Lions,Eric Hipwood,Eric_Hipwood,Kick,36.0,37.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Uncontested Mark,True,36.0,37.0,57.489129,0.699193,0.085469,True,False,False
16984,224,possGain,goal,1740.0,4,1290,1348.0,Collingwood,Collingwood,Steele Sidebottom,Steele_Sidebottom,Kick,37.0,34.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Mark On Lead,True,37.0,34.0,54.81788,0.669043,0.091894,True,False,False
16985,233,ballUp,goal,1811.0,4,1591,1629.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,54.0,2.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Uncontested Mark,True,54.0,2.0,26.07681,0.076772,0.248447,False,False,True


In [7]:
# One-hot encode Initial_State into a fixed, named set of indicator columns.
# Assigning a frame to a list of column names pairs the columns up by POSITION,
# so a bare get_dummies() result would be silently mislabelled (or fail with an
# opaque length error) whenever a state is absent from, or new to, the data.
# Casting to a categorical with explicit categories pins both the set of
# columns and their order, independent of what the data happens to contain.
initial_state_columns = ['ballUp', 'centreBounce', 'kickIn', 'possGain', 'throwIn']

# Fail loudly on states the column list does not know about, instead of
# letting them be dropped by the categorical cast.
unexpected_states = set(df_modelling['Initial_State'].dropna().unique()) - set(initial_state_columns)
if unexpected_states:
    raise ValueError(f"Unexpected Initial_State values: {sorted(map(str, unexpected_states))}")

initial_state = df_modelling['Initial_State'].astype(
    pd.CategoricalDtype(categories=initial_state_columns)
)
df_modelling[initial_state_columns] = pd.get_dummies(initial_state)

In [8]:
# Split the modelling frame into train / test / calibration subsets using the
# "<RESPONSE>...Set" flag columns (the "ValidationSet" flag backs the calibration set).
is_train = df_modelling[f"{RESPONSE}TrainingSet"]
is_test = df_modelling[f"{RESPONSE}TestSet"]
is_cal = df_modelling[f"{RESPONSE}ValidationSet"]

training_data = df_modelling.loc[is_train]
test_data = df_modelling.loc[is_test]
cal_data = df_modelling.loc[is_cal]

In [9]:
# Separate the target column from the remaining columns for each subset.
y_train = training_data[RESPONSE]
y_test = test_data[RESPONSE]
y_cal = cal_data[RESPONSE]

X_train = training_data.drop(RESPONSE, axis="columns")
X_test = test_data.drop(RESPONSE, axis="columns")
X_cal = cal_data.drop(RESPONSE, axis="columns")

In [10]:
# Compare the mean of the response (its base rate) across the three subsets.
y_train.mean(), y_test.mean(), y_cal.mean()

(0.1295308187672493, 0.12625073572689818, 0.12214863870493009)

In [11]:
# Peek at the test predictors before feature selection (all non-response columns are still present).
X_test.head()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Behind,Score,Event_Type1,Set_Shot,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,MissTrainingSet,MissTestSet,MissValidationSet,ballUp,centreBounce,kickIn,possGain,throwIn
1,28,possGain,behind,295.0,1,859,911.0,Sydney,Sydney,Sam Reid,Sam_Reid,Kick,52.0,35.0,ineffective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,1,1,Contested Mark,True,52.0,35.0,43.600459,0.931882,0.088007,False,True,False,False,False,False,True,False
17,168,centreBounce,miss,1480.0,3,1850,1899.0,Brisbane Lions,Brisbane Lions,Eric Hipwood,Eric_Hipwood,Kick,52.0,-31.0,clanger,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,0,Mark On Lead,True,52.0,-31.0,40.459857,0.872894,0.102289,False,True,False,False,True,False,False,False
19,189,kickIn,goal,1639.0,4,390,439.0,Brisbane Lions,Brisbane Lions,Lincoln McCarthy,Lincoln_McCarthy,Kick,55.0,-2.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,1,0,6,Free For,True,55.0,-2.0,23.086793,0.086738,0.281582,False,True,False,False,False,True,False,False
25,11,centreBounce,goal,103.0,1,190,249.0,Western Bulldogs,Western Bulldogs,Aaron Naughton,Aaron_Naughton,Kick,54.0,-18.0,effective,True,,141,160,Collingwood,Western Bulldogs,right,202101_Collingwood_WesternBulldogs,202101,2021.0,,1,0,6,Mark On Lead,True,54.0,-18.0,31.622777,0.605545,0.168122,False,True,False,False,True,False,False,False
36,139,possGain,behind,1190.0,3,205,256.0,Collingwood,Collingwood,Brody Mihocek,Brody_Mihocek,Kick,50.0,-30.0,ineffective,True,,141,160,Collingwood,Western Bulldogs,right,202101_Collingwood_WesternBulldogs,202101,2021.0,,0,1,1,Uncontested Mark,True,50.0,-30.0,42.426407,0.785398,0.107277,False,True,False,False,False,False,True,False


Preprocess Data

In [12]:
# NOTE(review): the preprocessing step below is disabled — features are taken
# straight from the raw frames in the next cell. DataPreprocessor is not among
# the imports visible in this notebook, so re-enabling it also needs an import.
# preprocessor = DataPreprocessor()
# preprocessor.fit(X_train)

# X_train_preproc = preprocessor.transform(X_train)
# X_test_preproc = preprocessor.transform(X_test)

In [13]:
# Keep only the model's feature columns; no further preprocessing is applied.
X_train_preproc = X_train.loc[:, FEATURES]
X_test_preproc = X_test.loc[:, FEATURES]
X_cal_preproc = X_cal.loc[:, FEATURES]

In [14]:
# Row/column counts of the train and test feature matrices.
X_train_preproc.shape, X_test_preproc.shape

((10870, 5), (3398, 5))

In [15]:
# Preview the training feature matrix.
X_train_preproc.head()

Unnamed: 0,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle
0,46.0,-41.0,52.009614,0.908067,0.075999
2,45.0,16.0,36.674242,0.451453,0.158231
4,33.0,12.0,46.572524,0.260602,0.13341
5,52.0,36.0,44.407207,0.945311,0.084822
6,47.0,35.0,46.754679,0.84593,0.091187


In [16]:
# Preview the test feature matrix.
X_test_preproc.head()

Unnamed: 0,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle
1,52.0,35.0,43.600459,0.931882,0.088007
17,52.0,-31.0,40.459857,0.872894,0.102289
19,55.0,-2.0,23.086793,0.086738,0.281582
25,54.0,-18.0,31.622777,0.605545,0.168122
36,50.0,-30.0,42.426407,0.785398,0.107277


Optuna Hyperparameter Tuning Class - HyperParameterTuner & XGBHyperparameterTuner

In [17]:
# Build the hyperparameter tuner on the training features and target,
# passing the monotonicity constraints from the data contract.
xgb_tuner = XGBHyperparameterTuner(
    X_train_preproc,
    y_train,
    monotonicity_constraints=MONOTONE_CONSTRAINTS,
)

In [18]:
# Check that the tuner holds the expected training features.
xgb_tuner.training_data.head()

Unnamed: 0,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle
0,46.0,-41.0,52.009614,0.908067,0.075999
2,45.0,16.0,36.674242,0.451453,0.158231
4,33.0,12.0,46.572524,0.260602,0.13341
5,52.0,36.0,44.407207,0.945311,0.084822
6,47.0,35.0,46.754679,0.84593,0.091187


In [19]:
# Run the Optuna search (the recorded output reports 1000 finished trials).
# NOTE(review): no random seed is set anywhere in this notebook, so the best
# parameters may differ between runs — confirm whether the tuner seeds its sampler.
xgb_tuner.tune_hyperparameters()

[I 2023-10-21 13:42:37,522] A new study created in memory with name: no-name-533f8386-4df3-419f-a683-78f217c4e267


[I 2023-10-21 13:42:37,575] Trial 0 finished with value: 0.4243803890998717 and parameters: {'max_depth': 4, 'min_child_weight': 76, 'eta': 0.08500386673510839, 'gamma': 0.0022837756346407475, 'lambda': 0.82519191685076, 'alpha': 0.647204513695434, 'subsample': 0.6657467173359228, 'colsample_bytree': 0.8618946150901998}. Best is trial 0 with value: 0.4243803890998717.
[I 2023-10-21 13:42:37,604] Trial 1 finished with value: 0.31922824047284715 and parameters: {'max_depth': 9, 'min_child_weight': 40, 'eta': 0.3497700211915785, 'gamma': 0.00037517667914096635, 'lambda': 0.06681312097968865, 'alpha': 0.0036717447802253776, 'subsample': 0.6092217105283817, 'colsample_bytree': 0.7089141912883903}. Best is trial 1 with value: 0.31922824047284715.
[I 2023-10-21 13:42:37,623] Trial 2 finished with value: 0.31572150273704325 and parameters: {'max_depth': 13, 'min_child_weight': 99, 'eta': 0.8104789577015301, 'gamma': 0.05359472716152071, 'lambda': 0.5660371764287507, 'alpha': 0.0012058330105895

Number of finished trials:  1000
Best trial:
  Value: 0.2847134803353733
  Params: 
    max_depth: 6
    min_child_weight: 77
    eta: 0.9643382332134234
    gamma: 0.002819923652022961
    lambda: 0.1368414895152792
    alpha: 3.170744440169933
    subsample: 0.7447164738725273
    colsample_bytree: 0.7921087297152302


<optuna.study.study.Study at 0x13be350d0>

In [20]:
# Fetch the best hyperparameters found by the search and display them.
params = xgb_tuner.get_best_params()
params

{'max_depth': 6,
 'min_child_weight': 77,
 'eta': 0.9643382332134234,
 'gamma': 0.002819923652022961,
 'lambda': 0.1368414895152792,
 'alpha': 3.170744440169933,
 'subsample': 0.7447164738725273,
 'colsample_bytree': 0.7921087297152302}

Training Model - SuperXGBClassifier class for training and predictions

In [21]:
# Layer the fixed training settings on top of the tuned hyperparameters.
params.update({
    'objective': OptunaXGBParamGrid.error,
    'num_rounds': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 1,
    'monotone_constraints': MONOTONE_CONSTRAINTS,
})

In [22]:
# Bundle the train / test / calibration sets and the parameters, then build the classifier wrapper.
classifier_inputs = {
    'X_train': X_train_preproc,
    'y_train': y_train,
    'X_test': X_test_preproc,
    'y_test': y_test,
    'X_cal': X_cal_preproc,
    'y_cal': y_cal,
    'params': params,
}
super_xgb = SuperXGBClassifier(**classifier_inputs)

In [23]:
# Train the model; the log prints logloss for two evaluation sets per boosting round.
# NOTE(review): if validation_1 is the test set and drives early stopping, test
# metrics will be optimistic — confirm inside SuperXGBClassifier.fit.
super_xgb.fit()

[0]	validation_0-logloss:0.36588	validation_1-logloss:0.36565
[1]	validation_0-logloss:0.32681	validation_1-logloss:0.32673
[2]	validation_0-logloss:0.31770	validation_1-logloss:0.31834
[3]	validation_0-logloss:0.31611	validation_1-logloss:0.31821
[4]	validation_0-logloss:0.31572	validation_1-logloss:0.31869
[5]	validation_0-logloss:0.31524	validation_1-logloss:0.31878
[6]	validation_0-logloss:0.31439	validation_1-logloss:0.31905
[7]	validation_0-logloss:0.31416	validation_1-logloss:0.31938
[8]	validation_0-logloss:0.31417	validation_1-logloss:0.31943
[9]	validation_0-logloss:0.31370	validation_1-logloss:0.31909
[10]	validation_0-logloss:0.31314	validation_1-logloss:0.31912
[11]	validation_0-logloss:0.31310	validation_1-logloss:0.31951
[12]	validation_0-logloss:0.31304	validation_1-logloss:0.32033
[13]	validation_0-logloss:0.31309	validation_1-logloss:0.32064
[14]	validation_0-logloss:0.31312	validation_1-logloss:0.32133
[15]	validation_0-logloss:0.31236	validation_1-logloss:0.31982
[1

In [24]:
# Display the wrapper's xgb_model attribute after fitting.
super_xgb.xgb_model

In [25]:
# List the feature names recorded on the trained booster, to check them against FEATURES.
super_xgb.xgb_model.get_booster().feature_names

['x0',
 'y0',
 'Distance_to_Middle_Goal',
 'Angle_to_Middle_Goal',
 'Visible_Goal_Angle']

In [26]:
# Class predictions for the train and test feature matrices.
train_preds, test_preds = (
    super_xgb.predict(features)
    for features in (X_train_preproc, X_test_preproc)
)

In [27]:
# Probabilities taken from column 1 of predict_proba, computed before calibration is fitted.
train_probas, test_probas, cal_probas = (
    super_xgb.predict_proba(features)[:, 1]
    for features in (X_train_preproc, X_test_preproc, X_cal_preproc)
)

In [28]:
# Fit the probability calibrator.
# NOTE(review): presumably uses the X_cal / y_cal given to the constructor — confirm in SuperXGBClassifier.calibrate.
super_xgb.calibrate()



In [29]:
# Calibrated probabilities for each subset (requested via calibrate=True).
train_cal_probas, test_cal_probas, cal_cal_probas = (
    super_xgb.predict_proba(features, calibrate=True)
    for features in (X_train_preproc, X_test_preproc, X_cal_preproc)
)

Check Average Predictions

In [30]:
# Training set: (mean raw probability, observed response rate, mean calibrated probability).
train_probas.mean(), training_data[RESPONSE].mean(), train_cal_probas.mean()

(0.13332483, 0.1295308187672493, 0.1273921483452305)

In [31]:
# Test set: (mean raw probability, observed response rate, mean calibrated probability).
test_probas.mean(), test_data[RESPONSE].mean(), test_cal_probas.mean()

(0.13474149, 0.12625073572689818, 0.12863423206238184)

In [32]:
# Calibration set: (mean raw probability, observed response rate, mean calibrated probability).
cal_probas.mean(), cal_data[RESPONSE].mean(), cal_cal_probas.mean()

(0.12804866, 0.12214863870493009, 0.12214838810126345)

Export model

In [33]:
# Persist the trained model as <model_output_path>/<model_file_name>.joblib.
model_export_file = f"{model_output_path}/{model_file_name}.joblib"
super_xgb.export_model(model_export_file)

Export data and predictions

In [34]:
# Assemble identifiers, the observed response, the features and all predictions
# for the training set, write them to CSV, and preview the result.
train_id_columns = ['Match_ID', "Chain_Number", "Order", RESPONSE]
train_info = training_data[train_id_columns].reset_index(drop=True)
train_export = pd.concat(
    [train_info, X_train_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=train_preds,
    xgb_probas=train_probas,
    xgb_probas_cal=train_cal_probas,
)
train_export.to_csv(
    f"{prediction_output_path}train_predictions_{model_file_name}.csv",
    index=False,
)
train_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Miss,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,17,183.0,0,46.0,-41.0,52.009614,0.908067,0.075999,0,0.400648,0.374358
1,202101_BrisbaneLions_Sydney,34,354.0,0,45.0,16.0,36.674242,0.451453,0.158231,0,0.020466,0.015017
2,202101_BrisbaneLions_Sydney,59,578.0,0,33.0,12.0,46.572524,0.260602,0.13341,0,0.090615,0.08725
3,202101_BrisbaneLions_Sydney,62,621.0,0,52.0,36.0,44.407207,0.945311,0.084822,0,0.293074,0.289184
4,202101_BrisbaneLions_Sydney,72,696.0,0,47.0,35.0,46.754679,0.84593,0.091187,0,0.237101,0.238512


In [35]:
# Assemble identifiers, the observed response, the features and all predictions
# for the test set, write them to CSV, and preview the result.
test_id_columns = ['Match_ID', "Chain_Number", "Order", RESPONSE]
test_info = test_data[test_id_columns].reset_index(drop=True)
test_export = pd.concat(
    [test_info, X_test_preproc.reset_index(drop=True)],
    axis=1,
).assign(
    xgb_preds=test_preds,
    xgb_probas=test_probas,
    xgb_probas_cal=test_cal_probas,
)
test_export.to_csv(
    f"{prediction_output_path}test_predictions_{model_file_name}.csv",
    index=False,
)
test_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Miss,x0,y0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,28,295.0,0,52.0,35.0,43.600459,0.931882,0.088007,0,0.224974,0.226946
1,202101_BrisbaneLions_Sydney,168,1480.0,1,52.0,-31.0,40.459857,0.872894,0.102289,0,0.130559,0.130441
2,202101_BrisbaneLions_Sydney,189,1639.0,0,55.0,-2.0,23.086793,0.086738,0.281582,0,0.015349,0.010576
3,202101_Collingwood_WesternBulldogs,11,103.0,0,54.0,-18.0,31.622777,0.605545,0.168122,0,0.023477,0.017738
4,202101_Collingwood_WesternBulldogs,139,1190.0,0,50.0,-30.0,42.426407,0.785398,0.107277,0,0.154326,0.155684
