Set Shots - Expected Goal Model - Model Tuning and Building - GBM

In [1]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

from expected_score_model.config import goal_set_shots_file_path
from expected_score_model.domain.contracts.modelling_data_contract import ModellingDataContract
from expected_score_model.domain.modelling.hyperparameter_tuning import XGBHyperparameterTuner
from expected_score_model.domain.modelling.supermodel import SuperXGBClassifier
from expected_score_model.domain.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Raise pandas' display limits so the wide modelling frames below render in full.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

# IPython autoreload, mode 2: re-import modules before each cell executes, so
# edits to the imported project package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

Model Version

In [2]:
# Version tag shared by the exported model, preprocessor and prediction files.
model_version = 8
model_name = 'expected_goal_set'
model_file_name = f"{model_name}_v{model_version}"

# Root of the local project checkout. Previously the same absolute path was
# repeated in every output path; it is now defined once and can be overridden
# with the EXPECTED_SCORE_MODEL_ROOT environment variable so the notebook runs
# on other machines. With the variable unset, every path below is unchanged.
project_root = os.environ.get(
    "EXPECTED_SCORE_MODEL_ROOT",
    "/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-score-model",
).rstrip("/")
model_outputs_root = project_root + "/model_outputs"

# NOTE: model_output_path has no trailing slash (the export cells add "/"),
# while the other two directories keep theirs because they are concatenated
# directly with a file name.
model_output_path = model_outputs_root + "/models"
prediction_output_path = model_outputs_root + "/predictions/"

preprocessor_file_name = f"preprocessor_v{model_version}"
preprocessor_output_path = model_outputs_root + "/preprocessors/"

In [3]:
# Name of the response (target) column. It is also used as the prefix of the
# "<RESPONSE>TrainingSet" / "TestSet" / "ValidationSet" split-flag columns below.
RESPONSE = ModellingDataContract.RESPONSE_GOAL

In [4]:
# Columns used as model inputs; applied to each split in the "Preprocess Data" section.
FEATURES = ModellingDataContract.feature_list_set_goal

In [5]:
# Monotonicity constraints handed to both the tuner and the final model params.
# NOTE(review): presumably keyed/ordered consistently with FEATURES — confirm in ModellingDataContract.
MONOTONE_CONSTRAINTS = ModellingDataContract.monotone_constraints_set_goal

Load Data

In [6]:
# Read the set-shot modelling table from the CSV path defined in
# expected_score_model.config, then preview the last rows.
df_modelling = pd.read_csv(goal_set_shots_file_path)
df_modelling.tail()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Goal,Behind,Miss,Score,Event_Type1,Set_Shot,x0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,GoalTrainingSet,GoalTestSet,GoalValidationSet
16981,186,possGain,behind,1447.0,4,185,224.0,Collingwood,Collingwood,Bobby Hill,Bobby_Hill,Kick,32.0,0.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Uncontested Mark,True,32.0,48.0,0.0,0.133929,False,False,False,True,False,True,False,False
16982,196,possGain,behind,1527.0,4,425,476.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,50.0,-19.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Mark On Lead,True,50.0,35.510562,0.564569,0.153507,False,False,False,True,False,False,False,True
16983,210,throwIn,behind,1610.0,4,804,841.0,Brisbane Lions,Brisbane Lions,Eric Hipwood,Eric_Hipwood,Kick,36.0,37.0,ineffective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,0,1,0,1,Uncontested Mark,True,36.0,57.489129,0.699193,0.085469,False,False,False,False,True,True,False,False
16984,224,possGain,goal,1740.0,4,1290,1348.0,Collingwood,Collingwood,Steele Sidebottom,Steele_Sidebottom,Kick,37.0,34.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Mark On Lead,True,37.0,54.81788,0.669043,0.091894,False,False,False,True,False,True,False,False
16985,233,ballUp,goal,1811.0,4,1591,1629.0,Brisbane Lions,Brisbane Lions,Joe Daniher,Joe_Daniher,Kick,54.0,2.0,effective,True,,141,160,Collingwood,Brisbane Lions,right,2023F4_Collingwood_BrisbaneLions,2023F4,,2023.0,1,0,0,6,Uncontested Mark,True,54.0,26.07681,0.076772,0.248447,True,False,False,False,False,False,False,True


In [7]:
# One-hot encode the chain's initial state into a fixed set of indicator columns.
# Categories are matched to columns by NAME (via reindex) instead of relying on
# the order `pd.get_dummies` emits them in:
#   * a state that is absent from the data yields an all-False column rather
#     than an opaque "Columns must be same length as key" error;
#   * a new or renamed state raises a clear error instead of being silently
#     written under the wrong column name.
initial_state_columns = ['ballUp', 'centreBounce', 'kickIn', 'possGain', 'throwIn']

unexpected_states = set(df_modelling['Initial_State'].dropna()) - set(initial_state_columns)
if unexpected_states:
    raise ValueError(
        f"Unexpected Initial_State values: {sorted(map(str, unexpected_states))}"
    )

df_modelling[initial_state_columns] = (
    pd.get_dummies(df_modelling['Initial_State'])
    .reindex(columns=initial_state_columns, fill_value=False)
)

In [8]:
# The split is precomputed in the data: one boolean flag column per partition,
# named "<RESPONSE>TrainingSet", "<RESPONSE>TestSet" and "<RESPONSE>ValidationSet".
in_training_set = df_modelling[RESPONSE + "TrainingSet"]
in_test_set = df_modelling[RESPONSE + "TestSet"]
in_validation_set = df_modelling[RESPONSE + "ValidationSet"]

training_data = df_modelling.loc[in_training_set]
test_data = df_modelling.loc[in_test_set]
# The "validation" partition is the one used for calibration later on.
cal_data = df_modelling.loc[in_validation_set]

In [9]:
# Separate the response column from the remaining columns for each partition.
X_train = training_data.drop(columns=[RESPONSE])
y_train = training_data[RESPONSE]

X_test = test_data.drop(columns=[RESPONSE])
y_test = test_data[RESPONSE]

X_cal = cal_data.drop(columns=[RESPONSE])
y_cal = cal_data[RESPONSE]

In [10]:
# Mean of the response in each partition (train, test, calibration) — a quick
# check that the base rates are comparable across splits.
y_train.mean(), y_test.mean(), y_cal.mean()

(0.5385464581416743, 0.53708063566804, 0.5647534952170714)

In [11]:
# Preview the test partition. Only the response column has been dropped at this
# point; identifier and outcome columns remain until FEATURES is applied below.
X_test.head()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season,Behind,Miss,Score,Event_Type1,Set_Shot,x0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,GoalTrainingSet,GoalTestSet,GoalValidationSet
1,28,possGain,behind,295.0,1,859,911.0,Sydney,Sydney,Sam Reid,Sam_Reid,Kick,52.0,35.0,ineffective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,1,0,1,Contested Mark,True,52.0,43.600459,0.931882,0.088007,False,False,False,True,False,False,True,False
17,168,centreBounce,miss,1480.0,3,1850,1899.0,Brisbane Lions,Brisbane Lions,Eric Hipwood,Eric_Hipwood,Kick,52.0,-31.0,clanger,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,1,0,Mark On Lead,True,52.0,40.459857,0.872894,0.102289,False,True,False,False,False,False,True,False
19,189,kickIn,goal,1639.0,4,390,439.0,Brisbane Lions,Brisbane Lions,Lincoln McCarthy,Lincoln_McCarthy,Kick,55.0,-2.0,effective,True,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,,0,0,6,Free For,True,55.0,23.086793,0.086738,0.281582,False,False,True,False,False,False,True,False
25,11,centreBounce,goal,103.0,1,190,249.0,Western Bulldogs,Western Bulldogs,Aaron Naughton,Aaron_Naughton,Kick,54.0,-18.0,effective,True,,141,160,Collingwood,Western Bulldogs,right,202101_Collingwood_WesternBulldogs,202101,2021.0,,0,0,6,Mark On Lead,True,54.0,31.622777,0.605545,0.168122,False,True,False,False,False,False,True,False
36,139,possGain,behind,1190.0,3,205,256.0,Collingwood,Collingwood,Brody Mihocek,Brody_Mihocek,Kick,50.0,-30.0,ineffective,True,,141,160,Collingwood,Western Bulldogs,right,202101_Collingwood_WesternBulldogs,202101,2021.0,,1,0,1,Uncontested Mark,True,50.0,42.426407,0.785398,0.107277,False,False,False,True,False,False,True,False


Preprocess Data

In [12]:
# NOTE: preprocessing via DataPreprocessor is currently disabled; the next cell
# selects the FEATURES columns directly instead. DataPreprocessor is not imported
# in this notebook, so an import must be added before re-enabling these lines.
# preprocessor = DataPreprocessor()
# preprocessor.fit(X_train)

# X_train_preproc = preprocessor.transform(X_train)
# X_test_preproc = preprocessor.transform(X_test)

In [13]:
# Restrict every partition to the model's input columns, in FEATURES order.
X_train_preproc, X_test_preproc, X_cal_preproc = (
    partition[FEATURES] for partition in (X_train, X_test, X_cal)
)

In [14]:
# (rows, columns) of the train and test feature matrices; column counts must match.
X_train_preproc.shape, X_test_preproc.shape

((10870, 9), (3398, 9))

In [15]:
# Preview the training features that will be passed to the tuner and the model.
X_train_preproc.head()

Unnamed: 0,x0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn
0,46.0,52.009614,0.908067,0.075999,False,False,True,False,False
2,45.0,36.674242,0.451453,0.158231,False,False,False,True,False
4,33.0,46.572524,0.260602,0.13341,False,True,False,False,False
5,52.0,44.407207,0.945311,0.084822,False,False,False,True,False
6,47.0,46.754679,0.84593,0.091187,False,False,False,True,False


In [16]:
# Preview the test features; columns should mirror the training features above.
X_test_preproc.head()

Unnamed: 0,x0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn
1,52.0,43.600459,0.931882,0.088007,False,False,False,True,False
17,52.0,40.459857,0.872894,0.102289,False,True,False,False,False
19,55.0,23.086793,0.086738,0.281582,False,False,True,False,False
25,54.0,31.622777,0.605545,0.168122,False,True,False,False,False
36,50.0,42.426407,0.785398,0.107277,False,False,False,True,False


Optuna Hyperparameter Tuning Class - HyperParameterTuner & XGBHyperparameterTuner

In [17]:
# Build the hyperparameter tuner from the training features and target, passing
# the monotone constraints through its `monotonicity_constraints` argument.
xgb_tuner = XGBHyperparameterTuner(X_train_preproc, y_train, monotonicity_constraints=MONOTONE_CONSTRAINTS)

In [18]:
# Sanity check: preview the data the tuner holds in its `training_data` attribute.
xgb_tuner.training_data.head()

Unnamed: 0,x0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn
0,46.0,52.009614,0.908067,0.075999,False,False,True,False,False
2,45.0,36.674242,0.451453,0.158231,False,False,False,True,False
4,33.0,46.572524,0.260602,0.13341,False,True,False,False,False
5,52.0,44.407207,0.945311,0.084822,False,False,False,True,False
6,47.0,46.754679,0.84593,0.091187,False,False,False,True,False


In [19]:
# Run the hyperparameter search; the saved output shows an Optuna study that
# finished 1000 trials on this run.
# NOTE(review): no random seed is set anywhere in this notebook, so the best
# parameters may differ between runs — confirm whether the tuner seeds its sampler.
xgb_tuner.tune_hyperparameters()

[I 2023-10-21 14:11:23,351] A new study created in memory with name: no-name-c768c49b-18fa-4f6d-baa4-6bad833e2964
[I 2023-10-21 14:11:23,389] Trial 0 finished with value: 0.6158034853905376 and parameters: {'max_depth': 10, 'min_child_weight': 39, 'eta': 0.17436598030183398, 'gamma': 0.05253546567926054, 'lambda': 9.474878264690282, 'alpha': 0.0927097159951296, 'subsample': 0.3002259140392822, 'colsample_bytree': 0.7488630337339748}. Best is trial 0 with value: 0.6158034853905376.
[I 2023-10-21 14:11:23,406] Trial 1 finished with value: 0.6456172071090344 and parameters: {'max_depth': 4, 'min_child_weight': 38, 'eta': 0.050329900808759986, 'gamma': 0.05524238538999337, 'lambda': 5.558146841346903, 'alpha': 3.1864930792836073, 'subsample': 0.31167351048269576, 'colsample_bytree': 0.7289254571854513}. Best is trial 0 with value: 0.6158034853905376.
[I 2023-10-21 14:11:23,426] Trial 2 finished with value: 0.606815830146099 and parameters: {'max_depth': 8, 'min_child_weight': 27, 'eta': 0.

Number of finished trials:  1000
Best trial:
  Value: 0.5790151800487838
  Params: 
    max_depth: 4
    min_child_weight: 69
    eta: 0.4526318621985348
    gamma: 0.4927640372391809
    lambda: 0.003985001731547372
    alpha: 0.03753585158509338
    subsample: 0.8481086488656402
    colsample_bytree: 0.8439578496361944


<optuna.study.study.Study at 0x14d50ba10>

In [20]:
# Retrieve the best parameter set found by the search and display it.
params = xgb_tuner.get_best_params()
params

{'max_depth': 4,
 'min_child_weight': 69,
 'eta': 0.4526318621985348,
 'gamma': 0.4927640372391809,
 'lambda': 0.003985001731547372,
 'alpha': 0.03753585158509338,
 'subsample': 0.8481086488656402,
 'colsample_bytree': 0.8439578496361944}

Training Model - SuperXGBClassifier class for training and predictions

In [21]:
# Layer the fixed training settings on top of the tuned hyperparameters.
# NOTE(review): `OptunaXGBParamGrid.error` is used as the objective here —
# confirm it holds the intended XGBoost objective value.
params.update({
    'objective': OptunaXGBParamGrid.error,
    'num_rounds': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 1,
    'monotone_constraints': MONOTONE_CONSTRAINTS,
})

In [22]:
# Gather the three data partitions and the parameter dict, then build the
# classifier wrapper from them.
classifier_inputs = dict(
    X_train=X_train_preproc,
    y_train=y_train,
    X_test=X_test_preproc,
    y_test=y_test,
    X_cal=X_cal_preproc,
    y_cal=y_cal,
    params=params,
)
super_xgb = SuperXGBClassifier(**classifier_inputs)

In [23]:
# Train the model. The saved log reports logloss on two evaluation sets
# (validation_0 / validation_1) — presumably train and test; confirm in SuperXGBClassifier.fit.
super_xgb.fit()

[0]	validation_0-logloss:0.63318	validation_1-logloss:0.63469
[1]	validation_0-logloss:0.61211	validation_1-logloss:0.61477
[2]	validation_0-logloss:0.60909	validation_1-logloss:0.61224
[3]	validation_0-logloss:0.60116	validation_1-logloss:0.60434
[4]	validation_0-logloss:0.59692	validation_1-logloss:0.60050
[5]	validation_0-logloss:0.59517	validation_1-logloss:0.59894
[6]	validation_0-logloss:0.59458	validation_1-logloss:0.59869
[7]	validation_0-logloss:0.59407	validation_1-logloss:0.59819
[8]	validation_0-logloss:0.59392	validation_1-logloss:0.59812
[9]	validation_0-logloss:0.59390	validation_1-logloss:0.59820
[10]	validation_0-logloss:0.59385	validation_1-logloss:0.59816
[11]	validation_0-logloss:0.59372	validation_1-logloss:0.59781
[12]	validation_0-logloss:0.59368	validation_1-logloss:0.59765
[13]	validation_0-logloss:0.59359	validation_1-logloss:0.59805
[14]	validation_0-logloss:0.59351	validation_1-logloss:0.59827
[15]	validation_0-logloss:0.59351	validation_1-logloss:0.59828
[1

In [24]:
# Display the fitted underlying model object held by the wrapper.
super_xgb.xgb_model

In [25]:
# Feature names recorded by the trained booster; these should match FEATURES.
super_xgb.xgb_model.get_booster().feature_names

['x0',
 'Distance_to_Middle_Goal',
 'Angle_to_Middle_Goal',
 'Visible_Goal_Angle',
 'ballUp',
 'centreBounce',
 'kickIn',
 'possGain',
 'throwIn']

In [26]:
# Predictions from `predict` for the train and test features (train first, then test).
train_preds, test_preds = (
    super_xgb.predict(features)
    for features in (X_train_preproc, X_test_preproc)
)

In [27]:
# Uncalibrated probabilities for each partition: keep only column 1 of the
# `predict_proba` output.
train_probas, test_probas, cal_probas = (
    super_xgb.predict_proba(features)[:, 1]
    for features in (X_train_preproc, X_test_preproc, X_cal_preproc)
)

In [28]:
# Fit the calibration step.
# NOTE(review): presumably fitted on the X_cal / y_cal given to the constructor —
# confirm in SuperXGBClassifier.calibrate.
super_xgb.calibrate()

In [29]:
# Calibrated probabilities for each partition. Unlike the uncalibrated call,
# the result is used as returned (no column is selected).
train_cal_probas, test_cal_probas, cal_cal_probas = (
    super_xgb.predict_proba(features, calibrate=True)
    for features in (X_train_preproc, X_test_preproc, X_cal_preproc)
)

Check Average Predictions

In [30]:
# Training split: mean raw probability, observed response rate, mean calibrated probability.
train_probas.mean(), training_data[RESPONSE].mean(), train_cal_probas.mean()

(0.53824776, 0.5385464581416743, 0.5574862982407187)

In [31]:
# Test split: mean raw probability, observed response rate, mean calibrated probability.
test_probas.mean(), test_data[RESPONSE].mean(), test_cal_probas.mean()

(0.5354432, 0.53708063566804, 0.5548791956978055)

In [32]:
# Calibration split: mean raw probability, observed response rate, mean calibrated probability.
cal_probas.mean(), cal_data[RESPONSE].mean(), cal_cal_probas.mean()

(0.5464367, 0.5647534952170714, 0.5647528529799077)

Export model

In [33]:
# Preview the full file path the model is exported to in the next cell.
model_output_path + "/" + model_file_name + ".joblib"

'/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-score-model/model_outputs/models/expected_goal_set_v8.joblib'

In [34]:
# Write the fitted model to "<model_output_path>/<model_file_name>.joblib".
super_xgb.export_model(model_output_path + "/" + model_file_name + ".joblib")

Export data and predictions

In [35]:
# Assemble the training-set export: shot identifiers + response, the model
# features, then the three prediction columns. Both frames get a fresh
# 0..n-1 index so they line up row by row when concatenated side by side.
train_id_columns = ['Match_ID', "Chain_Number", "Order", RESPONSE]
train_info = training_data[train_id_columns].reset_index(drop=True)
train_features = X_train_preproc.reset_index(drop=True)

train_export = pd.concat([train_info, train_features], axis=1).assign(
    xgb_preds=train_preds,
    xgb_probas=train_probas,
    xgb_probas_cal=train_cal_probas,
)

# Write the export to the predictions directory (without the index) and preview it.
train_export_file = prediction_output_path + 'train_predictions_' + model_file_name + '.csv'
train_export.to_csv(train_export_file, index=False)
train_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Goal,x0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,17,183.0,0,46.0,52.009614,0.908067,0.075999,False,False,True,False,False,0,0.222634,0.278716
1,202101_BrisbaneLions_Sydney,34,354.0,0,45.0,36.674242,0.451453,0.158231,False,False,False,True,False,1,0.631807,0.634317
2,202101_BrisbaneLions_Sydney,59,578.0,1,33.0,46.572524,0.260602,0.13341,False,True,False,False,False,1,0.552385,0.564326
3,202101_BrisbaneLions_Sydney,62,621.0,1,52.0,44.407207,0.945311,0.084822,False,False,False,True,False,0,0.299501,0.347111
4,202101_BrisbaneLions_Sydney,72,696.0,1,47.0,46.754679,0.84593,0.091187,False,False,False,True,False,0,0.32089,0.365683


In [36]:
# Assemble the test-set export: shot identifiers + response, the model
# features, then the three prediction columns. Both frames get a fresh
# 0..n-1 index so they line up row by row when concatenated side by side.
test_id_columns = ['Match_ID', "Chain_Number", "Order", RESPONSE]
test_info = test_data[test_id_columns].reset_index(drop=True)
test_features = X_test_preproc.reset_index(drop=True)

test_export = pd.concat([test_info, test_features], axis=1).assign(
    xgb_preds=test_preds,
    xgb_probas=test_probas,
    xgb_probas_cal=test_cal_probas,
)

# Write the export to the predictions directory (without the index) and preview it.
test_export_file = prediction_output_path + 'test_predictions_' + model_file_name + '.csv'
test_export.to_csv(test_export_file, index=False)
test_export.head()

Unnamed: 0,Match_ID,Chain_Number,Order,Goal,x0,Distance_to_Middle_Goal,Angle_to_Middle_Goal,Visible_Goal_Angle,ballUp,centreBounce,kickIn,possGain,throwIn,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,28,295.0,0,52.0,43.600459,0.931882,0.088007,False,False,False,True,False,0,0.299501,0.347111
1,202101_BrisbaneLions_Sydney,168,1480.0,0,52.0,40.459857,0.872894,0.102289,False,True,False,False,False,0,0.439971,0.467646
2,202101_BrisbaneLions_Sydney,189,1639.0,1,55.0,23.086793,0.086738,0.281582,False,False,True,False,False,1,0.888778,0.879794
3,202101_Collingwood_WesternBulldogs,11,103.0,1,54.0,31.622777,0.605545,0.168122,False,True,False,False,False,1,0.654677,0.654856
4,202101_Collingwood_WesternBulldogs,139,1190.0,0,50.0,42.426407,0.785398,0.107277,False,False,False,True,False,0,0.441848,0.469248
