Model Building - Disposal %

In [1]:
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

from expected_disposal_model.config import modelling_file_path, model_output_path, prediction_output_path
from expected_disposal_model.modelling_data_contract import ModellingDataContract
from expected_disposal_model.modelling.hyperparameter_tuning import XGBHyperparameterTuner
from expected_disposal_model.modelling.supermodel import SuperXGBClassifier
from expected_disposal_model.modelling.optuna_xgb_param_grid import OptunaXGBParamGrid

# Notebook display limits: show up to 100 rows and up to 999 columns when a
# DataFrame is rendered as a cell result.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 999)

%load_ext autoreload
%autoreload 2

Model Version

In [2]:
# Identity of the artefact produced by this notebook. The combined
# "<name>_v<version>" stem is reused for the exported model file and the
# predictions CSV written at the end of the notebook.
model_version = 1
model_name = 'disposal'
model_file_name = f"{model_name}_v{model_version}"

In [3]:
# Name of the target column, taken from the data contract. It is also used
# below as the prefix of the "<RESPONSE>TrainingSet" / "<RESPONSE>TestSet" /
# "<RESPONSE>ValidationSet" split-flag columns.
RESPONSE = ModellingDataContract.RESPONSE

In [4]:
# Model input columns from the data contract; used below to select the
# *_preproc feature frames from each data split.
FEATURES = ModellingDataContract.feature_list

In [5]:
# Monotonicity constraints from the data contract; passed both to the
# hyperparameter tuner and into the final model's params.
# NOTE(review): the structure (dict vs. tuple/string) is not visible here.
MONOTONE_CONSTRAINTS = ModellingDataContract.monotone_constraints

Load Data

In [6]:
# Read the complete modelling table from the configured CSV location, then
# show its last five rows as a quick check that the load worked.
modelling_data = pd.read_csv(filepath_or_buffer=modelling_file_path)
modelling_data.tail(n=5)

Unnamed: 0,match_id,chain_number,order,quarter,quarter_seconds,overall_seconds,team,player,start_x,start_y,end_x,end_y,action_type,outcome_type,type_Kick_a0,type_Handball_a0,quarter_a0,quarter_seconds_a0,overall_seconds_a0,start_x_a0,start_y_a0,end_x_a0,end_y_a0,dx_a0,dy_a0,movement_a0,type_Kick_a1,type_Handball_a1,outcome_effective_a1,outcome_ineffective_a1,outcome_clanger_a1,quarter_a1,quarter_seconds_a1,overall_seconds_a1,start_x_a1,start_y_a1,end_x_a1,end_y_a1,dx_a1,dy_a1,movement_a1,type_Kick_a2,type_Handball_a2,outcome_effective_a2,outcome_ineffective_a2,outcome_clanger_a2,quarter_a2,quarter_seconds_a2,overall_seconds_a2,start_x_a2,start_y_a2,end_x_a2,end_y_a2,dx_a2,dy_a2,movement_a2,team_1,team_2,time_delta1,time_delta2,dx_a01,dy_a01,move_a01,dx_a02,dy_a02,move_a02,Disposal,DisposalTrainingSet,DisposalTestSet,DisposalValidationSet
427203,2023F4_Collingwood_BrisbaneLions,240,1845.0,4,1771.0,7759.0,Collingwood,Scott Pendlebury,-45.0,-38.0,-27.0,-35.0,Kick,effective,True,False,4,1771.0,7759.0,-45.0,-38.0,-27.0,-35.0,18.0,3.0,18.248288,False,False,True,False,False,4,1769.0,7757.0,-46.0,-38.0,-45.0,-38.0,1.0,0.0,1.0,True,False,True,False,False,4,1768.0,7756.0,-23.0,-35.0,-46.0,-38.0,-23.0,-3.0,23.194827,True,True,-2.0,-3.0,0.0,0.0,0.0,-1.0,0.0,1.0,1,False,True,False
427204,2023F4_Collingwood_BrisbaneLions,240,1847.0,4,1779.0,7767.0,Collingwood,Tom Mitchell,-29.0,-34.0,-46.0,-48.0,Kick,effective,True,False,4,1779.0,7767.0,-29.0,-34.0,-46.0,-48.0,-17.0,-14.0,22.022716,False,False,True,False,False,4,1773.0,7761.0,-27.0,-35.0,-29.0,-34.0,-2.0,1.0,2.236068,True,False,True,False,False,4,1771.0,7759.0,-45.0,-38.0,-27.0,-35.0,18.0,3.0,18.248288,True,True,-6.0,-8.0,0.0,0.0,0.0,2.0,-1.0,2.236068,1,True,False,False
427205,2023F4_Collingwood_BrisbaneLions,240,1849.0,4,1788.0,7776.0,Collingwood,Brody Mihocek,-49.0,-48.0,16.0,-63.0,Kick,effective,True,False,4,1788.0,7776.0,-49.0,-48.0,16.0,-63.0,65.0,-15.0,66.70832,False,False,True,False,False,4,1780.0,7768.0,-46.0,-48.0,-49.0,-48.0,-3.0,0.0,3.0,True,False,True,False,False,4,1779.0,7767.0,-29.0,-34.0,-46.0,-48.0,-17.0,-14.0,22.022716,True,True,-8.0,-9.0,0.0,0.0,0.0,3.0,0.0,3.0,1,True,False,False
427206,2023F4_Collingwood_BrisbaneLions,240,1852.0,4,1794.0,7782.0,Collingwood,Will Hoskin-Elliott,14.0,-52.0,26.0,-47.0,Kick,ineffective,True,False,4,1794.0,7782.0,14.0,-52.0,26.0,-47.0,12.0,5.0,13.0,False,False,True,False,False,4,1793.0,7781.0,14.0,-53.0,14.0,-52.0,0.0,1.0,1.0,False,False,True,False,False,4,1792.0,7780.0,16.0,-63.0,14.0,-53.0,-2.0,10.0,10.198039,True,True,-1.0,-2.0,0.0,0.0,0.0,0.0,-1.0,1.0,0,False,True,False
427207,2023F4_Collingwood_BrisbaneLions,240,1853.0,4,1805.0,7793.0,Collingwood,Will Hoskin-Elliott,26.0,-47.0,26.0,-47.0,Kick,effective,True,False,4,1805.0,7793.0,26.0,-47.0,26.0,-47.0,0.0,0.0,0.0,True,False,False,True,False,4,1794.0,7782.0,14.0,-52.0,26.0,-47.0,12.0,5.0,13.0,False,False,True,False,False,4,1793.0,7781.0,14.0,-53.0,14.0,-52.0,0.0,1.0,1.0,True,True,-11.0,-12.0,0.0,0.0,0.0,-12.0,-5.0,13.0,1,True,False,False


In [7]:
# The "<RESPONSE>TrainingSet" / "TestSet" / "ValidationSet" columns are used
# directly as row masks to carve the table into the three modelling splits.
# The "ValidationSet" rows are what the rest of the notebook calls the
# calibration set (cal_*).
is_train = modelling_data[f"{RESPONSE}TrainingSet"]
is_test = modelling_data[f"{RESPONSE}TestSet"]
is_cal = modelling_data[f"{RESPONSE}ValidationSet"]

training_data = modelling_data[is_train]
test_data = modelling_data[is_test]
cal_data = modelling_data[is_cal]

In [8]:
def _features_and_target(frame):
    """Return (frame without the RESPONSE column, the RESPONSE column)."""
    return frame.drop(columns=[RESPONSE]), frame[RESPONSE]


# Separate inputs from the target for the full table and for each split.
X, y = _features_and_target(modelling_data)
X_train, y_train = _features_and_target(training_data)
X_test, y_test = _features_and_target(test_data)
X_cal, y_cal = _features_and_target(cal_data)

In [9]:
# Keep only the contract's feature columns for the full table and each split.
X_preproc, X_train_preproc, X_test_preproc, X_cal_preproc = (
    frame[FEATURES] for frame in (X, X_train, X_test, X_cal)
)

Optuna Hyperparameter Tuning Class - HyperParameterTuner & XGBHyperparameterTuner

In [10]:
# Build the tuner from the training features/target only (test and calibration
# splits are not passed in), with the contract's monotonicity constraints.
xgb_tuner = XGBHyperparameterTuner(X_train_preproc, y_train, monotonicity_constraints=MONOTONE_CONSTRAINTS)

In [11]:
# Run the hyperparameter search. The call's return value (an Optuna Study,
# per the recorded output) is echoed as the cell result.
# NOTE(review): no random seed is set anywhere in this notebook, so the search
# is presumably not reproducible run-to-run — confirm whether the tuner seeds
# its sampler internally.
xgb_tuner.tune_hyperparameters()

[I 2023-10-09 12:09:06,392] A new study created in memory with name: no-name-a1a35468-4285-4f1d-bcee-f8f14dde17ad
[I 2023-10-09 12:09:07,997] Trial 0 finished with value: 0.5032810186180928 and parameters: {'max_depth': 17, 'min_child_weight': 86, 'eta': 0.019230342107459195, 'gamma': 0.008603367545316317, 'lambda': 0.010071972646444096, 'alpha': 0.0002487320076410316, 'subsample': 0.4835582218401331, 'colsample_bytree': 0.6764915176153983}. Best is trial 0 with value: 0.5032810186180928.
[I 2023-10-09 12:09:09,431] Trial 1 finished with value: 0.30896212522238115 and parameters: {'max_depth': 16, 'min_child_weight': 79, 'eta': 0.21871100043677055, 'gamma': 3.5347159696494126, 'lambda': 3.250699717754418, 'alpha': 0.12103232177759077, 'subsample': 0.6307344806122284, 'colsample_bytree': 0.7705629690680276}. Best is trial 1 with value: 0.30896212522238115.
[I 2023-10-09 12:09:10,890] Trial 2 finished with value: 0.29910694283147815 and parameters: {'max_depth': 18, 'min_child_weight': 5

Number of finished trials:  100
Best trial:
  Value: 0.27750646828604714
  Params: 
    max_depth: 15
    min_child_weight: 36
    eta: 0.4407782051144824
    gamma: 0.20151244347593691
    lambda: 0.0006807252626325407
    alpha: 1.133918227265541
    subsample: 0.7323695436623067
    colsample_bytree: 0.8157949738034416


<optuna.study.study.Study at 0x150a88a50>

In [12]:
# Fetch the best parameter set found by the tuner and display it.
# NOTE(review): a later cell adds keys to this dict in place; if
# get_best_params() returns the tuner's own dict rather than a copy, the
# tuner's state is modified too — confirm.
params = xgb_tuner.get_best_params()
params

{'max_depth': 15,
 'min_child_weight': 36,
 'eta': 0.4407782051144824,
 'gamma': 0.20151244347593691,
 'lambda': 0.0006807252626325407,
 'alpha': 1.133918227265541,
 'subsample': 0.7323695436623067,
 'colsample_bytree': 0.8157949738034416}

Training Model - SuperXGBClassifier class for training and predictions

In [13]:
# Extend the tuned hyperparameters (in place) with the fixed training settings.
# NOTE(review): 'num_rounds' and 'early_stopping_rounds' are presumably
# interpreted by SuperXGBClassifier rather than passed straight to XGBoost —
# confirm against that class.
params.update({
    'objective': OptunaXGBParamGrid.error,
    'num_rounds': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 1,
    'monotone_constraints': MONOTONE_CONSTRAINTS,
})

In [14]:
# Gather the three feature/target pairs the classifier wrapper is built from:
# training, test, and calibration.
model_inputs = {
    "X_train": X_train_preproc,
    "y_train": y_train,
    "X_test": X_test_preproc,
    "y_test": y_test,
    "X_cal": X_cal_preproc,
    "y_cal": y_cal,
}
super_xgb = SuperXGBClassifier(**model_inputs, params=params)

In [15]:
# Train the model. The per-round "validation_0-logloss / validation_1-logloss"
# lines in the output come from this call.
# NOTE(review): presumably validation_0 is the training split and validation_1
# the test split — confirm in SuperXGBClassifier.fit.
super_xgb.fit()

[0]	validation_0-logloss:0.40242	validation_1-logloss:0.40435
[1]	validation_0-logloss:0.34614	validation_1-logloss:0.34962
[2]	validation_0-logloss:0.31312	validation_1-logloss:0.31851
[3]	validation_0-logloss:0.29448	validation_1-logloss:0.30180
[4]	validation_0-logloss:0.28381	validation_1-logloss:0.29283
[5]	validation_0-logloss:0.27585	validation_1-logloss:0.28650
[6]	validation_0-logloss:0.27069	validation_1-logloss:0.28271
[7]	validation_0-logloss:0.26622	validation_1-logloss:0.27944
[8]	validation_0-logloss:0.26191	validation_1-logloss:0.27702
[9]	validation_0-logloss:0.25920	validation_1-logloss:0.27600
[10]	validation_0-logloss:0.25723	validation_1-logloss:0.27536
[11]	validation_0-logloss:0.25565	validation_1-logloss:0.27460
[12]	validation_0-logloss:0.25446	validation_1-logloss:0.27455
[13]	validation_0-logloss:0.25375	validation_1-logloss:0.27431
[14]	validation_0-logloss:0.25242	validation_1-logloss:0.27421
[15]	validation_0-logloss:0.25133	validation_1-logloss:0.27423
[1

In [16]:
# Display the underlying model object held by the wrapper after fitting.
super_xgb.xgb_model

In [17]:
# List the feature names recorded on the trained booster, as a visual check
# against the FEATURES used to build the training frame.
super_xgb.xgb_model.get_booster().feature_names

['type_Kick_a0',
 'type_Handball_a0',
 'quarter_a0',
 'quarter_seconds_a0',
 'overall_seconds_a0',
 'start_x_a0',
 'start_y_a0',
 'end_x_a0',
 'end_y_a0',
 'dx_a0',
 'dy_a0',
 'movement_a0',
 'type_Kick_a1',
 'type_Handball_a1',
 'outcome_effective_a1',
 'outcome_ineffective_a1',
 'outcome_clanger_a1',
 'quarter_a1',
 'quarter_seconds_a1',
 'overall_seconds_a1',
 'start_x_a1',
 'start_y_a1',
 'end_x_a1',
 'end_y_a1',
 'dx_a1',
 'dy_a1',
 'movement_a1',
 'type_Kick_a2',
 'type_Handball_a2',
 'outcome_effective_a2',
 'outcome_ineffective_a2',
 'outcome_clanger_a2',
 'quarter_a2',
 'quarter_seconds_a2',
 'overall_seconds_a2',
 'start_x_a2',
 'start_y_a2',
 'end_x_a2',
 'end_y_a2',
 'dx_a2',
 'dy_a2',
 'movement_a2',
 'team_1',
 'team_2',
 'time_delta1',
 'time_delta2',
 'dx_a01',
 'dy_a01',
 'move_a01',
 'dx_a02',
 'dy_a02',
 'move_a02']

In [18]:
# Predictions from the fitted model for the full table, the training split and
# the test split (in that order).
preds, train_preds, test_preds = (
    super_xgb.predict(features)
    for features in (X_preproc, X_train_preproc, X_test_preproc)
)

In [19]:
# Uncalibrated probabilities from the fitted model, taking column index 1 of
# predict_proba's output (the positive class under the usual two-column
# layout) for the full table and for each split.
probas = super_xgb.predict_proba(X_preproc)[:, 1]
train_probas = super_xgb.predict_proba(X_train_preproc)[:, 1]
test_probas = super_xgb.predict_proba(X_test_preproc)[:, 1]
# Deliberately NOT named `cal_probas`: a later cell binds `cal_probas` to the
# *calibrated* probabilities for the full table, which are then averaged and
# exported. Sharing the name meant that re-running this cell out of order
# silently swapped in calibration-set-only, uncalibrated values.
cal_set_probas = super_xgb.predict_proba(X_cal_preproc)[:, 1]

In [20]:
# Fit the wrapper's probability calibration step.
# NOTE(review): presumably this uses the X_cal / y_cal supplied at
# construction and must run before predict_proba(..., calibrate=True) —
# confirm in SuperXGBClassifier.
super_xgb.calibrate()

In [21]:
# Calibrated probabilities for the full table, the training split and the test
# split (in that order). No column slicing is applied to the calibrated output.
cal_probas, train_cal_probas, test_cal_probas = (
    super_xgb.predict_proba(features, calibrate=True)
    for features in (X_preproc, X_train_preproc, X_test_preproc)
)

Check Average Predictions

In [22]:
# Full table: mean raw probability, observed response rate, mean calibrated
# probability. The three should be close if the model is well calibrated on
# average.
(
    probas.mean(),
    modelling_data[RESPONSE].mean(),
    cal_probas.mean(),
)

(0.74303824, 0.7432351454092619, 0.7413527810879347)

In [23]:
# Training split: mean raw probability, observed response rate, mean
# calibrated probability.
(
    train_probas.mean(),
    training_data[RESPONSE].mean(),
    train_cal_probas.mean(),
)

(0.74325114, 0.7437457024563662, 0.7415564299250738)

In [24]:
# Test split: mean raw probability, observed response rate, mean calibrated
# probability.
(
    test_probas.mean(),
    test_data[RESPONSE].mean(),
    test_cal_probas.mean(),
)

(0.7426029, 0.7433463636150839, 0.7409402019538036)

Export model

In [25]:
# Persist the fitted wrapper as "<model_output_path>/<model_file_name>.joblib".
model_artifact_path = f"{model_output_path}/{model_file_name}.joblib"
super_xgb.export_model(model_artifact_path)

Export data and predictions

In [27]:
# Attach the full-table predictions to the modelling table: predictions, raw
# probability, and calibrated probability.
prediction_columns = {
    'xgb_preds': preds,
    'xgb_probas': probas,
    'xgb_probas_cal': cal_probas,
}
for column_name, column_values in prediction_columns.items():
    modelling_data[column_name] = column_values

# Write the augmented table (without the index) next to the other prediction
# outputs, then preview the first rows.
predictions_csv_path = f"{prediction_output_path}/predictions_{model_file_name}.csv"
modelling_data.to_csv(predictions_csv_path, index=False)
modelling_data.head()

Unnamed: 0,match_id,chain_number,order,quarter,quarter_seconds,overall_seconds,team,player,start_x,start_y,end_x,end_y,action_type,outcome_type,type_Kick_a0,type_Handball_a0,quarter_a0,quarter_seconds_a0,overall_seconds_a0,start_x_a0,start_y_a0,end_x_a0,end_y_a0,dx_a0,dy_a0,movement_a0,type_Kick_a1,type_Handball_a1,outcome_effective_a1,outcome_ineffective_a1,outcome_clanger_a1,quarter_a1,quarter_seconds_a1,overall_seconds_a1,start_x_a1,start_y_a1,end_x_a1,end_y_a1,dx_a1,dy_a1,movement_a1,type_Kick_a2,type_Handball_a2,outcome_effective_a2,outcome_ineffective_a2,outcome_clanger_a2,quarter_a2,quarter_seconds_a2,overall_seconds_a2,start_x_a2,start_y_a2,end_x_a2,end_y_a2,dx_a2,dy_a2,movement_a2,team_1,team_2,time_delta1,time_delta2,dx_a01,dy_a01,move_a01,dx_a02,dy_a02,move_a02,Disposal,DisposalTrainingSet,DisposalTestSet,DisposalValidationSet,xgb_preds,xgb_probas,xgb_probas_cal
0,202101_BrisbaneLions_Sydney,1,3.0,1,24.0,24.0,Brisbane Lions,Dayne Zorko,9.0,-6.0,-11.0,7.0,Handball,ineffective,False,True,1,24.0,24.0,9.0,-6.0,-11.0,7.0,-20.0,13.0,23.853721,False,False,True,False,False,1,24.0,24.0,8.0,-5.0,9.0,-6.0,1.0,-1.0,1.414214,False,False,True,False,False,1,24.0,24.0,8.0,-5.0,9.0,-6.0,1.0,-1.0,1.414214,True,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,True,False,False,0,0.001389,0.00143
1,202101_BrisbaneLions_Sydney,1,5.0,1,29.0,29.0,Sydney,Oliver Florent,-12.0,5.0,-14.0,2.0,Handball,effective,False,True,1,29.0,29.0,-12.0,5.0,-14.0,2.0,-2.0,-3.0,3.605551,False,False,True,False,False,1,28.0,28.0,-11.0,7.0,-12.0,5.0,-1.0,-2.0,2.236068,False,True,False,True,False,1,24.0,24.0,9.0,-6.0,-11.0,7.0,-20.0,13.0,23.853721,True,False,-1.0,-5.0,0.0,0.0,0.0,1.0,2.0,2.236068,1,True,False,False,1,0.93296,0.923152
2,202101_BrisbaneLions_Sydney,1,7.0,1,31.0,31.0,Sydney,George Hewett,-22.0,2.0,6.0,-27.0,Kick,clanger,True,False,1,31.0,31.0,-22.0,2.0,6.0,-27.0,28.0,-29.0,40.311289,False,False,True,False,False,1,30.0,30.0,-14.0,2.0,-22.0,2.0,-8.0,0.0,8.0,False,True,True,False,False,1,29.0,29.0,-12.0,5.0,-14.0,2.0,-2.0,-3.0,3.605551,True,True,-1.0,-2.0,0.0,0.0,0.0,8.0,0.0,8.0,0,False,True,False,0,0.480095,0.500333
3,202101_BrisbaneLions_Sydney,1,9.0,1,37.0,37.0,Brisbane Lions,Hugh McCluggage,11.0,-26.0,18.0,-23.0,Handball,effective,False,True,1,37.0,37.0,11.0,-26.0,18.0,-23.0,7.0,3.0,7.615773,False,False,True,False,False,1,36.0,36.0,6.0,-27.0,11.0,-26.0,5.0,1.0,5.09902,True,False,False,False,True,1,31.0,31.0,-22.0,2.0,6.0,-27.0,28.0,-29.0,40.311289,True,False,-1.0,-6.0,0.0,0.0,0.0,-5.0,-1.0,5.09902,1,False,True,False,1,0.948358,0.938931
4,202101_BrisbaneLions_Sydney,2,17.0,1,93.0,93.0,Brisbane Lions,Oscar McInerney,0.0,0.0,0.0,0.0,Kick,effective,True,False,1,93.0,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,1,92.0,92.0,-1.0,0.0,0.0,0.0,1.0,0.0,1.0,False,False,True,False,False,1,40.0,40.0,26.0,-21.0,-0.0,-0.0,-26.0,21.0,33.42155,True,True,-1.0,-53.0,0.0,0.0,0.0,-0.0,-0.0,0.0,1,False,True,False,1,0.577267,0.591969
