In [1]:
import pandas as pd
import numpy as np
import joblib

# Remove pandas' display truncation: None means "no limit" for both options,
# so every column (and every row) of a rendered frame is shown.
# NOTE(review): with max_rows unlimited, displaying a whole frame by accident
# would render all of its rows; the cells in this notebook only call
# .head() / .shape, so max_columns alone may be all that is needed - confirm.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings
# raised by pandas or the project code - consider narrowing it to specific
# categories once the noisy warnings are identified.
warnings.filterwarnings("ignore")  

from expected_disposal_model.config import raw_file_path, modelling_file_path, preprocessor_file_path
from expected_disposal_model.data_preparation.data_preprocessor import Preprocessor
from expected_disposal_model.data_preparation.preprocessing import convert_chains_to_schema
from expected_disposal_model.data_preparation.preprocessing import filter_disposals, create_labels
from expected_disposal_model.data_preparation.preprocessing import get_stratified_train_test_val_columns
from expected_disposal_model.modelling_data_contract import ModellingDataContract


Load Data

In [2]:
# Read the raw chain data from the CSV path supplied by
# expected_disposal_model.config (raw_file_path), then preview the first rows.
data = pd.read_csv(raw_file_path)
data.head()

Unnamed: 0,Chain_Number,Initial_State,Final_State,Order,Quarter,Quarter_Duration_Chain_Start,Quarter_Duration,Team_Chain,Team,Player,AFL_API_Player_ID,Description,x,y,Disposal,Shot_At_Goal,Behind_Detail,Venue_Width,Venue_Length,Home_Team,Away_Team,Home_Team_Direction_Q1,Match_ID,Round_ID,Year,Season
0,1,centreBounce,goal,1.0,1,13,13.0,Brisbane Lions,,,,Centre Bounce,0.0,0.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,
1,1,centreBounce,goal,2.0,1,13,24.0,Brisbane Lions,Brisbane Lions,Dayne Zorko,Dayne_Zorko,Hard Ball Get,8.0,-5.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,
2,1,centreBounce,goal,3.0,1,13,24.0,Brisbane Lions,Brisbane Lions,Dayne Zorko,Dayne_Zorko,Handball,9.0,-6.0,ineffective,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,
3,1,centreBounce,goal,4.0,1,13,28.0,Brisbane Lions,Sydney,Oliver Florent,Oliver_Florent,Loose Ball Get,11.0,-7.0,,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,
4,1,centreBounce,goal,5.0,1,13,29.0,Brisbane Lions,Sydney,Oliver Florent,Oliver_Florent,Handball,12.0,-5.0,effective,,,138,156,Brisbane Lions,Sydney,right,202101_BrisbaneLions_Sydney,202101,2021.0,


In [3]:
# (rows, columns) of the raw frame loaded above.
data.shape

(1287870, 26)

Preprocess Data

In [4]:
# Pass the raw frame through the project's convert_chains_to_schema helper,
# then through filter_disposals. Both are imported from
# expected_disposal_model.data_preparation.preprocessing.
# NOTE(review): presumably the result holds only Kick/Handball rows in the
# project schema (the preview further down shows action_type values of
# Handball and Kick) - confirm against the helper implementations.
schema_chains = convert_chains_to_schema(data)
disposals = filter_disposals(schema_chains)

In [5]:
# (rows, columns) of the filtered disposals frame.
disposals.shape

(427208, 14)

In [6]:
# Create the project Preprocessor and fit it on the raw chain data.
# NOTE(review): the fit uses the full dataset, before the train/test/validation
# indicator columns are created later in the notebook. If Preprocessor learns
# any statistics from the data, confirm this does not leak information from
# the test/validation rows.
preproc = Preprocessor()
preproc.fit(data)

In [7]:
# Apply the fitted preprocessor to the raw chain data to obtain X.
X = preproc.transform(data)

In [8]:
# Preview the first rows of the transformed frame.
X.head()

Unnamed: 0,type_Kick_a0,type_Handball_a0,quarter_a0,quarter_seconds_a0,overall_seconds_a0,start_x_a0,start_y_a0,end_x_a0,end_y_a0,dx_a0,dy_a0,movement_a0,type_Kick_a1,type_Handball_a1,outcome_effective_a1,outcome_ineffective_a1,outcome_clanger_a1,quarter_a1,quarter_seconds_a1,overall_seconds_a1,start_x_a1,start_y_a1,end_x_a1,end_y_a1,dx_a1,dy_a1,movement_a1,type_Kick_a2,type_Handball_a2,outcome_effective_a2,outcome_ineffective_a2,outcome_clanger_a2,quarter_a2,quarter_seconds_a2,overall_seconds_a2,start_x_a2,start_y_a2,end_x_a2,end_y_a2,dx_a2,dy_a2,movement_a2,team_1,team_2,time_delta1,time_delta2,dx_a01,dy_a01,move_a01,dx_a02,dy_a02,move_a02
2,False,True,1,24.0,24.0,9.0,-6.0,-11.0,7.0,-20.0,13.0,23.853721,False,False,True,False,False,1,24.0,24.0,8.0,-5.0,9.0,-6.0,1.0,-1.0,1.414214,False,False,True,False,False,1,24.0,24.0,8.0,-5.0,9.0,-6.0,1.0,-1.0,1.414214,True,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,False,True,1,29.0,29.0,-12.0,5.0,-14.0,2.0,-2.0,-3.0,3.605551,False,False,True,False,False,1,28.0,28.0,-11.0,7.0,-12.0,5.0,-1.0,-2.0,2.236068,False,True,False,True,False,1,24.0,24.0,9.0,-6.0,-11.0,7.0,-20.0,13.0,23.853721,True,False,-1.0,-5.0,0.0,0.0,0.0,1.0,2.0,2.236068
6,True,False,1,31.0,31.0,-22.0,2.0,6.0,-27.0,28.0,-29.0,40.311289,False,False,True,False,False,1,30.0,30.0,-14.0,2.0,-22.0,2.0,-8.0,0.0,8.0,False,True,True,False,False,1,29.0,29.0,-12.0,5.0,-14.0,2.0,-2.0,-3.0,3.605551,True,True,-1.0,-2.0,0.0,0.0,0.0,8.0,0.0,8.0
8,False,True,1,37.0,37.0,11.0,-26.0,18.0,-23.0,7.0,3.0,7.615773,False,False,True,False,False,1,36.0,36.0,6.0,-27.0,11.0,-26.0,5.0,1.0,5.09902,True,False,False,False,True,1,31.0,31.0,-22.0,2.0,6.0,-27.0,28.0,-29.0,40.311289,True,False,-1.0,-6.0,0.0,0.0,0.0,-5.0,-1.0,5.09902
16,True,False,1,93.0,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,1,92.0,92.0,-1.0,0.0,0.0,0.0,1.0,0.0,1.0,False,False,True,False,False,1,40.0,40.0,26.0,-21.0,-0.0,-0.0,-26.0,21.0,33.42155,True,True,-1.0,-53.0,0.0,0.0,0.0,-0.0,-0.0,0.0


In [9]:
# (rows, columns) of the transformed frame.
X.shape

(427208, 52)

Create Labels

In [10]:
# Build the labels from the raw chain data with the project's create_labels helper.
# NOTE(review): the helper receives the raw frame rather than `disposals`;
# presumably it applies the same filtering internally - verify.
y = create_labels(data)

In [11]:
# Shape of the label object.
y.shape

(427208,)

Combine Data

In [12]:
# Join the disposal rows, the transformed features and the labels side by side.
#
# A column-wise pd.concat aligns its inputs on the row index and, with the
# default outer join, keeps the union of index labels. If the three inputs did
# not share the same index, the result would silently gain NaN-padded rows
# instead of raising. The checks below make such a mismatch fail loudly.
assert len(disposals) == len(X) == len(y), (
    f"Row count mismatch: disposals={len(disposals)}, X={len(X)}, y={len(y)}"
)
modelling_data = pd.concat([disposals, X, y], axis='columns')
# Extra rows after the concat mean at least one index label was not shared.
assert len(modelling_data) == len(disposals), (
    f"Index misalignment: concat produced {len(modelling_data)} rows, "
    f"expected {len(disposals)}"
)
modelling_data.head()

Unnamed: 0,match_id,chain_number,order,quarter,quarter_seconds,overall_seconds,team,player,start_x,start_y,end_x,end_y,action_type,outcome_type,type_Kick_a0,type_Handball_a0,quarter_a0,quarter_seconds_a0,overall_seconds_a0,start_x_a0,start_y_a0,end_x_a0,end_y_a0,dx_a0,dy_a0,movement_a0,type_Kick_a1,type_Handball_a1,outcome_effective_a1,outcome_ineffective_a1,outcome_clanger_a1,quarter_a1,quarter_seconds_a1,overall_seconds_a1,start_x_a1,start_y_a1,end_x_a1,end_y_a1,dx_a1,dy_a1,movement_a1,type_Kick_a2,type_Handball_a2,outcome_effective_a2,outcome_ineffective_a2,outcome_clanger_a2,quarter_a2,quarter_seconds_a2,overall_seconds_a2,start_x_a2,start_y_a2,end_x_a2,end_y_a2,dx_a2,dy_a2,movement_a2,team_1,team_2,time_delta1,time_delta2,dx_a01,dy_a01,move_a01,dx_a02,dy_a02,move_a02,Disposal
2,202101_BrisbaneLions_Sydney,1,3.0,1,24.0,24.0,Brisbane Lions,Dayne Zorko,9.0,-6.0,-11.0,7.0,Handball,ineffective,False,True,1,24.0,24.0,9.0,-6.0,-11.0,7.0,-20.0,13.0,23.853721,False,False,True,False,False,1,24.0,24.0,8.0,-5.0,9.0,-6.0,1.0,-1.0,1.414214,False,False,True,False,False,1,24.0,24.0,8.0,-5.0,9.0,-6.0,1.0,-1.0,1.414214,True,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,202101_BrisbaneLions_Sydney,1,5.0,1,29.0,29.0,Sydney,Oliver Florent,-12.0,5.0,-14.0,2.0,Handball,effective,False,True,1,29.0,29.0,-12.0,5.0,-14.0,2.0,-2.0,-3.0,3.605551,False,False,True,False,False,1,28.0,28.0,-11.0,7.0,-12.0,5.0,-1.0,-2.0,2.236068,False,True,False,True,False,1,24.0,24.0,9.0,-6.0,-11.0,7.0,-20.0,13.0,23.853721,True,False,-1.0,-5.0,0.0,0.0,0.0,1.0,2.0,2.236068,1
6,202101_BrisbaneLions_Sydney,1,7.0,1,31.0,31.0,Sydney,George Hewett,-22.0,2.0,6.0,-27.0,Kick,clanger,True,False,1,31.0,31.0,-22.0,2.0,6.0,-27.0,28.0,-29.0,40.311289,False,False,True,False,False,1,30.0,30.0,-14.0,2.0,-22.0,2.0,-8.0,0.0,8.0,False,True,True,False,False,1,29.0,29.0,-12.0,5.0,-14.0,2.0,-2.0,-3.0,3.605551,True,True,-1.0,-2.0,0.0,0.0,0.0,8.0,0.0,8.0,0
8,202101_BrisbaneLions_Sydney,1,9.0,1,37.0,37.0,Brisbane Lions,Hugh McCluggage,11.0,-26.0,18.0,-23.0,Handball,effective,False,True,1,37.0,37.0,11.0,-26.0,18.0,-23.0,7.0,3.0,7.615773,False,False,True,False,False,1,36.0,36.0,6.0,-27.0,11.0,-26.0,5.0,1.0,5.09902,True,False,False,False,True,1,31.0,31.0,-22.0,2.0,6.0,-27.0,28.0,-29.0,40.311289,True,False,-1.0,-6.0,0.0,0.0,0.0,-5.0,-1.0,5.09902,1
16,202101_BrisbaneLions_Sydney,2,17.0,1,93.0,93.0,Brisbane Lions,Oscar McInerney,0.0,0.0,0.0,0.0,Kick,effective,True,False,1,93.0,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,1,92.0,92.0,-1.0,0.0,0.0,0.0,1.0,0.0,1.0,False,False,True,False,False,1,40.0,40.0,26.0,-21.0,-0.0,-0.0,-26.0,21.0,33.42155,True,True,-1.0,-53.0,0.0,0.0,0.0,-0.0,-0.0,0.0,1


In [13]:
# (rows, columns) of the combined modelling frame.
modelling_data.shape

(427208, 67)

Create Train Test Validation Split

In [14]:
# Add the train/test/validation indicator columns via the project helper,
# using the response column named by ModellingDataContract.RESPONSE.
# NOTE(review): this rebinds `modelling_data` to the helper's output, so
# re-running the cell passes the already-augmented frame back in - confirm the
# helper tolerates that, or restart the kernel before re-running.
# NOTE(review): no random seed is passed here; confirm the helper fixes one
# internally so the split is reproducible.
modelling_data = get_stratified_train_test_val_columns(modelling_data, response=ModellingDataContract.RESPONSE)
modelling_data.head()

Unnamed: 0,match_id,chain_number,order,quarter,quarter_seconds,overall_seconds,team,player,start_x,start_y,end_x,end_y,action_type,outcome_type,type_Kick_a0,type_Handball_a0,quarter_a0,quarter_seconds_a0,overall_seconds_a0,start_x_a0,start_y_a0,end_x_a0,end_y_a0,dx_a0,dy_a0,movement_a0,type_Kick_a1,type_Handball_a1,outcome_effective_a1,outcome_ineffective_a1,outcome_clanger_a1,quarter_a1,quarter_seconds_a1,overall_seconds_a1,start_x_a1,start_y_a1,end_x_a1,end_y_a1,dx_a1,dy_a1,movement_a1,type_Kick_a2,type_Handball_a2,outcome_effective_a2,outcome_ineffective_a2,outcome_clanger_a2,quarter_a2,quarter_seconds_a2,overall_seconds_a2,start_x_a2,start_y_a2,end_x_a2,end_y_a2,dx_a2,dy_a2,movement_a2,team_1,team_2,time_delta1,time_delta2,dx_a01,dy_a01,move_a01,dx_a02,dy_a02,move_a02,Disposal,DisposalTrainingSet,DisposalTestSet,DisposalValidationSet
2,202101_BrisbaneLions_Sydney,1,3.0,1,24.0,24.0,Brisbane Lions,Dayne Zorko,9.0,-6.0,-11.0,7.0,Handball,ineffective,False,True,1,24.0,24.0,9.0,-6.0,-11.0,7.0,-20.0,13.0,23.853721,False,False,True,False,False,1,24.0,24.0,8.0,-5.0,9.0,-6.0,1.0,-1.0,1.414214,False,False,True,False,False,1,24.0,24.0,8.0,-5.0,9.0,-6.0,1.0,-1.0,1.414214,True,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,True,False,False
4,202101_BrisbaneLions_Sydney,1,5.0,1,29.0,29.0,Sydney,Oliver Florent,-12.0,5.0,-14.0,2.0,Handball,effective,False,True,1,29.0,29.0,-12.0,5.0,-14.0,2.0,-2.0,-3.0,3.605551,False,False,True,False,False,1,28.0,28.0,-11.0,7.0,-12.0,5.0,-1.0,-2.0,2.236068,False,True,False,True,False,1,24.0,24.0,9.0,-6.0,-11.0,7.0,-20.0,13.0,23.853721,True,False,-1.0,-5.0,0.0,0.0,0.0,1.0,2.0,2.236068,1,True,False,False
6,202101_BrisbaneLions_Sydney,1,7.0,1,31.0,31.0,Sydney,George Hewett,-22.0,2.0,6.0,-27.0,Kick,clanger,True,False,1,31.0,31.0,-22.0,2.0,6.0,-27.0,28.0,-29.0,40.311289,False,False,True,False,False,1,30.0,30.0,-14.0,2.0,-22.0,2.0,-8.0,0.0,8.0,False,True,True,False,False,1,29.0,29.0,-12.0,5.0,-14.0,2.0,-2.0,-3.0,3.605551,True,True,-1.0,-2.0,0.0,0.0,0.0,8.0,0.0,8.0,0,False,True,False
8,202101_BrisbaneLions_Sydney,1,9.0,1,37.0,37.0,Brisbane Lions,Hugh McCluggage,11.0,-26.0,18.0,-23.0,Handball,effective,False,True,1,37.0,37.0,11.0,-26.0,18.0,-23.0,7.0,3.0,7.615773,False,False,True,False,False,1,36.0,36.0,6.0,-27.0,11.0,-26.0,5.0,1.0,5.09902,True,False,False,False,True,1,31.0,31.0,-22.0,2.0,6.0,-27.0,28.0,-29.0,40.311289,True,False,-1.0,-6.0,0.0,0.0,0.0,-5.0,-1.0,5.09902,1,False,True,False
16,202101_BrisbaneLions_Sydney,2,17.0,1,93.0,93.0,Brisbane Lions,Oscar McInerney,0.0,0.0,0.0,0.0,Kick,effective,True,False,1,93.0,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,1,92.0,92.0,-1.0,0.0,0.0,0.0,1.0,0.0,1.0,False,False,True,False,False,1,40.0,40.0,26.0,-21.0,-0.0,-0.0,-26.0,21.0,33.42155,True,True,-1.0,-53.0,0.0,0.0,0.0,-0.0,-0.0,0.0,1,False,True,False


Export Data

In [15]:
# Write the modelling frame to the configured CSV path.
# index=False leaves the row index out of the file.
modelling_data.to_csv(modelling_file_path, index=False)

Export Preprocessor

In [16]:
# Serialise the fitted preprocessor to the configured path with joblib.
# joblib.dump returns the list of written file names, which the notebook
# displays as this cell's output.
# NOTE(review): the recorded output path ends in 'disposal_preorocessor.joblib',
# which looks like a misspelling of "preprocessor" in the configured file name -
# confirm in expected_disposal_model.config before anything else depends on it.
joblib.dump(preproc, preprocessor_file_path)

['/Users/ciaran/Documents/Projects/AFL/git-repositories/expected-disposal-model/model_outputs/preprocessors/disposal_preorocessor.joblib']