Loading and Preprocessing AFL Match Data for Modelling

In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

import sys 
# Put the parent directory on the import path. This has to run before the
# total_points_model imports below (presumably the package lives one level
# up from this notebook — confirm).
sys.path.append('..')
from total_points_model.config import raw_data_file_path, preprocessed_output_path
from total_points_model.domain.preprocessing.data_preprocessor import DataPreprocessor
from total_points_model.domain.contracts.mappings import Mappings

In [2]:
# Let pandas display up to 100 rows and 999 columns before truncating,
# so the wide preprocessed frames below are shown in full.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 999

# Load IPython's autoreload extension in mode 2: modules are re-imported
# before each cell executes, so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

Load data

In [3]:
# Read the raw match file and keep only rows whose Year is greater than 2010,
# then preview the first two rows.
afl_data = pd.read_csv(raw_data_file_path).loc[
    lambda matches: matches['Year'] > 2010
]
afl_data.head(2)

Unnamed: 0,Home_Team,Venue,Round_ID,Match_ID,Year,Away_Team,Q1_Score,Q2_Score,Q3_Score,Q4_Score,Margin,Total_Game_Score,Home_Win,City,Date,Attendance,Home_Coach_ID,Away_Coach_ID,Q5_Score,Temperature,Weather_Type,Match_Status,Weather_Description,Ground_Width,Ground_Length,Home_Ground,ModellingFilter,DateTime
1111,Carlton,M.C.G.,201101,201101_Carlton_Richmond,2011.0,Richmond,2.8.20 - 3.0.18,9.10.64 - 7.0.42,9.13.67 - 13.3.81,14.20.104 - 13.6.84,20.0,188.0,1.0,Melbourne,2011-03-24,60654.0,Brett_Ratten,Damien_Hardwick,,18.0,MOSTLY_CLEAR,CONCLUDED,Showers. Windy.,141,160,Secondary Home,True,2011-03-24 19:10:00
1112,Geelong,M.C.G.,201101,201101_Geelong_StKilda,2011.0,St Kilda,0.4.4 - 2.5.17,1.7.13 - 2.9.21,4.10.34 - 4.11.35,6.12.48 - 6.11.47,1.0,95.0,1.0,Melbourne,2011-03-25,42869.0,Chris_Scott,Ross_Lyon,,17.0,MOSTLY_SUNNY,CONCLUDED,Overcast. Cool,141,160,Neutral,True,2011-03-25 19:15:00


In [4]:
# How many rows fall on each side of the ModellingFilter flag.
afl_data.loc[:, 'ModellingFilter'].value_counts()

True     1644
False     792
Name: ModellingFilter, dtype: int64

In [5]:
# Split on the ModellingFilter flag: flagged rows become the training set,
# the remaining rows become the test set.
training_data = afl_data.loc[afl_data['ModellingFilter']]
test_data = afl_data.loc[~afl_data['ModellingFilter']]

Create the preprocessor, fit it on the training data, and transform both splits

In [33]:
# Build the preprocessor, configured with the mappings defined on Mappings.
preprocessor = DataPreprocessor(mapping=Mappings.mappings)

In [34]:
# Fit on the training rows only; the same fitted preprocessor is then used
# to transform both the training and the test rows.
preprocessor.fit(training_data)

In [35]:
# Apply the fitted preprocessor to the training rows and preview the result.
training_data_preproc = preprocessor.transform(training_data)
training_data_preproc.head()

Unnamed: 0,Round,Year,Temperature,random5,Home_Total_Q4_Score_avg2,Home_Total_Q4_Goals_avg2,Home_Total_Q4_Behinds_avg2,Home_Total_Q4_Shots_avg2,Home_Total_Q4_Conversion_avg2,Home_Att_Q4_Score_avg2,Home_Att_Q4_Goals_avg2,Home_Att_Q4_Behinds_avg2,Home_Att_Q4_Shots_avg2,Home_Att_Q4_Conversion_avg2,Home_Def_Q4_Score_avg2,Home_Def_Q4_Goals_avg2,Home_Def_Q4_Behinds_avg2,Home_Def_Q4_Shots_avg2,Home_Def_Q4_Conversion_avg2,Away_Total_Q4_Score_avg2,Away_Total_Q4_Goals_avg2,Away_Total_Q4_Behinds_avg2,Away_Total_Q4_Shots_avg2,Away_Total_Q4_Conversion_avg2,Away_Att_Q4_Score_avg2,Away_Att_Q4_Goals_avg2,Away_Att_Q4_Behinds_avg2,Away_Att_Q4_Shots_avg2,Away_Att_Q4_Conversion_avg2,Away_Def_Q4_Score_avg2,Away_Def_Q4_Goals_avg2,Away_Def_Q4_Behinds_avg2,Away_Def_Q4_Shots_avg2,Away_Def_Q4_Conversion_avg2,Home_Team_Adelaide,Home_Team_Brisbane Lions,Home_Team_Carlton,Home_Team_Collingwood,Home_Team_Essendon,Home_Team_Fremantle,Home_Team_Geelong,Home_Team_Gold Coast,Home_Team_Greater Western Sydney,Home_Team_Hawthorn,Home_Team_Melbourne,Home_Team_North Melbourne,Home_Team_Port Adelaide,Home_Team_Richmond,Home_Team_St Kilda,Home_Team_Sydney,Home_Team_West Coast,Home_Team_Western Bulldogs,Away_Team_Adelaide,Away_Team_Brisbane Lions,Away_Team_Carlton,Away_Team_Collingwood,Away_Team_Essendon,Away_Team_Fremantle,Away_Team_Geelong,Away_Team_Gold Coast,Away_Team_Greater Western Sydney,Away_Team_Hawthorn,Away_Team_Melbourne,Away_Team_North Melbourne,Away_Team_Port Adelaide,Away_Team_Richmond,Away_Team_St Kilda,Away_Team_Sydney,Away_Team_West Coast,Away_Team_Western Bulldogs,Venue_Adelaide Oval,Venue_Bellerive Oval,Venue_Blacktown,Venue_Carrara,Venue_Cazalys Stadium,Venue_Docklands,Venue_Eureka Stadium,Venue_Football Park,Venue_Gabba,Venue_Jiangwan Stadium,Venue_Kardinia Park,Venue_M.C.G.,Venue_Manuka Oval,Venue_Marrara Oval,Venue_Perth Stadium,Venue_S.C.G.,Venue_Stadium Australia,Venue_Subiaco,Venue_Sydney Showground,Venue_Traeger Park,Venue_Wellington,Venue_York Park,City_Adelaide,City_Alice 
Springs,City_Ballarat,City_Brisbane,City_Cairns,City_Canberra,City_Darwin,City_Geelong,City_Gold Coast,City_Hobart,City_Launceston,City_Melbourne,City_Perth,City_Shanghai,City_Sydney,City_Wellington,Weather_Type_Bad,Weather_Type_Good
0,1,2011.0,18.0,1,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,1,2011.0,17.0,4,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,1,2011.0,21.0,1,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,1,2011.0,23.0,2,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1,2011.0,27.0,4,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [37]:
# Apply the preprocessor fitted on the training rows to the test rows.
# NOTE(review): in the displayed previews the *_avg2 columns of the first test
# rows carry exactly the same values as the first training rows — this looks
# like a fitted fill value rather than recent form carried over from the
# training period; confirm this is intended.
test_data_preproc = preprocessor.transform(test_data)

In [38]:
# Preview the first five preprocessed test rows.
test_data_preproc.head(n=5)

Unnamed: 0,Round,Year,Temperature,random5,Home_Total_Q4_Score_avg2,Home_Total_Q4_Goals_avg2,Home_Total_Q4_Behinds_avg2,Home_Total_Q4_Shots_avg2,Home_Total_Q4_Conversion_avg2,Home_Att_Q4_Score_avg2,Home_Att_Q4_Goals_avg2,Home_Att_Q4_Behinds_avg2,Home_Att_Q4_Shots_avg2,Home_Att_Q4_Conversion_avg2,Home_Def_Q4_Score_avg2,Home_Def_Q4_Goals_avg2,Home_Def_Q4_Behinds_avg2,Home_Def_Q4_Shots_avg2,Home_Def_Q4_Conversion_avg2,Away_Total_Q4_Score_avg2,Away_Total_Q4_Goals_avg2,Away_Total_Q4_Behinds_avg2,Away_Total_Q4_Shots_avg2,Away_Total_Q4_Conversion_avg2,Away_Att_Q4_Score_avg2,Away_Att_Q4_Goals_avg2,Away_Att_Q4_Behinds_avg2,Away_Att_Q4_Shots_avg2,Away_Att_Q4_Conversion_avg2,Away_Def_Q4_Score_avg2,Away_Def_Q4_Goals_avg2,Away_Def_Q4_Behinds_avg2,Away_Def_Q4_Shots_avg2,Away_Def_Q4_Conversion_avg2,Home_Team_Adelaide,Home_Team_Brisbane Lions,Home_Team_Carlton,Home_Team_Collingwood,Home_Team_Essendon,Home_Team_Fremantle,Home_Team_Geelong,Home_Team_Gold Coast,Home_Team_Greater Western Sydney,Home_Team_Hawthorn,Home_Team_Melbourne,Home_Team_North Melbourne,Home_Team_Port Adelaide,Home_Team_Richmond,Home_Team_St Kilda,Home_Team_Sydney,Home_Team_West Coast,Home_Team_Western Bulldogs,Away_Team_Adelaide,Away_Team_Brisbane Lions,Away_Team_Carlton,Away_Team_Collingwood,Away_Team_Essendon,Away_Team_Fremantle,Away_Team_Geelong,Away_Team_Gold Coast,Away_Team_Greater Western Sydney,Away_Team_Hawthorn,Away_Team_Melbourne,Away_Team_North Melbourne,Away_Team_Port Adelaide,Away_Team_Richmond,Away_Team_St Kilda,Away_Team_Sydney,Away_Team_West Coast,Away_Team_Western Bulldogs,Venue_Adelaide Oval,Venue_Bellerive Oval,Venue_Blacktown,Venue_Carrara,Venue_Cazalys Stadium,Venue_Docklands,Venue_Eureka Stadium,Venue_Football Park,Venue_Gabba,Venue_Jiangwan Stadium,Venue_Kardinia Park,Venue_M.C.G.,Venue_Manuka Oval,Venue_Marrara Oval,Venue_Perth Stadium,Venue_S.C.G.,Venue_Stadium Australia,Venue_Subiaco,Venue_Sydney Showground,Venue_Traeger Park,Venue_Wellington,Venue_York Park,City_Adelaide,City_Alice 
Springs,City_Ballarat,City_Brisbane,City_Cairns,City_Canberra,City_Darwin,City_Geelong,City_Gold Coast,City_Hobart,City_Launceston,City_Melbourne,City_Perth,City_Shanghai,City_Sydney,City_Wellington,Weather_Type_Bad,Weather_Type_Good
0,1,2019.0,25.0,4,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,1,2019.0,29.0,5,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,1,2019.0,32.0,1,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,1,2019.0,18.0,2,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1,2019.0,32.0,2,177.562424,25.767748,22.955936,48.723684,0.528387,89.538862,12.989902,11.599449,24.589351,0.525909,88.023562,12.777846,11.356487,24.134333,0.527566,177.905868,25.822127,22.973105,48.795232,0.528607,88.279951,12.818154,11.371027,24.189181,0.528045,89.625917,13.003973,11.602078,24.606051,0.526071,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


Export the modelling data to .csv files

In [39]:
# Stack the preprocessed training rows on top of the preprocessed test rows.
# Both frames are displayed above with a row index that starts at 0, so a plain
# concat would repeat index labels in the combined frame; ignore_index=True
# gives it a fresh 0..n-1 index instead. The exported CSV is unaffected because
# it is written with index=False.
modelling_data = pd.concat(
    [training_data_preproc, test_data_preproc],
    axis=0,
    ignore_index=True,
)

In [43]:
# Write the combined (training + test) frame, without the row index.
modelling_data.to_csv(
    path_or_buf=preprocessed_output_path + "/modelling_data_v2.csv",
    index=False,
)

In [44]:
# Write the preprocessed training rows, without the row index.
training_data_preproc.to_csv(
    path_or_buf=preprocessed_output_path + "/training_data_v2.csv",
    index=False,
)

In [45]:
# Write the preprocessed test rows, without the row index.
test_data_preproc.to_csv(
    path_or_buf=preprocessed_output_path + "/test_data_v2.csv",
    index=False,
)