Loading and Preprocessing AFL Match Data for Modelling

In [1]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): with no category/module filter this suppresses *every* warning
# for the rest of the kernel session, including pandas deprecation and
# chained-assignment warnings raised by later cells. Consider narrowing it.
warnings.filterwarnings('ignore')

from total_points_model.config import raw_data_file_path, preprocessed_output_path
from total_points_model.domain.preprocessing.data_preprocessor import DataPreprocessor
from total_points_model.domain.contracts.mappings import Mappings
from total_points_model.domain.contracts.rolling_columns import RollingColumns
from total_points_model.domain.contracts.modelling_data_contract import ModellingDataContract


In [None]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 999

%load_ext autoreload
%autoreload 2

Load data

In [None]:
# Read the raw match file and keep only rows whose 'Year' is strictly between
# 2004 and 2023 (i.e. 2005-2022 for integer years), dropping 2020.
# NOTE(review): the reason for excluding 2020 is not stated in this notebook —
# presumably the atypical season; confirm.
afl_data = (
    pd.read_csv(raw_data_file_path)
    .loc[lambda matches: matches['Year'].gt(2004)
         & matches['Year'].lt(2023)
         & matches['Year'].ne(2020)]
)
afl_data.head(2)

In [None]:
# Count how many rows carry each value of the train/test split flag.
(
    afl_data[ModellingDataContract.TRAIN_TEST_SPLIT_COL]
    .value_counts()
)

In [None]:
# Partition the rows with the split-flag column: rows where the flag is set go
# to training, the complement goes to test.
# assumes the flag column is boolean (it is used directly as a mask and negated with ~) — TODO confirm
train_mask = afl_data[ModellingDataContract.TRAIN_TEST_SPLIT_COL]
training_data = afl_data[train_mask]
test_data = afl_data[~train_mask]

Create Preprocessor

In [None]:
# Build the preprocessor from the project's mapping and rolling-column contracts.
preprocessor = DataPreprocessor(
    Mappings=Mappings,
    rolling_dict=RollingColumns.rolling_dict,
)

In [None]:
# Fit on the training rows only; the test rows are not passed to fit().
# NOTE(review): what fit() actually learns/stores is defined in DataPreprocessor — not visible here.
preprocessor.fit(training_data)

In [None]:
# Run the fitted preprocessor over the training rows and preview the result.
training_data_preproc = preprocessor.transform(training_data)
training_data_preproc.head()

In [None]:
# Run the same preprocessor (fitted on the training rows) over the test rows and preview the result.
test_data_preproc = preprocessor.transform(test_data)
test_data_preproc.head()

Checking Rolling Columns

In [None]:
from total_points_model.domain.preprocessing.preprocessing_functions import score_col_splitter, get_team_rolling_averages, rename_rolling_columns, merge_rolling_data

In [None]:
# training_data_full = pd.concat([training_data, training_data_preproc], axis = 1)
# brisbane_preproc = training_data_full[(training_data_full['Home_Team'] == "Brisbane Lions") | (training_data_full['Away_Team'] == "Brisbane Lions")]
# brisbane_preproc[['Home_Team', 'Away_Team', 'Total_Game_Score', 'Home_Total_Q4_Score_avg2', 'Away_Total_Q4_Score_avg2', 'Home_Total_Q4_Score_wavg2', 'Away_Total_Q4_Score_wavg2', 'Home_Total_Q4_Score_exp_wavg2', 'Away_Total_Q4_Score_exp_wavg2']].head(10)

In [None]:
# training_data = score_col_splitter(training_data, 'Q4_Score')

In [None]:
# brisbane_data = training_data[(training_data['Home_Team'] == "Brisbane Lions") | (training_data['Away_Team'] == "Brisbane Lions")]
# weights = np.array([0.25, 0.75])
# brisbane_data['Total_Q4_Score_avg2'] = brisbane_data['Total_Q4_Score'].rolling(2).mean().shift()
# brisbane_data['Total_Q4_Score_wavg2'] = brisbane_data['Total_Q4_Score'].rolling(2).apply(lambda x: np.sum(weights*x)).shift()
# brisbane_data['Total_Q4_Score_exp_wavg2'] = brisbane_data['Total_Q4_Score'].ewm(span = 2).mean().shift()
# brisbane_data[['Home_Team', 'Away_Team', 'Total_Game_Score', 'Total_Q4_Score_avg2', 'Total_Q4_Score_wavg2', 'Total_Q4_Score_exp_wavg2']].head(10)

In [None]:
# hawthorn_data = training_data[(training_data['Home_Team'] == "Hawthorn") | (training_data['Away_Team'] == "Hawthorn")]
# weights = np.array([0.25, 0.75])
# hawthorn_data['Total_Q4_Score_avg2'] = hawthorn_data['Total_Q4_Score'].rolling(2).mean().shift()
# hawthorn_data['Total_Q4_Score_wavg2'] = hawthorn_data['Total_Q4_Score'].rolling(2).apply(lambda x: np.sum(weights*x)).shift()
# hawthorn_data['Total_Q4_Score_exp_wavg2'] = hawthorn_data['Total_Q4_Score'].ewm(span = 2).mean().shift()
# hawthorn_data[['Home_Team', 'Away_Team', 'Total_Game_Score', 'Total_Q4_Score_avg2', 'Total_Q4_Score_wavg2', 'Total_Q4_Score_exp_wavg2']].head(10)

In [None]:
# 0.25*148 + 0.75*168

In [None]:
# brisbane_data.head()

Export modelling data to .csv

In [None]:
# modelling_data = pd.concat([training_data_preproc, test_data_preproc], axis=0)

In [None]:
# modelling_data.to_csv(preprocessed_output_path + "/modelling_data_v2.csv", index = False)

In [None]:
# training_data_preproc.to_csv(preprocessed_output_path + "/training_data_v2.csv", index = False)

In [None]:
# test_data_preproc.to_csv(preprocessed_output_path + "/test_data_v2.csv", index = False)