Loading and Exploring AFL Match Data for Modelling

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 100
pd.options.display.max_columns = 999

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

from total_points_model.config import raw_data_file_path, preprocessed_output_path
from total_points_model.domain.preprocessing.data_preprocessor import DataPreprocessor
from total_points_model.domain.contracts.mappings import Mappings
from total_points_model.domain.contracts.rolling_columns import RollingColumns
from total_points_model.domain.contracts.modelling_data_contract import ModellingDataContract


In [None]:
# Load the raw match data and keep seasons after 2017 and before 2023,
# with the 2020 season removed.
# NOTE(review): 2020 is presumably dropped because it was an atypical season — confirm.
afl_data = pd.read_csv(raw_data_file_path)
afl_data = afl_data[(afl_data['Year'] > 2017) & (afl_data['Year'] < 2023) & ~(afl_data['Year'] == 2020)]
afl_data.head(2)

In [None]:
# Split on the boolean 'ModellingFilter2022' column: flagged rows form the
# training set and all remaining rows form the test set.
training_data = afl_data[afl_data['ModellingFilter2022']]
test_data = afl_data[~afl_data['ModellingFilter2022']]

Response Distribution

In [None]:
# One filled density curve per season, to compare the response distribution
# across years. `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(data=afl_data, x=ModellingDataContract.RESPONSE, fill=True, hue="Year", palette="crest");

In [None]:
# Overlay the train and test response densities on a single Axes so the two
# splits can be compared directly. `fill=True` replaces the deprecated
# `shade=True` keyword, and each curve carries its own legend label.
fig, ax = plt.subplots()
sns.kdeplot(training_data[ModellingDataContract.RESPONSE], fill=True, color="r", label="Train", ax=ax)
sns.kdeplot(test_data[ModellingDataContract.RESPONSE], fill=True, color="b", label="Test", ax=ax)
ax.set_xlabel("Total Game Score")
ax.legend()
plt.show()

Create Features

In [None]:
preprocessor = DataPreprocessor(Mappings=Mappings, rolling_dict=RollingColumns.rolling_dict)

In [None]:
preprocessor.fit(training_data)

In [None]:
training_data_preproc = preprocessor.transform(training_data)
test_data_preproc = preprocessor.transform(test_data)

In [None]:
test_data_preproc.tail()

In [None]:
# Left-join the preprocessed features back onto the raw rows by Match_ID.
# 'Year' is dropped from the preprocessed frames first because the raw frames
# already carry it, so the merge would otherwise duplicate the column.
training_model_data = pd.merge(training_data, training_data_preproc.drop(columns=['Year']), how = "left", on = "Match_ID")
test_model_data = pd.merge(test_data, test_data_preproc.drop(columns=['Year']), how = "left", on = "Match_ID")

Exploration Functions

In [None]:
def _get_feature_plot_data(data, response, feature):
    
    from pandas.api.types import is_numeric_dtype
    
    plot_dict = {
    'actual':data[response],
    'feature':data[feature]
    }
    plot_data = pd.DataFrame(plot_dict)

    if is_numeric_dtype(plot_data['feature']) & (len(np.unique(plot_data['feature'])) > 50):
        bins = 10
        edges = np.linspace(plot_data['feature'].min(), plot_data['feature'].max(), bins+1).astype(float)
        labels = [f'({edges[i]}, {edges[i+1]}]' for i in range(bins)]
        plot_data['feature'] = pd.cut(plot_data['feature'], bins = bins, labels = labels)
        
    feature_plot_data = plot_data.groupby('feature').agg(
        actual = ('actual', 'mean'),
        exposure = ('actual', 'size'),
        ).reset_index()
    
    feature_plot_data['relative'] = feature_plot_data['actual'] / data[response].mean()
    
    return feature_plot_data.sort_values(by = 'feature')

In [None]:
def plot_feature(data, response, feature):
    """Plot exposure and mean response for each level of one feature.

    Bars (left axis) show the number of rows per feature level; the red line
    (right axis) shows the mean response per level, and the green horizontal
    line marks the overall mean response.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the ``response`` and ``feature`` columns.
    response : str
        Name of the response column.
    feature : str
        Name of the feature column to plot.
    """
    feature_plot_data = _get_feature_plot_data(data, response, feature)
    overall_mean = data[response].mean()

    fig, ax1 = plt.subplots(figsize=(8, 8))
    ax2 = ax1.twinx()

    ax1.bar(feature_plot_data['feature'], feature_plot_data['exposure'], alpha=0.5)
    ax2.plot(feature_plot_data['feature'], feature_plot_data['actual'], label="Actual", color="r")
    # Labelled so the reference line also appears in the legend.
    ax2.axhline(y=overall_mean, color='green', linestyle='-', label="Overall mean")

    ax1.set_xlabel(feature)
    # Bin labels are long, so draw the x tick labels vertically.
    ax1.tick_params(axis='x', labelrotation=90)

    ax1.set_ylabel("Number of Games", fontsize=14)
    ax2.set_ylabel("Total Points Scored", fontsize=14)

    ax2.legend()

    fig.suptitle("Actual: " + feature, fontsize=20)
    # plt.show() renders the figure at this point in the cell output;
    # Figure.show() does nothing on the non-GUI inline backend.
    plt.show()

In [None]:
def _get_two_way_feature_plot_data(data, response, xaxis_feature, yaxis_feature):
    
    from pandas.api.types import is_numeric_dtype
    
    plot_dict = {
    'actual':data[response],
    'xaxis_feature':data[xaxis_feature],
    'yaxis_feature':data[yaxis_feature]
    }
    plot_data = pd.DataFrame(plot_dict)
    
    if is_numeric_dtype(plot_data['xaxis_feature']) & (len(np.unique(plot_data['xaxis_feature'])) > 50):
            bins = 10
            edges = np.linspace(plot_data['xaxis_feature'].min(), plot_data['xaxis_feature'].max(), bins+1).astype(float)
            labels = [f'({edges[i]}, {edges[i+1]}]' for i in range(bins)]
            plot_data['xaxis_feature'] = pd.cut(plot_data['xaxis_feature'], bins = bins, labels = labels)
            
    if is_numeric_dtype(plot_data['yaxis_feature']) & (len(np.unique(plot_data['yaxis_feature'])) > 50):
            bins = 10
            edges = np.linspace(plot_data['yaxis_feature'].min(), plot_data['yaxis_feature'].max(), bins+1).astype(float)
            labels = [f'({edges[i]}, {edges[i+1]}]' for i in range(bins)]
            plot_data['yaxis_feature'] = pd.cut(plot_data['yaxis_feature'], bins = bins, labels = labels)
            
    feature_plot_data = plot_data.groupby(['xaxis_feature', 'yaxis_feature']).agg(
            actual = ('actual', 'mean'),
            exposure = ('actual', 'size'),
            ).reset_index()
    
    return feature_plot_data.sort_values(by='xaxis_feature')

In [None]:
def plot_two_way_feature(data, response, xaxis_feature, yaxis_feature):
    """Plot exposure and mean response across two features with plotly.

    For every level of ``yaxis_feature`` a grouped bar trace (primary y-axis)
    shows the row count per ``xaxis_feature`` level, and a line trace
    (secondary y-axis) shows the mean response. Bar and line for the same
    level share a colour.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the response and both feature columns.
    response : str
        Name of the response column; also used as the secondary axis title.
    xaxis_feature : str
        Name of the feature shown on the x-axis.
    yaxis_feature : str
        Name of the feature whose levels become separate traces.
    """
    import plotly
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    feature_plot_data = _get_two_way_feature_plot_data(data, response, xaxis_feature, yaxis_feature)

    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    palette = plotly.colors.qualitative.Plotly
    unique_levels = list(feature_plot_data['yaxis_feature'].unique())

    # Slice the data once per level. The colour index wraps around the palette
    # so features with more levels than palette entries do not raise IndexError.
    level_slices = [
        (
            str(level),
            palette[position % len(palette)],
            feature_plot_data[feature_plot_data['yaxis_feature'] == level],
        )
        for position, level in enumerate(unique_levels)
    ]

    # All bar traces are added before any line trace, which keeps the legend
    # grouped as bars first and lines second.
    for level_name, colour, level_data in level_slices:
        fig.add_trace(
            go.Bar(x=level_data['xaxis_feature'], y=level_data['exposure'],
                   name=level_name,
                   opacity=0.5,
                   marker_color=colour),
            secondary_y=False,
        )
    for level_name, colour, level_data in level_slices:
        fig.add_trace(
            go.Scatter(x=level_data['xaxis_feature'], y=level_data['actual'],
                       name=level_name,
                       marker_color=colour),
            secondary_y=True,
        )

    # Add figure title
    fig.update_layout(
        title_text="Two Way AvE: " + xaxis_feature + " x " + yaxis_feature,
        width=1000,
        height=600,
        barmode="group"
    )

    # Set x-axis title
    fig.update_xaxes(title_text=xaxis_feature)

    # Set y-axes titles
    fig.update_yaxes(title_text="Exposure", secondary_y=False)
    fig.update_yaxes(title_text=response, secondary_y=True)

    fig.show()

Time Test Features

In [None]:
modelling_data = pd.concat([training_model_data, test_model_data], axis=0)

In [None]:
test_model_data.shape

In [None]:
for weather_col in [x for x in list(training_model_data) if "Kicking" in x]:
    plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Year", weather_col)

In [None]:
for col in [x for x in list(training_model_data) if "_wavg" in x]:
    plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, col, "Year")

In [None]:
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Temperature", "Year")

Venue / City

In [None]:
city_mapping = {
    'Adelaide':"South Australia",
    'Alice Springs':'Northern Territory',
    'Ballarat':"Victoria",
    'Brisbane':"Queensland",
    'Cairns':"Queensland",
    'Canberra':"New South Wales",
    'Darwin':"Northern Territory",
    'Geelong':"Victoria",
    'Gold Coast':"Queensland",
    'Hobart':"Tasmania",
    'Launceston':"Tasmania",
    'Melbourne':"Victoria",
    'Perth':"Western Australia",
    'Shanghai':"International",
    'Sydney':"New South Wales",
    'Townsville':"Queensland"
    }
modelling_data['State'] = modelling_data['City'].replace(city_mapping)
modelling_data['State'].value_counts()

In [None]:
# Flag whether each match city is in Victoria. The flag is derived from
# `city_mapping` (defined in the State cell above) so the two mappings cannot
# disagree; the hand-written version labelled Shanghai as "Victoria" even
# though `city_mapping` lists it as "International".
vic_mapping = {
    city: "Victoria" if state == "Victoria" else "Not Victoria"
    for city, state in city_mapping.items()
}
modelling_data['Victoria'] = modelling_data['City'].replace(vic_mapping)

In [None]:
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Year", "Victoria")

In [None]:
roof_mapping = {
    'Adelaide Oval':"No Roof",
    'Bellerive Oval':"No Roof",
    'Carrara':"No Roof",
    'Cazalys Stadium':"No Roof",
    'Docklands':"Roof",
    'Eureka Stadium':"No Roof",
    'Gabba':"No Roof",
    'Jiangwan Stadium':"No Roof",
    'Kardinia Park':"No Roof",
    'M.C.G.':"No Roof",
    'Manuka Oval':"No Roof",
    'Marrara Oval':"No Roof",
    'Perth Stadium':"No Roof",
    'Riverway Stadium':"No Roof",
    'S.C.G.':"No Roof",
    'Stadium Australia':"No Roof",
    'Sydney Showground':"No Roof",
    'Traeger Park':"No Roof",
    'York Park':"No Roof"
 }
modelling_data['Roof'] = modelling_data['Venue'].replace(roof_mapping)
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Year", "Roof")

In [None]:
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Ground_Width", "Year")

In [None]:
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Ground_Length", "Year")

In [None]:
modelling_data['Ground_Area'] = modelling_data['Ground_Length'] * modelling_data['Ground_Width']
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "Ground_Area")

In [None]:
team_state_mapping = {
    'Adelaide':"South Australia",
    'Brisbane Lions':"Queensland",
    'Carlton':"Victoria",
    'Collingwood':"Victoria",
    'Essendon':"Victoria",
    'Fremantle':"Western Australia",
    'Geelong':"Victoria",
    'Gold Coast':"Queensland",
    'Greater Western Sydney':"New South Wales",
    'Hawthorn':"Victoria",
    'Melbourne':"Victoria",
    'North Melbourne':"Victoria",
    'Port Adelaide':"South Australia",
    'Richmond':"Victoria",
    'St Kilda':"Victoria",
    'Sydney':"New South Wales",
    'West Coast':"Western Australia",
    'Western Bulldogs':"Victoria"}

In [None]:
# Look up each side's home state, then flag whether the match was played there.
home_team_state = modelling_data['Home_Team'].replace(team_state_mapping)
away_team_state = modelling_data['Away_Team'].replace(team_state_mapping)
modelling_data['Home_Team_State'] = home_team_state
modelling_data['Away_Team_State'] = away_team_state
# The comparison already yields booleans, so no np.where wrapper is needed.
modelling_data['Home_Team_Within_State'] = (modelling_data['State'] == modelling_data['Home_Team_State']).to_numpy()
modelling_data['Away_Team_Within_State'] = (modelling_data['State'] == modelling_data['Away_Team_State']).to_numpy()

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "Home_Team_Within_State")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Home_Team_Within_State", "Year")

In [None]:
modelling_data['Home_Team_Within_State'].value_counts()

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "Away_Team_Within_State")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Away_Team_Within_State", "Year")

In [None]:
modelling_data['Away_Team_Within_State'].value_counts()

In [None]:
modelling_data.head()

In [None]:
modelling_data['Primary_Home'] = np.where(modelling_data['Home_Ground'] == "Primary Home", True, False)

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "Primary_Home")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Primary_Home", "Year")

In [None]:
modelling_data.head()

In [None]:
# Flag matches whose Round_ID contains 'F'. `na=False` maps a missing Round_ID
# to False; without it str.contains returns NaN, which np.where treats as truthy.
modelling_data['Finals'] = modelling_data['Round_ID'].str.contains('F', na=False).to_numpy()

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "Finals")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Finals", "Year")

Close games aren't high scoring

In [None]:
plot_feature(afl_data, ModellingDataContract.RESPONSE, "Margin")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "Margin", "Year")

ELO

In [None]:
afl_data.head()

In [None]:
from total_points_model.domain.preprocessing.preprocessing_functions import score_col_splitter

In [None]:
elo_data = afl_data.copy()
# elo_data = score_col_splitter(elo_data, "Q4_Score")
elo_data.head()

In [None]:
def calculate_elo_ratings(data, k_factor):
    """Run an Elo rating system over the matches in ``data``, in row order.

    Every team in ``ModellingDataContract.team_list`` starts at 1500. For each
    match the pre-game ratings and win probabilities are recorded, then both
    ratings are updated from the result: a positive ``Margin`` counts as a
    home win, a negative one as an away win and zero as a draw.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Match_ID``, ``Margin``, ``Home_Team`` and ``Away_Team``.
        Rows are processed in the order given.
        NOTE(review): assumes rows are already in chronological order — confirm.
    k_factor : float
        Maximum rating change per match.

    Returns
    -------
    elos : dict
        ``Match_ID -> [home_elo, away_elo]`` as they stood before the match.
    elo_dict : dict
        ``team -> rating`` after all matches have been processed.
    elo_probs : dict
        ``Match_ID -> [home_win_prob, away_win_prob]`` before the match.

    Raises
    ------
    KeyError
        If a team in ``data`` is not in ``ModellingDataContract.team_list``.
    """
    # Initialise a dictionary with default elos for each team
    elo_dict = {team: 1500 for team in ModellingDataContract.team_list}
    elos, elo_probs = {}, {}

    # Zipping the four columns avoids building a Series per row (iterrows).
    match_rows = zip(data['Match_ID'], data['Margin'], data['Home_Team'], data['Away_Team'])
    for game_id, margin, home_team, away_team in match_rows:

        # Only the first row seen for a Match_ID updates the ratings.
        if game_id in elos:
            continue

        home_team_elo = elo_dict[home_team]
        away_team_elo = elo_dict[away_team]

        prob_win_home = 1 / (1 + 10**((away_team_elo - home_team_elo) / 400))
        prob_win_away = 1 - prob_win_home

        elos[game_id] = [home_team_elo, away_team_elo]
        elo_probs[game_id] = [prob_win_home, prob_win_away]

        # A missing margin matches none of >0, <0 or ==0. Leave both ratings
        # untouched rather than reusing the previous match's updated values.
        if pd.isna(margin):
            continue

        if margin > 0:
            home_result = 1
        elif margin < 0:
            home_result = 0
        else:
            home_result = 0.5
        away_result = 1 - home_result

        elo_dict[home_team] = home_team_elo + k_factor*(home_result - prob_win_home)
        elo_dict[away_team] = away_team_elo + k_factor*(away_result - prob_win_away)

    return elos, elo_dict, elo_probs

In [None]:
elo, elo_dict, elo_probs = calculate_elo_ratings(elo_data, k_factor=30)

In [None]:
def convert_elo_dict_to_dataframe(elo_dict, prefix="ELO"):
    """Turn a ``Match_ID -> [home, away]`` mapping into a tidy DataFrame.

    Parameters
    ----------
    elo_dict : dict
        Maps each Match_ID to a two-element ``[home_value, away_value]`` list.
        An empty dict yields an empty frame with the full set of columns.
    prefix : str, default "ELO"
        Prefix for the generated column names, so the same helper can be used
        for other per-match pairs without the column names colliding.

    Returns
    -------
    pandas.DataFrame
        Columns ``Match_ID``, ``<prefix>_Home``, ``<prefix>_Away``,
        ``<prefix>_diff`` (home minus away) and ``<prefix>_abs_diff``.
    """
    home_col = f"{prefix}_Home"
    away_col = f"{prefix}_Away"
    diff_col = f"{prefix}_diff"
    abs_diff_col = f"{prefix}_abs_diff"

    # reshape(-1, 2) gives a (0, 2) array for an empty dict, so the column
    # slices below work with no matches at all.
    pairs = np.asarray(list(elo_dict.values()), dtype=float).reshape(-1, 2)

    elo_df = pd.DataFrame({
        'Match_ID': list(elo_dict.keys()),
        home_col: pairs[:, 0],
        away_col: pairs[:, 1],
    })
    elo_df[diff_col] = elo_df[home_col] - elo_df[away_col]
    elo_df[abs_diff_col] = elo_df[diff_col].abs()

    return elo_df

In [None]:
def merge_elo_ratings(X, elo_dict):
    """Left-join the per-match columns built from ``elo_dict`` onto ``X``.

    ``elo_dict`` is converted with ``convert_elo_dict_to_dataframe`` and the
    result is joined on ``Match_ID``; every row of ``X`` is kept.
    """
    ratings_frame = convert_elo_dict_to_dataframe(elo_dict)
    return pd.merge(left=X, right=ratings_frame, on='Match_ID', how='left')

In [None]:
# elo_df = pd.DataFrame(list(elo.items()), columns = ['Match_ID', 'ELO_list'])
# elo_df[['ELO_Home', 'ELO_Away']] = elo_df['ELO_list'].tolist()
# elo_df['ELO_diff'] = elo_df['ELO_Home'] - elo_df['ELO_Away']
# elo_df['ELO_abs_diff'] = abs(elo_df['ELO_diff'])
# elo_df = elo_df.drop(columns = ['ELO_list'])
# elo_df.tail()

In [None]:
# elo_probs_df = pd.DataFrame(list(elo_probs.items()), columns = ['Match_ID', 'ELO_probs_list'])
# elo_probs_df[['ELO_probs_Home', 'ELO_probs_Away']] = elo_probs_df['ELO_probs_list'].tolist()
# elo_probs_df['ELO_probs_diff'] = elo_probs_df['ELO_probs_Home'] - elo_probs_df['ELO_probs_Away']
# elo_probs_df['ELO_probs_abs_diff'] = abs(elo_probs_df['ELO_probs_diff'])
# elo_probs_df = elo_probs_df.drop(columns = ['ELO_probs_list'])
# elo_probs_df.tail()

In [None]:
# modelling_data = pd.merge(modelling_data, elo_df, how = 'left', on = 'Match_ID')

In [None]:
# modelling_data = pd.merge(modelling_data, elo_probs_df, how = 'left', on = 'Match_ID')

In [None]:
def create_elo_rating_factor(X):
    """Add pre-match Elo ratings and Elo win probabilities to ``X``.

    Ratings are computed over ``X`` with ``ModellingDataContract.ELO_K_FACTOR``
    and joined on ``Match_ID``.

    Parameters
    ----------
    X : pandas.DataFrame
        Match data with the columns required by ``calculate_elo_ratings``.

    Returns
    -------
    pandas.DataFrame
        ``X`` with the added columns ``ELO_Home``, ``ELO_Away``, ``ELO_diff``,
        ``ELO_abs_diff`` and ``ELO_probs_Home``, ``ELO_probs_Away``,
        ``ELO_probs_diff``, ``ELO_probs_abs_diff``.
    """
    elos, _, elo_probs = calculate_elo_ratings(X, k_factor=ModellingDataContract.ELO_K_FACTOR)

    X = merge_elo_ratings(X, elos)

    # The probability frame comes back with the same ELO_* column names as the
    # ratings frame. Merging it unchanged makes pandas suffix both sets with
    # _x/_y, so neither 'ELO_Home' nor 'ELO_probs_Home' would exist afterwards.
    elo_probs_df = convert_elo_dict_to_dataframe(elo_probs).rename(
        columns=lambda name: name.replace("ELO_", "ELO_probs_", 1) if name.startswith("ELO_") else name
    )
    X = pd.merge(X, elo_probs_df, how='left', on='Match_ID')

    return X
modelling_data = create_elo_rating_factor(modelling_data)

In [None]:
modelling_data.head()

In [None]:
list(modelling_data)

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_Home")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_Home", "Year")


In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_Away")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_Away", "Year")

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_diff")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_diff", "Year")

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_abs_diff")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_abs_diff", "Year")

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_probs_diff")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_probs_diff", "Year")

In [None]:
plot_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_probs_Home")
plot_two_way_feature(modelling_data, ModellingDataContract.RESPONSE, "ELO_probs_Home", "Year")