# Modelling

In [23]:
# match data with aggregated individual data
import pandas as pd
match_path = '/Users/t_raver9/Desktop/projects/aflengine/analysis/machine_learning/src/player_data/data/matches_with_player_agg.csv'
players_path = '/Users/t_raver9/Desktop/projects/aflengine/analysis/machine_learning/src/player_data/data/players_with_player_stat_totals.csv'
matches = pd.read_csv(match_path)
players = pd.read_csv(players_path)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


## Data Preparation

For the first iteration, we'll only use data after 2009. This is when most modern statistics began to be kept (though not all of them did).

In [4]:
model_data = matches[matches['season'] >= 2010]

We want to split the data into test and train in a stratified manner, i.e. we don't want to favour a certain season, or a part of the season. So we'll take a portion (25%) of games from each round.

In [5]:
# How many games do we get per round?
round_counts = {}
curr_round = 1
matches_in_round = 0
for idx,row in model_data.iterrows():
    
    if curr_round != row['round']:
        
        if matches_in_round not in round_counts:
            round_counts[matches_in_round] = 1
        else:
            round_counts[matches_in_round] += 1
            
        curr_round = row['round']
        matches_in_round = 1
        continue
        
    else:
        matches_in_round += 1
        
round_counts

{8: 44, 7: 7, 9: 154, 6: 22}

In [6]:
# Taking a minimum 25% of each round
from math import ceil
test_sample_size = {}
for num_games in round_counts:
    test_sample_size[num_games] = ceil(num_games/4)

In [8]:
rounds_in_season = get_season_rounds(model_data)
teams_in_season = get_season_teams(model_data)

Create test and training data

In [9]:
# test set
from copy import deepcopy

test_data = pd.DataFrame()
for season, max_round in rounds_in_season.items():
    for rnd in range(1, max_round):
        round_matches = model_data[(model_data['season']==season) & (model_data['round']==rnd)]
        num_test = test_sample_size[len(round_matches)]
        round_test_set = round_matches.sample(num_test)
        test_data = test_data.append(round_test_set)
        
# training set
training_data = model_data.drop(test_data.index)

Define features

In [12]:
features = [col 
            for col 
            in ['h_career_' + col for col in player_cols_to_agg] + \
            ['h_season_' + col for col in player_cols_to_agg] + \
            ['a_career_' + col for col in player_cols_to_agg] + \
            ['a_season_' + col for col in player_cols_to_agg] + \
            ['h_' + col for col in ladder_columns] + \
            ['h_' + col + '_form' for col in ladder_columns] + \
            ['a_' + col for col in ladder_columns] + \
            ['a_' + col + '_form' for col in ladder_columns] + \
            ['h_career_' + col for col in misc_columns] + \
            ['h_season_' + col for col in misc_columns] + \
            ['a_career_' + col for col in misc_columns] + \
            ['a_season_' + col for col in misc_columns]
            ]

target = 'winner'

Set up test and train datasets

In [13]:
X_train = training_data[features]
y_train = training_data[target]
X_test = test_data[features]
y_test = test_data[target]

Fill the NaN values

In [14]:
X_train.fillna(0,inplace=True)
y_train.fillna(0,inplace=True)
X_test.fillna(0,inplace=True)
y_test.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


## Modelling

Model 1: Logistic regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

log_reg = LogisticRegression()

param_grid = {
                 'tol': [.0001, .001, .01],
                 'C': [.1, 1, 10],
                 'max_iter': [50,100,200]
             }

grid_log_reg = GridSearchCV(log_reg, param_grid, cv=5)
grid_log_reg.fit(X_train, y_train)



















GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 10], 'max_iter': [50, 100, 200],
                         'tol': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [17]:
grid_log_reg.score(X_train,y_train)

0.708550185873606

## Generating predictions

Now that we have a model, we need to ingest data for that model to make a prediction on.

In [21]:
matches[features].tail()

Unnamed: 0,h_career_AFLfantasy,h_career_Supercoach,h_career_behinds,h_career_bounces,h_career_brownlow,h_career_clangers,h_career_clearances,h_career_contested_marks,h_career_contested_poss,h_career_disposals,...,a_prem_points_form,a_played_form,a_points_for_form,a_points_against_form,a_percentage_form,a_position_form,h_career_games_played,h_season_games_played,a_career_games_played,a_season_games_played
14927,64.168319,69.22065,0.430593,0.381619,0.051072,2.190349,1.37939,0.518395,6.081885,15.310005,...,0,5,318,453,70.198675,1,115.863636,17.5,90.863636,15.181818
14928,66.22439,70.628447,0.501595,0.222014,0.091061,2.374858,1.529519,0.590786,6.589475,16.17361,...,12,5,430,413,104.116223,-1,109.727273,16.318182,94.095238,16.238095
14929,68.604675,70.779146,0.422881,0.326577,0.050622,2.138111,1.27789,0.471104,5.936039,16.539982,...,4,5,355,457,77.680525,3,73.409091,16.0,132.761905,15.0
14930,71.173017,74.816969,0.44576,0.281506,0.0943,2.31501,1.904655,0.464219,6.88351,17.316909,...,8,5,337,384,87.760417,0,116.409091,15.0,77.190476,15.571429
14931,67.142573,71.514086,0.470924,0.42921,0.091438,2.476963,1.600045,0.477188,6.267784,16.118679,...,20,5,491,342,143.567251,-2,118.090909,16.363636,97.727273,18.272727


In [22]:
matches.tail()

Unnamed: 0.1,Unnamed: 0,round,venue,date,day,time,crowd,hteam,hteam_q1,hteam_q2,...,a_season_disposal_efficiency,a_season_effective_disposals,a_season_intercepts,a_season_metres_gained,a_season_stoppage_clearances,a_season_score_involvements,a_season_tackles_in_50,a_season_turnovers,a_career_games_played,a_season_games_played
14927,14927,23,Bellerive Oval,2019-08-24,Sat,14:10:00,8202.0,North Melbourne,3.4.22,5.4.34,...,68.83611,11.992744,3.405772,266.125412,1.358133,3.883696,0.547596,3.61291,90.863636,15.181818
14928,14928,23,S.C.G.,2019-08-24,Sat,13:45:00,33722.0,Sydney,6.2.38,6.3.39,...,72.6721,12.383266,2.944996,256.505922,1.457191,4.133335,0.525543,3.107189,94.095238,16.238095
14929,14929,23,Eureka Stadium,2019-08-25,Sun,13:10:00,9560.0,Footscray,6.5.41,9.8.62,...,72.105249,12.824283,3.1997,255.888196,1.185771,4.338499,0.41649,3.171435,132.761905,15.0
14930,14930,23,Adelaide Oval,2019-08-25,Sun,16:10:00,27504.0,Port Adelaide,3.4.22,7.9.51,...,71.529123,11.938904,3.415264,241.111595,1.173426,3.844153,0.484698,3.247399,77.190476,15.571429
14931,14931,23,M.C.G.,2019-08-25,Sun,15:20:00,76995.0,Richmond,6.2.38,8.6.54,...,71.631697,11.353278,3.431237,265.25026,1.284315,3.984482,0.5784,3.004644,97.727273,18.272727


In [24]:
players.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,AFLfantasy,Supercoach,...,career_stoppage_clearances,season_score_involvements,career_score_involvements,season_tackles_in_50,career_tackles_in_50,season_turnovers,career_turnovers,career_games_played,season_games_played,next_matchid
627690,627690,627690,627690,627690,627690,627690,627690,563034,27.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,3,201220FOTRCH
627691,627691,627691,627691,627691,627691,627691,627691,563424,61.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,4,201221FOTSYD
627692,627692,627692,627692,627692,627692,627692,627692,563787,31.0,28.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,5,201222FOTGEE
627693,627693,627693,627693,627693,627693,627693,627693,564269,61.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,6,201223BRSFOT
627694,627694,627694,627694,627694,627694,627694,627694,564478,12.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,7,


How can we do this?
- we will have to enter the players who are playing in each game. Their stats are largely aggregated and ready for consumption in the players df
- we'll have to write new functions to update the ladder form, based on the ladder form of the previous games

# Metadata and functions

In [7]:
from typing import Dict
import numpy as np

def get_season_rounds(matches: pd.DataFrame) -> Dict:
    """
    Return a dictionary with seasons as keys and number of games
    in season as values
    """
    seasons = matches['season'].unique()
    rounds_in_season = dict.fromkeys(seasons,0)
    
    for season in seasons:
        rounds_in_season[season] = max(matches[matches['season']==season]['round'])
    
    return rounds_in_season

# What teams participated in each season?
def get_season_teams(matches: pd.DataFrame) -> Dict:
    """
    Return a dictionary with seasons as keys and a list of teams who played
    in that season as values
    """
    seasons = matches['season'].unique()
    teams_in_season = {}

    for season in seasons:
        teams = list(matches[matches['season']==season]['hteam'].unique())
        teams.extend(list(matches[matches['season']==season]['ateam'].unique()))
        teams = np.unique(teams)
        teams_in_season[season] = list(teams)
        
    return teams_in_season

In [11]:
player_cols_to_agg = [
    'AFLfantasy',
    'Supercoach',
    'behinds',
    'bounces',
    'brownlow',
    'clangers',
    'clearances',
    'contested_marks',
    'contested_poss',
    'disposals',
    'frees_against',
    'frees_for',
    'goal_assists',
    'goals',
    'handballs',
    'hitouts',
    'inside50',
    'kicks',
    'marks',
    'marks_in_50',
    'one_percenters',
    'rebound50',
    'tackles',
    'tog',
    'uncontested_poss',
    'centre_clearances',
    'disposal_efficiency',
    'effective_disposals',
    'intercepts',
    'metres_gained',
    'stoppage_clearances',
    'score_involvements',
    'tackles_in_50',
    'turnovers'
]

match_cols = [
    'odds',
    'line'
]

ladder_columns = [
    'wins',
    'losses',
    'draws',
    'prem_points',
    'played',
    'points_for',
    'points_against',
    'percentage',
    'position'
]

misc_columns = [
    'games_played'
]