In [1]:
# Project-local helpers.
# NOTE(review): the wildcard import hides which names this notebook takes from
# the module; switch to explicit names once the module's exports are confirmed.
from data_preparation_functions import *
import pandas as pd
import numpy as np
# The plotting API (figure, plot, subplots, ...) lives in the pyplot submodule;
# binding the top-level `matplotlib` package to `plt` does not expose it.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn import linear_model, tree, discriminant_analysis, naive_bayes, ensemble, gaussian_process
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss, confusion_matrix
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

## 3. Model Building & Hyperparameter Tuning

This section covers three main points:
1. Choosing which Machine Learning algorithm to use from a variety of choices
2. Hyperparameter Tuning
3. Overfitting/Underfitting

### Choosing an Algorithm

The most reliable way to choose an algorithm is to try several candidates and compare their metrics. <br>
To do this, we define a function that iterates over a number of algorithms and indicates which of them are well suited to this dataset and task.

In [3]:
# Build the full feature table. create_feature_df is not defined in this
# notebook; presumably it comes from the data_preparation_functions wildcard
# import (verify). It prints its progress as it runs.
features = create_feature_df()

Creating all games feature DataFrame
Creating stats feature DataFrame
Creating odds feature DataFrame
Creating market values feature DataFrame
Filling NAs
Merging stats, odds and market values into one features DataFrame
Complete.


In [4]:
# Show the feature table as this cell's output (pandas elides the middle rows).
features

Unnamed: 0,Date,gameId,HomeTeam,season,f_homeWinPc38Home,f_homeWinPc5Home,f_awayWinPc38Home,f_awayWinPc5Home,f_eloForHome,f_eloAgainstHome,f_wtEloGoalsForHome,f_wtEloGoalsAgainstHome,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_halfTimeGoalsAgainstHome,f_halfTimeGoalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,f_shotsOnTargetAgainstHome,f_shotsOnTargetForHome,f_yellowsAgainstHome,f_yellowsForHome,f_avAsianHandicapOddsAgainstHome,f_avAsianHandicapOddsForHome,f_avgreaterthan2.5Home,f_avlessthan2.5Home,f_sizeOfHandicapHome,AwayTeam,f_homeWinPc38Away,f_homeWinPc5Away,f_awayWinPc38Away,f_awayWinPc5Away,f_eloForAway,f_eloAgainstAway,f_wtEloGoalsForAway,f_wtEloGoalsAgainstAway,f_cornersAgainstAway,f_cornersForAway,f_freesAgainstAway,f_freesForAway,f_goalsAgainstAway,f_goalsForAway,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway,f_avAsianHandicapOddsAgainstAway,f_avAsianHandicapOddsForAway,f_avgreaterthan2.5Away,f_avlessthan2.5Away,f_sizeOfHandicapAway,f_attMktH%,f_attMktA%,f_midMktH%,f_midMktA%,f_defMktH%,f_defMktA%,f_gkMktH%,f_gkMktA%,f_totalMktH%,f_totalMktA%,result,f_awayOdds,f_drawOdds,f_homeOdds
20,2005-08-23,21,Birmingham,0506,0.394737,0.4,0.263158,0.2,1478.687038,1492.866048,1.061763,1.260223,4.979592,7.530612,12.000000,9.938776,1.020408,0.510204,0.510204,0.510204,0.000000,0.000000,11.938776,8.020408,6.489796,2.979592,1.000000,2.510204,1.909000,1.945500,2.051000,1.673500,-0.137500,Middlesbrough,0.394737,0.4,0.263158,0.2,1492.866048,1478.687038,1.129940,1.279873,2.551020,5.510204,13.551020,13.428571,1.020408,0.000000,0.000000,0.000000,0.000000,0.489796,17.020408,8.081633,7.510204,2.510204,3.000000,1.489796,1.939500,1.909500,2.003500,1.715500,0.387500,5.132983,5.260851,3.341048,4.289788,3.502318,4.168935,2.332815,3.216457,3.934396,4.522205,away,2.75,3.20,2.50
21,2005-08-23,22,Portsmouth,0506,0.447368,0.4,0.263158,0.4,1405.968416,1489.229314,1.147101,1.503051,2.510204,4.959184,21.979592,16.061224,2.000000,0.510204,1.000000,0.000000,0.000000,0.000000,8.448980,10.489796,3.959184,4.448980,3.020408,1.530612,1.896500,1.969000,2.004000,1.700500,0.250000,Aston Villa,0.447368,0.4,0.263158,0.4,1489.229314,1405.968416,1.175160,1.263229,9.530612,7.000000,14.469388,17.571429,1.489796,0.979592,0.979592,0.979592,0.000000,0.000000,15.551020,3.000000,9.061224,2.510204,2.000000,0.510204,1.856500,1.977000,1.850500,1.848500,0.712500,3.738614,3.878659,4.494368,4.954673,2.884262,4.065926,3.746642,5.372543,3.743410,4.365456,draw,2.75,3.20,2.50
22,2005-08-23,23,Sunderland,0506,0.236842,0.0,0.236842,0.4,1277.888970,1552.291880,0.650176,1.543716,5.000000,5.000000,12.408163,17.551020,1.979592,0.489796,1.000000,0.489796,0.489796,0.510204,14.510204,6.897959,5.020408,3.918367,1.020408,2.510204,1.852000,1.991500,1.853500,1.850000,0.712500,Man City,0.236842,0.0,0.236842,0.4,1552.291880,1277.888970,1.288750,1.287367,7.530612,3.510204,8.959184,12.489796,0.510204,1.020408,0.510204,0.510204,0.000000,0.000000,10.959184,11.938776,2.489796,6.979592,3.000000,1.489796,1.815000,2.039500,2.006000,1.709500,-0.200000,0.706318,3.750792,1.476812,1.070209,2.634096,4.455890,0.777605,4.913050,1.499427,3.151477,away,2.50,3.20,2.75
23,2005-08-24,24,Arsenal,0506,0.736842,1.0,0.236842,0.2,1729.086068,1481.943781,2.099593,0.921523,3.000000,7.489796,17.000000,18.061224,0.510204,0.979592,0.000000,0.000000,0.489796,0.000000,5.571429,11.938776,3.551020,7.408163,1.510204,1.530612,1.945500,1.909000,1.876000,1.828500,-0.287500,Fulham,0.736842,1.0,0.236842,0.2,1481.943781,1729.086068,1.170928,1.323440,7.020408,3.448980,19.632653,13.020408,1.020408,0.510204,0.510204,0.000000,0.000000,0.000000,11.591837,11.428571,6.551020,5.469388,2.000000,1.510204,2.061000,1.799000,2.023500,1.684500,0.275000,10.807882,0.785474,8.064289,4.161925,9.116327,3.583254,3.661813,5.337198,9.031622,2.924604,home,13.00,5.50,1.22
24,2005-08-24,25,Blackburn,0506,0.263158,0.6,0.263158,0.2,1496.457214,1535.410612,1.248951,1.308457,1.489796,7.020408,12.530612,20.122449,1.979592,1.510204,0.000000,1.000000,0.000000,0.489796,10.448980,13.551020,4.489796,7.040816,1.020408,1.510204,1.803500,2.056500,2.014500,1.693500,-0.275000,Tottenham,0.263158,0.6,0.263158,0.2,1535.410612,1496.457214,1.274824,1.393005,6.489796,3.530612,11.469388,19.428571,0.000000,2.000000,0.000000,0.489796,0.000000,0.000000,11.510204,14.571429,4.959184,7.020408,0.510204,2.510204,1.914000,1.935000,1.976500,1.733500,-0.387500,1.583126,5.553120,3.477861,6.881561,4.010007,5.537488,2.297469,5.973420,2.916354,6.001831,draw,2.60,3.20,2.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4949,2018-08-25,4950,Bournemouth,1819,0.447368,0.6,0.157895,0.4,1549.801937,1567.656994,1.261521,1.720533,5.380406,5.765467,11.482014,9.224256,1.591704,1.359998,0.704842,0.500421,0.023102,0.024295,14.022928,12.652535,4.589902,4.339203,2.099701,1.347368,1.876216,1.994746,1.815644,2.049951,0.188785,Everton,0.447368,0.6,0.157895,0.4,1567.656994,1549.801937,1.330970,1.232383,5.442022,4.320277,10.632374,11.395848,1.401492,1.318978,0.591241,0.501612,0.128647,0.097965,13.651956,10.228199,4.534917,3.846102,1.357584,1.231665,1.993966,1.884081,2.123252,1.754956,0.106011,1.592553,4.602562,1.562375,4.967551,1.928114,3.080280,1.689039,6.400569,1.677174,4.441914,draw,2.75,3.60,2.62
4950,2018-08-26,4951,Watford,1819,0.421053,0.4,0.289474,0.6,1520.411452,1546.461760,1.018660,1.607977,5.066188,4.833421,10.483965,11.992207,1.581523,1.106250,0.735976,0.447844,0.071242,0.079449,10.826491,11.602295,4.021974,3.862998,1.662156,1.751864,1.938889,1.939490,2.053532,1.915178,0.436705,Crystal Palace,0.421053,0.4,0.289474,0.6,1546.461760,1520.411452,1.108738,1.462203,5.611883,5.300684,11.293843,11.171714,1.349860,1.350959,0.681212,0.518444,0.072246,0.041085,12.692089,11.836398,4.556892,4.137001,1.861655,1.849007,2.010493,1.870704,1.848792,2.020223,0.281209,1.700947,2.651476,2.323532,1.081644,2.363116,2.989988,0.933416,1.644591,2.010164,2.191385,home,3.00,3.25,2.60
4951,2018-08-26,4952,Newcastle,1819,0.394737,0.6,0.605263,0.8,1513.740462,1663.134224,1.263713,1.457554,5.663536,4.187155,10.328695,10.939110,1.239496,1.054580,0.586392,0.443609,0.005387,0.091234,12.452771,11.747707,4.003907,3.858941,1.621902,1.362317,1.867457,2.006749,2.024455,1.911703,0.379764,Chelsea,0.394737,0.6,0.605263,0.8,1663.134224,1513.740462,1.848987,0.860756,3.755354,5.849745,11.132337,9.514379,1.074810,1.771308,0.414233,0.893722,0.042799,0.054557,9.727137,16.066182,3.341093,5.717481,1.928424,1.130717,1.879666,1.984855,1.805033,2.053100,-0.926899,1.833687,10.622580,2.323532,12.338755,2.186764,12.897203,2.666904,11.734376,2.132119,11.813535,away,1.66,4.00,5.75
4952,2018-08-26,4953,Fulham,1819,0.289474,0.2,0.210526,0.4,1405.922861,1523.094877,1.137916,1.434621,5.975406,5.069325,10.171646,9.963712,2.186921,1.087311,0.628756,0.350691,0.030692,0.043723,16.577464,11.759445,7.149427,4.548139,1.314872,1.393248,1.884552,1.985978,1.756776,2.128261,0.502253,Burnley,0.289474,0.2,0.210526,0.4,1523.094877,1405.922861,0.938616,1.525725,6.009157,4.388209,11.883636,9.503644,1.244818,0.939679,0.422908,0.382256,0.088614,0.012340,14.509454,10.273809,4.536976,3.510299,1.211391,1.578136,1.965764,1.906055,2.282184,1.675649,0.234690,0.466927,1.784327,3.405176,1.862832,0.540813,1.975141,1.066761,2.622455,1.423612,1.915461,home,4.33,3.40,2.00


To start our modelling process, we need to make a training set, a test set and a holdout set. <br>
<br>
Because we are using cross-validation, the training set will consist of all seasons before 2017/18, and the 2017/18 season will serve as the test set.

In [None]:
# Model inputs are the engineered columns, identified by their "f_" prefix.
feature_list = [col for col in features.columns if col.startswith("f_")]

# Instantiate a label encoder for the target. LabelEncoder orders classes
# lexicographically, so 'away', 'draw', 'home' are encoded as 0, 1, 2.
le = LabelEncoder()

# Grab all seasons except for 2017/18 to use cross validation with



