In [1]:
import sys
sys.path.append('/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages')

## Match predictions:

The goal of this notebook is to create a model to predict match results, i.e home win / home loss / draw.

We base our model on the intersection of a certain domain knowledge (the kind of information that could influence the result of a game of football) and the data in our possession. Better results could definetily be achevied if we were in possession of more detailed data on previous match such as a breakdown of performances per position or average possession statistics, etc.

This analysis is inspired by the work of Gunjan Kumar in his thesis: "Machine Learning for Soccer Analytics".

In [3]:
% matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# Import all ML modules and packages we'll need
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from __future__ import division

## Data preparation

In [4]:
import json
# Load in data
filenames = ['../Data/BPL/BPL12-13.json']
with open(filenames[0], 'r') as fp:
    data = json.load(fp)

We need to create a dictionary that contains the following data:

- Final score: response variable
- Home team metric average on past 5 games
- Away team metric average on past 5 games
- Sum of the differences of the best player metrics score for each team in the past 5 game
- Average goals against the home team on the past 5 games
- Average goals against the away team on the past 5 games
- Number of losses for the home team in the past 2 games
- Number of losses for the away team in the past 2 games 


We initialise the dictionary with key equal to the match ID and first values: home team, away team, and day.

In [5]:
teams = ['arsenal-fc','aston-villa','chelsea-fc','everton-fc','fulham-fc','liverpool-fc','manchester-city','manchester-united','newcastle-united','norwich-city','queens-park-rangers','reading-fc','southampton-fc','stoke-city','sunderland-afc','swansea-city','tottenham-hotspur','west-bromwich-albion','west-ham-united','wigan-athletic']
games = {}
for t1 in teams:
    for t2 in teams:
        if t1!=t2:
            games[t1+"-"+t2] = {'home': t1, 'away': t2}

In [6]:
for k in games.keys():
    games[k]['day']= data[k]['day']

In order to fill in this dictionary, we need a way to access the data for every player in the team at a particular day. We therefore create a team dictionary. Note that the "if p[2]>=0 else ' '" statement is to remove own goal scorers.

In [7]:
team_players = dict.fromkeys(teams)
for k in games.keys():
    team1 = games[k]['home']
    team2 = games[k]['away']
    
    if team_players[team1] is None:
        team_players[team1] = [p[0] if p[2]>=0 else ' ' for p in data[k]['home']]
    else:
        team_players[team1].extend([p[0] if p[2]>=0 else ' ' for p in data[k]['home']])
    team_players[team1].extend([p[1] if p[2]>=0 else ' ' for p in data[k]['home']])
    
    if team_players[team2] is None:
        team_players[team2] = [p[0] if p[2]>=0 else ' ' for p in data[k]['away']]
    else:
        team_players[team2].extend([p[0] if p[2]>=0 else ' ' for p in data[k]['away']])
    team_players[team2].extend([p[1] if p[2]>=0 else ' ' for p in data[k]['away']])

In [8]:
for k in team_players.keys():
    team_players[k] = list(set(team_players[k]))
    team_players[k].remove(' ')

In [9]:
player_to_club = {}
for k in team_players.keys():
    for v in team_players[k]:
        player_to_club[v] = k

We test it:

In [10]:
team_players['arsenal-fc']

[u'Mikel Arteta',
 u'Aaron Ramsey',
 u'Lukas Podolski',
 u'Jack Wilshere',
 u'Theo Walcott',
 u'Per Mertesacker',
 u'Gervinho',
 u'Kieran Gibbs',
 u'Nacho Monreal',
 u'Alex Oxlade-Chamberlain',
 u'Tom\xc3\xa1\xc5\xa1 Rosick\xc3\xbd',
 u'Laurent Koscielny',
 u'Olivier Giroud',
 u'Santi Cazorla']

In [11]:
import json
with open('../Data/BPL_teamplayers.json', 'w') as fp:
    json.dump(team_players, fp)

We load the feature data:

In [12]:
features12 = pd.read_pickle('../Data/features12-13.pkl')

We first fill in the metric averages:

In [13]:
for k in games.keys():
    d = games[k]['day']
    if (d-1) == 0:
        continue
    home_average = 0
    for p in team_players[games[k]['home']]:
        home_average += features12[p]['match_value_list'][d-2]
    games[k]['home_average'] = home_average/len(team_players[games[k]['home']])
    away_average = 0
    for p in team_players[games[k]['away']]:
        away_average += features12[p]['match_value_list'][d-2]
    games[k]['away_average'] = away_average/len(team_players[games[k]['away']])

In [14]:
games[games.keys()[0]]

{'away': 'manchester-city',
 'away_average': 0.609375,
 'day': 19,
 'home': 'sunderland-afc',
 'home_average': 0.4723214285714285}

We fill in the best player differences:

In [16]:
for k in games.keys():
    d = games[k]['day']
    home_team = games[k]['home']
    away_team = games[k]['away']
    best_home = []
    best_away = []
    if (d-1)==0:
        continue
    elif (d-1) == 1:
        for p in team_players[home_team]:
            best_home.append(features12[p]['match_value_list'][0])
        for p in team_players[away_team]:
            best_away.append(features12[p]['match_value_list'][0])
            
    elif (d-1) == 2:
        for p in team_players[home_team]:
            best_home.append(np.max(features12[p]['match_value_list'][:2]))
        for p in team_players[away_team]:
            best_away.append(np.max(features12[p]['match_value_list'][:2]))
            
    else:
        for p in team_players[home_team]:
            best_home.append(np.max(features12[p]['match_value_list'][d-4:d-1]))
        for p in team_players[away_team]:
            best_away.append(np.max(features12[p]['match_value_list'][d-4:d-1]))
     
    games[k]['best'] = np.sum(best_home)-np.sum(best_away)

In [17]:
games[games.keys()[0]]

{'away': 'manchester-city',
 'away_average': 0.609375,
 'best': -13.650000000000002,
 'day': 19,
 'home': 'sunderland-afc',
 'home_average': 0.4723214285714285}

We add the score:

In [18]:
results = dict.fromkeys(teams)
goals_against = dict.fromkeys(teams)

for k in results.keys():
    results[k] = np.zeros(38)
    goals_against[k] = np.zeros(38)
    
for k in games.keys():
    team1 = games[k]['home']
    team2 = games[k]['away']
    d = games[k]['day']-1
    goal_home = len(data[k]['home'])
    goal_away = len(data[k]['away'])
    
    goals_against[team1][d] = -goal_away
    goals_against[team2][d] = -goal_home
    
    if goal_home > goal_away:
        results[team1][d] = 1
        results[team2][d] = -1
    elif goal_home<goal_away:
        results[team1][d] = -1
        results[team2][d] = 1
    else:
        results[team1][d] = 0
        results[team2][d] = 0

We finally add home losses,wins, goals against, etc...

In [19]:
for k in games.keys():
    
    team1 = games[k]['home']
    team2 = games[k]['away']
    
    d = games[k]['day']-1
    
    if d==0:
        continue
    elif d<=4:
        home_goal_vs = np.sum(goals_against[team1][:d])/d
        away_goal_vs = np.sum(goals_against[team2][:d])/d
        if d<=2:
            home_loss = len(results[team1][:d][results[team1][:d]<0])
            home_win = len(results[team1][:d][results[team1][:d]>0])
            away_loss = len(results[team2][:d][results[team2][:d]<0])
            away_win = len(results[team2][:d][results[team2][:d]>0])
        else:
            home_loss = len(results[team1][d-2:d][results[team1][d-2:d]<0])
            home_win = len(results[team1][d-2:d][results[team1][d-2:d]>0])
            away_loss = len(results[team2][d-2:d][results[team2][d-2:d]<0])
            away_win = len(results[team2][d-2:d][results[team2][d-2:d]>0])
    else:
        home_goal_vs = np.sum(goals_against[team1][d-5:d])/5
        away_goal_vs = np.sum(goals_against[team2][d-5:d])/5
        home_loss = len(results[team1][d-2:d][results[team1][d-2:d]<0])
        home_win = len(results[team1][d-2:d][results[team1][d-2:d]>0])
        away_loss = len(results[team2][d-2:d][results[team2][d-2:d]<0])
        away_win = len(results[team2][d-2:d][results[team2][d-2:d]>0])
        
    games[k]['goals_against_home'] = home_goal_vs
    games[k]['goals_against_away'] = away_goal_vs
    games[k]['home_loss'] = home_loss
    games[k]['away_loss'] = away_loss
    games[k]['home_win'] = home_win
    games[k]['away_win'] = away_win
    games[k]['score'] = results[team1][d]

We therefore have:

In [23]:
games[games.keys()[0]]

{'away': 'manchester-city',
 'away_average': 0.609375,
 'away_loss': 0,
 'away_win': 2,
 'best': -13.650000000000002,
 'day': 19,
 'goals_against_away': -1.0,
 'goals_against_home': -1.6000000000000001,
 'home': 'sunderland-afc',
 'home_average': 0.4723214285714285,
 'home_loss': 1,
 'home_win': 1,
 'score': 1.0}

We now convert it to a dataframe to perform predictions using to start multinomial logistic regression:

In [20]:
DF = pd.DataFrame.from_dict(games)

In [21]:
DF.head()

Unnamed: 0,arsenal-fc-aston-villa,arsenal-fc-chelsea-fc,arsenal-fc-everton-fc,arsenal-fc-fulham-fc,arsenal-fc-liverpool-fc,arsenal-fc-manchester-city,arsenal-fc-manchester-united,arsenal-fc-newcastle-united,arsenal-fc-norwich-city,arsenal-fc-queens-park-rangers,arsenal-fc-reading-fc,arsenal-fc-southampton-fc,arsenal-fc-stoke-city,arsenal-fc-sunderland-afc,arsenal-fc-swansea-city,arsenal-fc-tottenham-hotspur,arsenal-fc-west-bromwich-albion,arsenal-fc-west-ham-united,arsenal-fc-wigan-athletic,aston-villa-arsenal-fc,aston-villa-chelsea-fc,aston-villa-everton-fc,aston-villa-fulham-fc,aston-villa-liverpool-fc,aston-villa-manchester-city,aston-villa-manchester-united,aston-villa-newcastle-united,aston-villa-norwich-city,aston-villa-queens-park-rangers,aston-villa-reading-fc,aston-villa-southampton-fc,aston-villa-stoke-city,aston-villa-sunderland-afc,aston-villa-swansea-city,aston-villa-tottenham-hotspur,aston-villa-west-bromwich-albion,aston-villa-west-ham-united,aston-villa-wigan-athletic,chelsea-fc-arsenal-fc,chelsea-fc-aston-villa,chelsea-fc-everton-fc,chelsea-fc-fulham-fc,chelsea-fc-liverpool-fc,chelsea-fc-manchester-city,chelsea-fc-manchester-united,chelsea-fc-newcastle-united,chelsea-fc-norwich-city,chelsea-fc-queens-park-rangers,chelsea-fc-reading-fc,chelsea-fc-southampton-fc,...,west-bromwich-albion-manchester-united,west-bromwich-albion-newcastle-united,west-bromwich-albion-norwich-city,west-bromwich-albion-queens-park-rangers,west-bromwich-albion-reading-fc,west-bromwich-albion-southampton-fc,west-bromwich-albion-stoke-city,west-bromwich-albion-sunderland-afc,west-bromwich-albion-swansea-city,west-bromwich-albion-tottenham-hotspur,west-bromwich-albion-west-ham-united,west-bromwich-albion-wigan-athletic,west-ham-united-arsenal-fc,west-ham-united-aston-villa,west-ham-united-chelsea-fc,west-ham-united-everton-fc,west-ham-united-fulham-fc,west-ham-united-liverpool-fc,west-ham-united-manchester-city,west-ham-united-manchester-united,west-ham-united-newcastle-united,west-ham-united-norwich-city,west-ham-united-queens-park-rangers,west-ham-united-reading-fc,west-ham-united-southampton-fc,west-ham-united-stoke-city,west-ham-united-sunderland-afc,west-ham-united-swansea-city,west-ham-united-tottenham-hotspur,west-ham-united-west-bromwich-albion,west-ham-united-wigan-athletic,wigan-athletic-arsenal-fc,wigan-athletic-aston-villa,wigan-athletic-chelsea-fc,wigan-athletic-everton-fc,wigan-athletic-fulham-fc,wigan-athletic-liverpool-fc,wigan-athletic-manchester-city,wigan-athletic-manchester-united,wigan-athletic-newcastle-united,wigan-athletic-norwich-city,wigan-athletic-queens-park-rangers,wigan-athletic-reading-fc,wigan-athletic-southampton-fc,wigan-athletic-stoke-city,wigan-athletic-sunderland-afc,wigan-athletic-swansea-city,wigan-athletic-tottenham-hotspur,wigan-athletic-west-bromwich-albion,wigan-athletic-west-ham-united
away,aston-villa,chelsea-fc,everton-fc,fulham-fc,liverpool-fc,manchester-city,manchester-united,newcastle-united,norwich-city,queens-park-rangers,reading-fc,southampton-fc,stoke-city,sunderland-afc,swansea-city,tottenham-hotspur,west-bromwich-albion,west-ham-united,wigan-athletic,arsenal-fc,chelsea-fc,everton-fc,fulham-fc,liverpool-fc,manchester-city,manchester-united,newcastle-united,norwich-city,queens-park-rangers,reading-fc,southampton-fc,stoke-city,sunderland-afc,swansea-city,tottenham-hotspur,west-bromwich-albion,west-ham-united,wigan-athletic,arsenal-fc,aston-villa,everton-fc,fulham-fc,liverpool-fc,manchester-city,manchester-united,newcastle-united,norwich-city,queens-park-rangers,reading-fc,southampton-fc,...,manchester-united,newcastle-united,norwich-city,queens-park-rangers,reading-fc,southampton-fc,stoke-city,sunderland-afc,swansea-city,tottenham-hotspur,west-ham-united,wigan-athletic,arsenal-fc,aston-villa,chelsea-fc,everton-fc,fulham-fc,liverpool-fc,manchester-city,manchester-united,newcastle-united,norwich-city,queens-park-rangers,reading-fc,southampton-fc,stoke-city,sunderland-afc,swansea-city,tottenham-hotspur,west-bromwich-albion,wigan-athletic,arsenal-fc,aston-villa,chelsea-fc,everton-fc,fulham-fc,liverpool-fc,manchester-city,manchester-united,newcastle-united,norwich-city,queens-park-rangers,reading-fc,southampton-fc,stoke-city,sunderland-afc,swansea-city,tottenham-hotspur,west-bromwich-albion,west-ham-united
away_average,0.25,0.75,0.578125,0.430556,0.611111,0.359375,0.420455,0.603125,0.647059,0.490789,0,1.04545,0.266667,,0.660714,0.314881,0,0.302632,1.29297,1.10714,0,0.359375,0,0.0638889,0.59375,0.215909,0.15,0.176471,0.5,0.330625,0.522727,0.7075,0.410714,0.75,0.214286,0.455882,0.302632,0.071875,0,0.610937,0.25,0,0.291667,0.53125,0.659091,0.3575,0.279412,0,0,0.522727,...,0.579545,0,0.676471,0.0921053,0.2875,0.0909091,1.3,0,0.5,0.2875,0.598026,0.65625,0.517857,,0,0,0.43125,0.319444,0.359375,0.590909,0,0.264706,0,0,1.04545,0.383333,0.410714,0,0.285714,0,0.375,1.04732,0.359375,,0.640625,0.430556,0.680556,0,0.170455,0.725,0.388971,0.25,0.4125,0.313636,0,0.410714,1.04643,0.797619,0.441176,0.5
away_loss,0,0,1,0,1,0,0,1,1,1,2,2,1,,0,2,2,1,0,0,0,0,1,1,1,0,1,1,0,1,0,0,0,0,0,1,1,2,1,0,0,2,0,0,0,0,2,2,1,1,...,1,1,0,2,2,2,0,2,1,0,1,1,1,,0,0,1,1,0,0,1,2,0,1,1,1,0,0,0,0,1,0,1,,0,1,1,0,0,1,0,0,0,1,0,1,1,0,1,1
away_win,1,1,1,0,1,2,2,1,0,0,0,0,0,,1,0,0,0,1,1,2,1,1,1,1,2,0,1,2,1,0,2,2,1,1,1,1,0,0,1,1,0,0,2,2,1,0,0,0,1,...,1,1,2,0,0,0,2,0,1,0,1,0,0,,0,1,1,1,2,2,0,0,1,1,0,1,0,1,2,1,0,2,1,,2,1,1,1,2,1,0,0,1,0,0,1,0,1,1,1
best,0.625,-18.2875,1.2,-30.875,-16.375,18.4875,19.0125,-1.35,5.7125,10.6875,10.625,-11.4875,-2.15,,7.8125,6.9375,-9.175,4.3125,-9.9625,-12.1375,6.2625,-5.75,22.4,-0.3875,-3.9625,-15.6125,8.2125,-3.6125,-10.2,-4.8625,-4.025,-26.25,1.6875,-13.1125,-9.0875,0.475,3.3125,1.7375,-7.675,2.85,18.2625,-17.625,16.675,-1.1,11.3,3.6125,18.925,34.6375,9.35,7.325,...,5.075,-4.4375,-24.1375,-3.775,4.375,11.5375,-1.4125,12.3375,1.575,8.7375,-27.6,-3.0375,-2.825,,-0.275,10.4125,-11.7625,18.2625,-10.475,-2.2,7.0375,1.25,-4.6125,-8.6125,-2.9875,2.875,-10.8125,-0.7125,-2,7.25,-9.95,-3.125,-6.3875,,-31.0625,-6.15,-1.35,-2.25,-11.725,-12.6125,1.3875,12.55,-6.5,5.025,4.3125,0.45,-1.2,-21.25,-7.075,-8.6125


## Analysis:

We first try to run a Logistic Regression model to classify home wins, draws and home losses:

In [24]:
from sklearn.linear_model import LogisticRegression

We will train on the first part of the season (except the first game as we don't have any explanotary variables for the first game)

In [25]:
firsthalf = DF.loc['day']<=19
secondhalf = ~firsthalf

In [26]:
DF_firsthalf = DF[DF.columns[firsthalf]]
DF_secondhalf = DF[DF.columns[secondhalf]]

In [35]:
# We remove the first day:
train = DF_firsthalf[DF_firsthalf.columns[DF_firsthalf.loc['day']>1]]

In [158]:
y = train.loc['score'].copy()

In [159]:
X = train.iloc[[1,2,3,4,6,7,9,10,11],:].T
test = DF_secondhalf.iloc[[1,2,3,4,6,7,9,10,11],:].T
y_test = DF_secondhalf.loc['score'].copy()

In [160]:
X.head()

Unnamed: 0,away_average,away_loss,away_win,best,goals_against_away,goals_against_home,home_average,home_loss,home_win
arsenal-fc-chelsea-fc,0.75,0,1,-18.2875,-0.4,-0.4,0.246429,0,1
arsenal-fc-fulham-fc,0.430556,0,0,-30.875,-1.8,-1.2,0.0821429,1,1
arsenal-fc-queens-park-rangers,0.490789,1,0,10.6875,-1.6,-1.2,0.0,1,1
arsenal-fc-southampton-fc,1.04545,2,0,-11.4875,-2.66667,0.0,0.616071,0,1
arsenal-fc-swansea-city,0.660714,0,1,7.8125,-0.8,-1.6,0.472321,0,0


We fit the model:

In [161]:
logreg = LogisticRegression(C=0.1)

In [162]:
logreg.fit(X,y.astype(int))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

We predict the second half:

In [163]:
y_test_pred = logreg.predict(test)

We derive the confusion metric to check how good we actually do:

In [164]:
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test.values.astype(int),y_test_pred.astype(int))

In [165]:
score = float(mat[0,0]+mat[1,1]+mat[2,2])/np.sum(mat)

In [166]:
print score

0.410526315789


Not bad when we know that picking randomly would lead to a 0.333 rate, but it is not great either.... Let's see if with a greater training set we can get better results !

In [170]:
tr = DF.loc['day']<=27
ts = ~tr

In [171]:
DF_tr = DF[DF.columns[tr]]
DF_ts = DF[DF.columns[ts]]
train2 = DF_tr[DF_tr.columns[DF_tr.loc['day']>1]]
y2 = train2.loc['score']

X2 = train2.iloc[[1,2,3,4,6,7,9,10,11],:].T
test2 = DF_ts.iloc[[1,2,3,4,6,7,9,10,11],:].T
y_test2 = DF_ts.loc['score']

In [184]:
logreg2 = LogisticRegression(C=0.1)
logreg.fit(X2,y2.astype(int))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [185]:
y_test_pred2 = logreg.predict(test2)

In [186]:
mat2 = confusion_matrix(y_test2.values.astype(int),y_test_pred2.astype(int))

In [187]:
mat2

array([[ 1,  7, 27],
       [ 6,  5, 17],
       [ 7,  5, 35]])

In [188]:
score2 = float(mat2[0,0]+mat2[1,1]+mat2[2,2])/np.sum(mat2)

In [189]:
print score2

0.372727272727


nop....

### Let's try random forest:

In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [95]:
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X,y.astype(int))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [96]:
RFC_pred = clf.predict(test).astype(int)

In [97]:
RFC_mat = confusion_matrix(y_test.astype(int),RFC_pred)

In [98]:
RFC_mat

array([[13, 21, 24],
       [14, 18, 18],
       [16, 19, 47]])

In [99]:
RFC_score = float(RFC_mat[0,0]+RFC_mat[1,1]+RFC_mat[2,2])/np.sum(RFC_mat)

In [100]:
print RFC_score

0.410526315789


Slightly better.... Let's try on the bigger training set:

In [279]:
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X2,y2)
RFC_pred2 = clf.predict(test2).astype(int)
RFC_mat2 = confusion_matrix(y_test2.astype(int),RFC_pred2)

print RFC_mat2

RFC_score2 = float(RFC_mat2[0,0]+RFC_mat2[1,1]+RFC_mat2[2,2])/np.sum(RFC_mat2)

print RFC_score2

[[15  8 12]
 [ 7  5 16]
 [10 10 27]]
0.427272727273


Let's now to see if we can at least better predict if the home teams doesn't win:

In [101]:
y[y<1]= 0 

In [102]:
y_test[y_test<1] = 0

In [147]:
logreg = LogisticRegression(C=0.1)
logreg.fit(X,y.astype(int))
y_test_pred = logreg.predict(test).astype(int)

In [148]:
mat4 = confusion_matrix(y_test.values.astype(int),y_test_pred)

In [149]:
mat4

array([[76, 32],
       [49, 33]])

In [150]:
score4 = float(mat4[0,0]+mat4[1,1])/np.sum(mat4)

In [151]:
score4

0.5736842105263158

We manage to predict slightly better than by chance... still not good enough...

Lets's try to see if we are better at predicting when the home teams does not not loose:

In [196]:
y = train.loc['score'].copy()
y_test = DF_secondhalf.loc['score'].copy()
y[y>-1]= 1
y_test[y_test>-1]= 1

In [275]:
logreg = LogisticRegression(C=0.1)
logreg.fit(X,y.astype(int))
y_test_pred = logreg.predict(test).astype(int)
mat5 = confusion_matrix(y_test.values.astype(int),y_test_pred)
score5 = float(mat5[0,0]+mat5[1,1])/np.sum(mat5)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X,y.astype(int))
RFC_pred = clf.predict(test).astype(int)
RFC_mat = confusion_matrix(y_test.astype(int),RFC_pred)
score6 = float(RFC_mat[0,0]+RFC_mat[1,1])/np.sum(RFC_mat)

In [276]:
print "Logistic Regression: %s" % score5

print "Random Forrest: %s" %score6

Logistic Regression: 0.694736842105
Random Forrest: 0.678947368421


Seems to be working a bit more, let's compare to the "Never Loosing" baseline:

In [213]:
neverloosing_mat = confusion_matrix(y_test.values.astype(int),[1]*190)
float(neverloosing_mat[0,0]+neverloosing_mat[1,1])/np.sum(neverloosing_mat)

0.6947368421052632

In [214]:
onlyloss_mat = confusion_matrix(y_test.values.astype(int),[-1]*190)
float(onlyloss_mat[0,0]+onlyloss_mat[1,1])/np.sum(onlyloss_mat)

0.30526315789473685

We equated one of the baselines, which is not a satisfactory result.

#### Let's try to improve results by looking in more depths at the models:

Let's focus on one model in particular, the random forest 

In [278]:
DF_firsthalf = DF[DF.columns[firsthalf]]
DF_secondhalf = DF[DF.columns[secondhalf]]
train = DF_firsthalf[DF_firsthalf.columns[DF_firsthalf.loc['day']>1]]
y = train.loc['score'].copy()
X = train.iloc[[1,2,3,4,6,7,9,10,11],:].T
test = DF_secondhalf.iloc[[1,2,3,4,6,7,9,10,11],:].T
y_test = DF_secondhalf.loc['score'].copy()

In [280]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X,y.astype(int))

RFC_pred = clf.predict(test).astype(int)
RFC_mat = confusion_matrix(y_test.astype(int),RFC_pred)

RFC_score = float(RFC_mat[0,0]+RFC_mat[1,1]+RFC_mat[2,2])/np.sum(RFC_mat)

print "Accuracy: %s" % RFC_score

Accuracy: 0.431578947368


Let's inspect feature importance to see if one or multiple variables might impact too much the output of the model:

In [281]:
zip(DF.index[[1,2,3,4,5,7,8,10,11,12]],clf.feature_importances_)

[('away_average', 0.16778765124524156),
 ('away_loss', 0.049654380727071956),
 ('away_win', 0.050517361323656862),
 ('best', 0.19601800474103415),
 ('day', 0.15084289134556492),
 ('goals_against_home', 0.14542118288312128),
 ('home', 0.15449469452813949),
 ('home_loss', 0.047315760198110297),
 ('home_win', 0.037948073008059462)]

It's not obvious that there is a clear bias in favor of one of the variables.

### Let's try to add a team factor:

In [282]:
team_df = dict.fromkeys(games.keys())

In [283]:
for k in team_df.keys():
    team_df[k] = {}
    for team in teams:
        team_df[k][team] = 0
    team_df[k][games[k]['home']] = 1
    team_df[k][games[k]['away']] = 1

In [284]:
team_df = pd.DataFrame.from_dict(team_df)

In [285]:
team_df.head()

Unnamed: 0,arsenal-fc-aston-villa,arsenal-fc-chelsea-fc,arsenal-fc-everton-fc,arsenal-fc-fulham-fc,arsenal-fc-liverpool-fc,arsenal-fc-manchester-city,arsenal-fc-manchester-united,arsenal-fc-newcastle-united,arsenal-fc-norwich-city,arsenal-fc-queens-park-rangers,arsenal-fc-reading-fc,arsenal-fc-southampton-fc,arsenal-fc-stoke-city,arsenal-fc-sunderland-afc,arsenal-fc-swansea-city,arsenal-fc-tottenham-hotspur,arsenal-fc-west-bromwich-albion,arsenal-fc-west-ham-united,arsenal-fc-wigan-athletic,aston-villa-arsenal-fc,aston-villa-chelsea-fc,aston-villa-everton-fc,aston-villa-fulham-fc,aston-villa-liverpool-fc,aston-villa-manchester-city,aston-villa-manchester-united,aston-villa-newcastle-united,aston-villa-norwich-city,aston-villa-queens-park-rangers,aston-villa-reading-fc,aston-villa-southampton-fc,aston-villa-stoke-city,aston-villa-sunderland-afc,aston-villa-swansea-city,aston-villa-tottenham-hotspur,aston-villa-west-bromwich-albion,aston-villa-west-ham-united,aston-villa-wigan-athletic,chelsea-fc-arsenal-fc,chelsea-fc-aston-villa,chelsea-fc-everton-fc,chelsea-fc-fulham-fc,chelsea-fc-liverpool-fc,chelsea-fc-manchester-city,chelsea-fc-manchester-united,chelsea-fc-newcastle-united,chelsea-fc-norwich-city,chelsea-fc-queens-park-rangers,chelsea-fc-reading-fc,chelsea-fc-southampton-fc,...,west-bromwich-albion-manchester-united,west-bromwich-albion-newcastle-united,west-bromwich-albion-norwich-city,west-bromwich-albion-queens-park-rangers,west-bromwich-albion-reading-fc,west-bromwich-albion-southampton-fc,west-bromwich-albion-stoke-city,west-bromwich-albion-sunderland-afc,west-bromwich-albion-swansea-city,west-bromwich-albion-tottenham-hotspur,west-bromwich-albion-west-ham-united,west-bromwich-albion-wigan-athletic,west-ham-united-arsenal-fc,west-ham-united-aston-villa,west-ham-united-chelsea-fc,west-ham-united-everton-fc,west-ham-united-fulham-fc,west-ham-united-liverpool-fc,west-ham-united-manchester-city,west-ham-united-manchester-united,west-ham-united-newcastle-united,west-ham-united-norwich-city,west-ham-united-queens-park-rangers,west-ham-united-reading-fc,west-ham-united-southampton-fc,west-ham-united-stoke-city,west-ham-united-sunderland-afc,west-ham-united-swansea-city,west-ham-united-tottenham-hotspur,west-ham-united-west-bromwich-albion,west-ham-united-wigan-athletic,wigan-athletic-arsenal-fc,wigan-athletic-aston-villa,wigan-athletic-chelsea-fc,wigan-athletic-everton-fc,wigan-athletic-fulham-fc,wigan-athletic-liverpool-fc,wigan-athletic-manchester-city,wigan-athletic-manchester-united,wigan-athletic-newcastle-united,wigan-athletic-norwich-city,wigan-athletic-queens-park-rangers,wigan-athletic-reading-fc,wigan-athletic-southampton-fc,wigan-athletic-stoke-city,wigan-athletic-sunderland-afc,wigan-athletic-swansea-city,wigan-athletic-tottenham-hotspur,wigan-athletic-west-bromwich-albion,wigan-athletic-west-ham-united
arsenal-fc,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
aston-villa,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
chelsea-fc,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
everton-fc,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
fulham-fc,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [286]:
big_df = pd.concat([DF,team_df])

In [287]:
big_df_firsthalf = big_df[big_df.columns[firsthalf]]
big_df_secondhalf = big_df[big_df.columns[secondhalf]]
train = big_df_firsthalf[big_df_firsthalf.columns[big_df_firsthalf.loc['day']>1]]
y = train.loc['score'].copy()
X = train.iloc[[1,2,3,4,6,7,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32],:].T
test = big_df_secondhalf.iloc[[1,2,3,4,6,7,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32],:].T
y_test = big_df_secondhalf.loc['score'].copy()

In [294]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X,y.astype(int))

RFC_pred = clf.predict(test).astype(int)
RFC_mat = confusion_matrix(y_test.astype(int),RFC_pred)

print RFC_mat

RFC_score = float(RFC_mat[0,0]+RFC_mat[1,1]+RFC_mat[2,2])/np.sum(RFC_mat)
print RFC_score

[[12  9 37]
 [ 7 14 29]
 [12 16 54]]
0.421052631579


It did not seem to have much impact. Let's try on the "Not Loosing" problem:

In [295]:
y = train.loc['score'].copy()
y_test = big_df_secondhalf.loc['score'].copy()
y[y>-1]= 1
y[y==-1] = 0
y_test[y_test>-1]= 1
y_test[y_test==-1] = 0

In [303]:
clf = RandomForestClassifier(n_estimators=100,class_weight='auto')
clf.fit(X,y.astype(int))

RFC_pred = clf.predict(test).astype(int)
RFC_mat = confusion_matrix(y_test.astype(int),RFC_pred)

print RFC_mat

RFC_score = float(RFC_mat[0,0]+RFC_mat[1,1])/np.sum(RFC_mat)
print RFC_score

[[  3  55]
 [  2 130]]
0.7


Slightly better...

## Using a more equally distributed training data set:

Analysing the confusion matrices, we noticed that the models we created tend to classify most of the matches as a win. This is due to the fact that the training set is inherently biased in favor of home wins (cf. the baseline 'every game is a win' gives a 69% accuracy'). Let's try to rebalance the training set by sampling equal number of losses, draws and wins.

Let's start by evaluating the size of each class in the training data:

In [38]:
firsthalf = DF.loc['day']<=19
secondhalf = ~firsthalf

big_df = pd.concat([DF,team_df])

big_df_firsthalf = big_df[big_df.columns[firsthalf]]
big_df_secondhalf = big_df[big_df.columns[secondhalf]]
train = big_df_firsthalf[big_df_firsthalf.columns[big_df_firsthalf.loc['day']>1]]
y = train.loc['score'].copy()
X = train.iloc[[1,2,3,4,6,7,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32],:].copy().T
test = big_df_secondhalf.iloc[[1,2,3,4,6,7,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32],:].T
y_test = big_df_secondhalf.loc['score'].copy()

In [39]:
y[y>-1]= 1
y[y==-1] = 0
y_test[y_test>-1]= 1
y_test[y_test==-1] = 0

Number of games where the home team is not loosing:

In [40]:
len(y[y==1])

134

Number of home losses:

In [41]:
len(y[y==0])

46

Let's sample 46 wins/draws:

In [53]:
train_noloss = train.loc[:,y==1].copy()
train_loss = train.loc[:,y==0].copy()

In [54]:
touse = np.random.choice(range(len(y[y==1])),len(y[y==0]))

In [59]:
new_train = train_noloss.loc[:,train_noloss.columns[touse]].join(train_loss)

In [80]:
newX = new_train.iloc[[1,2,3,4,6,7,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32],:].copy().T

In [74]:
newy = list(y[y==1][touse].values)
newy.extend(list(y[y==0].values))
len(newy)

92

In [81]:
clf = RandomForestClassifier(n_estimators=20)
clf.fit(newX,newy)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [84]:
RFC_pred = clf.predict(test).astype(int)
RFC_mat = confusion_matrix(y_test.astype(int),RFC_pred)

In [85]:
RFC_score = float(RFC_mat[0,0]+RFC_mat[1,1])/np.sum(RFC_mat)
print RFC_score

0.415789473684


In [86]:
RFC_mat

array([[37, 21],
       [90, 42]])

FAIL...

Lets add more data to the training set:

In [87]:
firsthalf = DF.loc['day']<=29
secondhalf = ~firsthalf

big_df = pd.concat([DF,team_df])

big_df_firsthalf = big_df[big_df.columns[firsthalf]]
big_df_secondhalf = big_df[big_df.columns[secondhalf]]
train = big_df_firsthalf[big_df_firsthalf.columns[big_df_firsthalf.loc['day']>1]]
y = train.loc['score'].copy()
X = train.iloc[[1,2,3,4,6,7,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32],:].copy().T
test = big_df_secondhalf.iloc[[1,2,3,4,6,7,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32],:].T
y_test = big_df_secondhalf.loc['score'].copy()

In [100]:
clf = RandomForestClassifier(n_estimators=20)
clf.fit(X,list(y.values))

RFC_pred = clf.predict(test).astype(int)
RFC_mat = confusion_matrix(y_test.astype(int),RFC_pred)

RFC_score = float(RFC_mat[0,0]+RFC_mat[1,1]+RFC_mat[2,2])/np.sum(RFC_mat)
print RFC_score

0.422222222222
