# Imports
Imports required libraries

In [19]:
from __future__ import print_function, division
import os
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn as sklearn
from datetime import datetime
from sklearn.cross_validation import train_test_split, KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score
from numpy import random
import sqlite3
%matplotlib inline

# 1) Description of the dataset
The initial database has four tables: Countries, leagues, teams, team_atts, matches, players, player_atts.

Countries - The country_id for each match (which country the match took place in)

Leagues - The league_id for each match (as each country only has one league in this set, this is identical to country_id and not needed).

Teams - Contains the id, team_api_id, team_fifa_api_id, and name of each team. The team_fifa_api_id is what the id of each team is in the FIFA games (where player/team statistics are pulled from).

Team_atts - Contains attributes about each team: their playstyle, offense, defense, etc.

Matches - This contains the bulk of the data that we want. Contains information about: who the home/away team are, which league the game took place in, which season, the data, player statistics, and betting data.

Players - Contains information about each player: their id, age, team, stature

Player_atts - Has information on the attributes of each player: their skills, strengths, weaknesses, but most importantly their player rating.

--A side note, teams and players contain both a FIFA and non-FIFA key. Matches are joined to them by non-FIFA key.--

We will now read-in the tables from the sqlite database.

In [2]:
with sqlite3.connect('database.sqlite') as con:
    countries = pd.read_sql_query("SELECT * from Country", con)
    matches = pd.read_sql_query("SELECT * from Match", con)
    players = pd.read_sql_query("SELECT * from Player", con)
    player_atts = pd.read_sql_query("SELECT * from Player_Attributes", con)
    team_atts = pd.read_sql_query("SELECT * from Team_Attributes", con)
    teams = pd.read_sql_query("SELECT * from Team", con)


For now, we will currently only be examining betting data from bet365. Many of the betting sites included in the dataset are missing betting data on a large portion of the matches and bet365 has by far the most.

We will also drop attributes that will not be useful for what we want yet; these attributes are generally ingame statistics of the matches in general.

`league_id` is also dropped because we are not using the leagues table.

In [3]:
# Drops unneeded attributes
atts_to_drop = ['BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'GBD',
               'PSH', 'PSD', 'PSA', 'WHH', 'WHA', 'WHD', 'SJH', 'SJD', 'SJA', 'VCH',
               'VCD', 'VCA', 'GBH', 'GBA', 'BSH', 'BSD', 'BSA', 'shoton', 'goal', 'shotoff',
               'foulcommit', 'card', 'cross', 'corner', 'possession', 'league_id']
matches = matches.drop(atts_to_drop, axis=1) 
# Drops player formation values
matches = matches.drop(matches.columns[10:54], axis=1)

# Drops null values
matches = matches.dropna()
players = players.dropna()
player_atts = player_atts.dropna(subset = ['date', 'overall_rating', 'player_api_id'], axis = 0)
team_atts = team_atts.dropna()

# Replace home_team_goal and away_team_goal with home_win (0 = no, 1 = draw, 2 = yes)
def find_winner(row):
    if row['home_team_goal'] > row['away_team_goal']:
        return 2
    if row['home_team_goal'] == row['away_team_goal']:
        return 1
    if row['home_team_goal'] < row['away_team_goal']:
        return 0
    
matches.insert(9, 'match_result', -1)
matches['match_result'] = matches.apply(lambda x: find_winner(x), axis = 1)
matches = matches.drop(['home_team_goal', 'away_team_goal'], axis = 1)


Now, we separate matches by country.

In [4]:
england_matches = matches[(matches['country_id'] == 1729)]
france_matches = matches[(matches['country_id'] == 4769)]
germany_matches = matches[(matches['country_id'] == 7809)]
italy_matches = matches[(matches['country_id'] == 10257)]
netherlands_matches = matches[(matches['country_id'] == 13274)]
spain_matches = matches[(matches['country_id'] == 21518)]

Convert dates into datetime objects

In [5]:
matches['date'] = matches['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %X'))
player_atts['date'] = player_atts['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %X'))

We will take the dates and append the players' player_id with them (to make it easier to process), as player_atts contains different player rating values for each date they play.

In [6]:
#print(player_atts.loc[player_atts['player_api_id'] == 110189])
#matches.loc[3630, h_a_players] = matches.loc[3630, h_a_players].map(lambda x: find_rating_by_date(player_atts.loc[(player_atts['player_api_id'] == x)], matches.loc[3630,'date']))

In [7]:
h_a_players = ['home_player_1', 'home_player_2', 'home_player_3', 'home_player_4', 'home_player_5',
          'home_player_6', 'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10', 'home_player_11',
          'away_player_1', 'away_player_2', 'away_player_3', 'away_player_4', 'away_player_5', 'away_player_6',
          'away_player_7', 'away_player_8', 'away_player_9', 'away_player_10', 'away_player_11']

def process_row(row):
    matches.loc[row.name, h_a_players] = row[h_a_players].map(lambda x: find_rating_by_date(player_atts.loc[(player_atts['player_api_id'] == x)], row['date']))
def find_rating_by_date(rows, date):
    nearest_date = nearest(rows['date'], date) #The date we will use for the player ratings
    n = rows.loc[rows['date'] == nearest_date]['overall_rating']
    return rows.loc[rows['date'] == nearest_date]['overall_rating']
def nearest(items, pivot):
    return min(items, key=lambda x: abs(x-pivot))
matches.apply(lambda x: process_row(x), axis = 1)
matches.head

<bound method DataFrame.head of           id  country_id     season  stage       date  match_api_id  \
145      146           1  2008/2009     24 2009-02-27        493017   
153      154           1  2008/2009     25 2009-03-08        493025   
155      156           1  2008/2009     25 2009-03-07        493027   
162      163           1  2008/2009     26 2009-03-13        493034   
168      169           1  2008/2009     26 2009-03-14        493040   
173      174           1  2008/2009     27 2009-03-22        493045   
176      177           1  2008/2009     27 2009-03-21        493048   
189      190           1  2008/2009     29 2009-04-12        493061   
190      191           1  2008/2009     29 2009-04-10        493062   
219      220           1  2008/2009     31 2009-04-26        493082   
226      227           1  2008/2009     32 2009-05-02        493089   
229      230           1  2008/2009     32 2009-05-02        493092   
231      232           1  2008/2009     32 20

# 3) Betting Analysis

In [8]:
multipred = england_matches.drop(['date','season','B365H', 'B365A', 'B365D', 'match_result', 'country_id', 'id', 'match_api_id'], axis=1)
multitarget = england_matches['B365H']
kf = KFold(multitarget.size, n_folds = 10, shuffle= True, random_state = 0) 
lr =  LinearRegression(normalize = True)

result = cross_val_predict(lr, multipred, multitarget, cv = kf)

slope, intercept, r_value, p_value, std_err = stats.linregress(multitarget, result)

print("R^2: ", r_value**2)
print("RMSE: ", mean_squared_error(multitarget, result)**(1/2.0))
print("MAE: ", mean_absolute_error(multitarget, result))
print("CC: ", np.corrcoef(result, multitarget)[1][0])

R^2:  0.0222088554614
RMSE:  1.67900599579
MAE:  1.16370610022
CC:  0.149026358277




# 3) Win Analysis
Fans of soccer know that certain leagues are easier to predict than others. For example, the Spanish La Liga is notoriously top-heavy, with Real Madrid and Barcelona dominating the league for the last few decades. We will now use 

In [9]:
y_tr_eng = england_matches['match_result']
X_tr_eng = england_matches.drop(['match_result', 'date', 'season', 'B365H', 'B365A', 'B365D', 'match_result', 'country_id', 'id', 'match_api_id'], axis=1)
y_tr_spa = spain_matches['match_result']
X_tr_spa = spain_matches.drop(['match_result', 'date', 'season', 'B365H', 'B365A', 'B365D', 'match_result', 'country_id', 'id', 'match_api_id'], axis=1)
y_tr_ger = germany_matches['match_result']
X_tr_ger = germany_matches.drop(['match_result', 'date', 'season', 'B365H', 'B365A', 'B365D', 'match_result', 'country_id', 'id', 'match_api_id'], axis=1)
y_tr_ita = italy_matches['match_result']
X_tr_ita = italy_matches.drop(['match_result', 'date', 'season', 'B365H', 'B365A', 'B365D', 'match_result', 'country_id', 'id', 'match_api_id'], axis=1)


sets = zip([('English', X_tr_eng, y_tr_eng), ('Spanish', X_tr_spa, y_tr_spa),
            ('German', X_tr_ger, y_tr_ger), ('Italian', X_tr_ita, y_tr_ita)])


dc_eng = DummyClassifier(strategy = 'most_frequent')
dc_spa = DummyClassifier(strategy = 'most_frequent')
dc_ger = DummyClassifier(strategy = 'most_frequent')
dc_ita = DummyClassifier(strategy = 'most_frequent')

dc_eng.fit(X_tr_eng, y_tr_eng)
print("Baseline classification accuracy of English matches: %f" % (dc_eng.score(X_tr_eng, y_tr_eng)))
dc_spa.fit(X_tr_spa, y_tr_spa)
print("Baseline classification accuracy of Spanish matches: %f" % (dc_spa.score(X_tr_spa, y_tr_spa)))
dc_ger.fit(X_tr_ger, y_tr_ger)
print("Baseline classification accuracy of German matches: %f" % (dc_ger.score(X_tr_ger, y_tr_ger)))
dc_ita.fit(X_tr_ita, y_tr_ita)
print("Baseline classification accuracy of Italian matches: %f" % (dc_ita.score(X_tr_ita, y_tr_ita)))




Baseline classification accuracy of English matches: 0.455436
Baseline classification accuracy of Spanish matches: 0.484848
Baseline classification accuracy of German matches: 0.453895
Baseline classification accuracy of Italian matches: 0.465524


In [12]:
kf_eng = KFold(X_tr_eng.index.size, n_folds = 10, shuffle=True, random_state = 0)
kf_spa = KFold(X_tr_spa.index.size, n_folds = 10, shuffle=True, random_state = 0)
kf_ger = KFold(X_tr_ger.index.size, n_folds = 10, shuffle=True, random_state = 0)
kf_ita = KFold(X_tr_ita.index.size, n_folds = 10, shuffle=True, random_state = 0)

c_space = np.logspace(-5, 5, 20)
lorc_eng = LogisticRegressionCV(Cs = c_space, cv = kf_eng, solver='lbfgs')
lorc_eng.fit(X_tr_eng, y_tr_eng)
print("Classification accuracy of Logistic Regression on English matches: %f" % (lorc_eng.score(X_tr_eng, y_tr_eng)))
lorc_spa = LogisticRegressionCV(Cs = c_space, cv = kf_spa, solver='lbfgs')
lorc_spa.fit(X_tr_spa, y_tr_spa)
print("Classification accuracy of Logistic Regression on Spanish matches: %f" % (lorc_spa.score(X_tr_spa, y_tr_spa)))
lorc_ger = LogisticRegressionCV(Cs = c_space, cv = kf_ger, solver='lbfgs')
lorc_ger.fit(X_tr_ger, y_tr_ger)
print("Classification accuracy of Logistic Regression on German matches: %f" % (lorc_ger.score(X_tr_ger, y_tr_ger)))
lorc_ita = LogisticRegressionCV(Cs = c_space, cv = kf_ita, solver='lbfgs')
lorc_ita.fit(X_tr_ita, y_tr_ita)
print("Classification accuracy of Logistic Regression on Italian matches: %f" % (lorc_ita.score(X_tr_ita, y_tr_ita)))

Classification accuracy of Logistic Regression on English matches: 0.461850
Classification accuracy of Logistic Regression on Spanish matches: 0.506652
Classification accuracy of Logistic Regression on German matches: 0.464421
Classification accuracy of Logistic Regression on Italian matches: 0.479387


The logistic regression model performs negligibly better than the baseline classifier on the training set. Spanish matches, however, seem to be noticeably easier to predict (both in baseline and in logistic classifications)

In [25]:
eng_rbf = SVC(kernel = 'rbf')
spa_rbf = SVC(kernel = 'rbf')
ger_rbf = SVC(kernel = 'rbf')
ita_rbf = SVC(kernel = 'rbf')


print("Classification accuracy of RBF SVC on English matches: %f" % (np.mean(cross_val_score(eng_rbf, X_tr_eng, y_tr_eng, cv=kf_eng))))
print("Classification accuracy of RBF SVC on Spanish matches: %f" % (np.mean(cross_val_score(spa_rbf, X_tr_spa, y_tr_spa, cv=kf_spa))))
print("Classification accuracy of RBF SVC on German matches: %f" % (np.mean(cross_val_score(ger_rbf, X_tr_ger, y_tr_ger, cv=kf_ger))))
print("Classification accuracy of RBF SVC on Italian matches: %f" % (np.mean(cross_val_score(ita_rbf, X_tr_ita, y_tr_ita, cv=kf_ita))))

Classification accuracy of RBF SVC on English matches: 0.455434
Classification accuracy of RBF SVC on Spanish matches: 0.484823
Classification accuracy of RBF SVC on German matches: 0.453877
Classification accuracy of RBF SVC on Italian matches: 0.465520


RBF-kernel SVCs perform even worse than logistic regression, and approximately as well as the baseline estimator does.