In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.special import logit, expit
import statsmodels.formula.api as smf
import statsmodels.api as sm
from collections import defaultdict
from sklearn.metrics import brier_score_loss
pd.set_option('display.max_columns', 50)

In [2]:
# Load the match-level results table. index_col=False tells pandas not to use
# the first CSV column as the row index, so rows get a default integer index.
df = pd.read_csv(
    filepath_or_buffer='soccer18.csv',
    index_col=False,
)

In [3]:
# Put the matches in Date order; the running-total loop in the next cell walks
# the rows positionally, so row order decides which games count as "earlier".
# NOTE(review): assumes Date sorts chronologically (ISO strings or datetimes) - confirm.
df = df.sort_values(by=['Date'])

# Keep each row's index label as an explicit match identifier column.
df.loc[:, 'GameID'] = df.index

# Full-time goal difference seen from the home side and from the away side.
df['HGD'] = df['FTHG'].sub(df['FTAG'])
df['AGD'] = df['FTAG'].sub(df['FTHG'])

In [4]:
# For every match (in the current row order of `df`) record each side's
# cumulative goals scored, games played and goal difference as they stood
# *before* that match, then fold the match itself into the running totals.
n_games = len(df)
goalsh, goalsa, gamesh, gamesa, gdhome, gdaway = (np.empty(n_games) for _ in range(6))

# Per-team accumulators; a team seen for the first time starts at 0.
total_score = defaultdict(int)
total_games = defaultdict(int)
total_gd = defaultdict(int)

fixtures = zip(df['HomeTeam'], df['AwayTeam'], df['FTHG'], df['FTAG'], df['HGD'], df['AGD'])
for i, (curr_home, curr_away, home_goals_ft, away_goals_ft, home_margin, away_margin) in enumerate(fixtures):
    # Snapshot the pre-match state first so the current game never leaks into
    # its own features.
    goalsh[i], goalsa[i] = total_score[curr_home], total_score[curr_away]
    gamesh[i], gamesa[i] = total_games[curr_home], total_games[curr_away]
    gdhome[i], gdaway[i] = total_gd[curr_home], total_gd[curr_away]

    # Now account for this match.
    total_score[curr_home] += home_goals_ft
    total_score[curr_away] += away_goals_ft
    total_games[curr_home] += 1
    total_games[curr_away] += 1
    total_gd[curr_home] += home_margin
    total_gd[curr_away] += away_margin

In [5]:
# Attach the pre-match running totals (built in the previous cell) as columns.
pregame_columns = {
    'home_goals': goalsh,
    'away_goals': goalsa,
    'home_games': gamesh,
    'away_games': gamesa,
    'home_gd': gdhome,
    'away_gd': gdaway,
}
for column_name, pregame_values in pregame_columns.items():
    df[column_name] = pregame_values

# Average goal difference per game played so far. A team with no earlier games
# has 0 / 0 here, which pandas evaluates to NaN.
df['home_agd'] = df['home_gd'].div(df['home_games'])
df['away_agd'] = df['away_gd'].div(df['away_games'])
df['agd_diff'] = (df['home_agd'] - df['away_agd']).abs()

# Replace the NaNs from the 0 / 0 cases with 0 (agd_diff included, since it is
# computed before the fill).
# NOTE(review): fillna(0) is applied to the whole frame, so missing values in
# any other column are zero-filled as well - confirm that is intended.
df = df.fillna(0)

# Binary target: 1 when the home side scored more full-time goals, else 0
# (draws and away wins are both 0).
df['Home_win'] = df['FTHG'].sub(df['FTAG']).gt(0).astype(int)

# Split on Y: rows below 18 are used for fitting, rows equal to 18 for scoring.
# NOTE(review): Y looks like a two-digit season year - confirm.
dfhw = df.loc[df['Y'].lt(18)]
df18 = df.loc[df['Y'].eq(18)]

## 1.A.i

In [6]:
# 1.A.i - the seven pre-2018 matches with the largest gap between the two
# sides' pre-match average goal difference.
#
# The display columns are selected into a temporary view instead of being
# assigned back to `dfhw`. Rebinding `dfhw` to this nine-column subset would
# drop `Home_win` (and every other column) from the fitting frame, which is
# only safe if the split is rebuilt before any model is fit.
agd_display_cols = ['Div','Y','HomeTeam','AwayTeam','home_agd','away_agd','agd_diff','home_games','away_games']
dfhw[agd_display_cols].nlargest(7, 'agd_diff')

Unnamed: 0,Div,Y,HomeTeam,AwayTeam,home_agd,away_agd,agd_diff,home_games,away_games
5326,Ligue_1,14,Evian Thonon Gaillard,Paris SG,-3.5,1.0,4.5,2.0,2.0
7214,Serie_A,14,Sassuolo,Sampdoria,-3.5,1.0,4.5,2.0,2.0
6464,Ligue_1,17,Strasbourg,Lille,-4.0,0.078261,4.078261,1.0,115.0
1910,La_Liga,14,Cordoba,Celta,-2.0,2.0,4.0,1.0,1.0
1912,La_Liga,14,Elche,Granada,-3.0,1.0,4.0,1.0,1.0
7197,Serie_A,14,Empoli,Roma,-2.0,2.0,4.0,1.0,1.0
7212,Serie_A,14,Palermo,Inter,-0.5,3.5,4.0,2.0,2.0


## 1.A.ii

In [7]:
# 1.A.ii - same ranking, restricted to matches where both sides already have
# at least 100 recorded games, so the averages are not driven by tiny samples.
both_experienced = dfhw['home_games'].ge(100) & dfhw['away_games'].ge(100)
df1aii = dfhw.loc[
    both_experienced,
    ['Div','Y','HomeTeam','AwayTeam','home_agd','away_agd','agd_diff','home_games','away_games'],
]
df1aii.nlargest(7, 'agd_diff')

Unnamed: 0,Div,Y,HomeTeam,AwayTeam,home_agd,away_agd,agd_diff,home_games,away_games
2940,La_Liga,16,Granada,Barcelona,-0.875,2.192308,3.067308,104.0,104.0
3393,La_Liga,17,Levante,Barcelona,-0.705357,2.14,2.845357,112.0,150.0
3008,La_Liga,16,Granada,Real Madrid,-0.936937,1.9,2.836937,111.0,110.0
3293,La_Liga,17,Las Palmas,Barcelona,-0.623762,2.208633,2.832395,101.0,139.0
3370,La_Liga,17,La Coruna,Barcelona,-0.621622,2.142857,2.764479,148.0,147.0
2921,La_Liga,16,La Coruna,Barcelona,-0.519608,2.22549,2.745098,102.0,102.0
3190,La_Liga,17,Barcelona,La Coruna,2.186047,-0.527132,2.713178,129.0,129.0


## 1.A.iii

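Strasbourg was a newly promoted team with no earlier Ligue 1 matches in this dataset. They lost their first game by four goals (a goal difference of -4), so their average goal difference going into this match was -4.0 from a single game, whereas Lille's average over 115 games was close to zero (about 0.08).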

## 1.B.i

In [9]:
# Derive the fitting and scoring frames from the full `df`, so both carry every
# column (including Home_win) regardless of any column selection done on `dfhw`
# in earlier cells.
season_col = df['Y']
dfhw = df.loc[season_col.lt(18)]   # rows with Y below 18: used to fit the models
df18 = df.loc[season_col.eq(18)]   # rows with Y equal to 18: used to score them

In [10]:
# 1.B.i - baseline: intercept-only logistic regression (binomial GLM with the
# default logit link), i.e. one constant home-win probability for every match.
baseline_model = smf.glm(
    formula='Home_win ~ 1',
    data=dfhw,
    family=sm.families.Binomial(),
)
result = baseline_model.fit()
result.summary()

0,1,2,3
Dep. Variable:,Home_win,No. Observations:,7304.0
Model:,GLM,Df Residuals:,7303.0
Model Family:,Binomial,Df Model:,0.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-5037.4
Date:,"Wed, 17 Feb 2021",Deviance:,10075.0
Time:,21:09:33,Pearson chi2:,7300.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1669,0.023,-7.106,0.000,-0.213,-0.121


## 1.B.ii

In [11]:
# 1.B.ii - Brier score of the baseline model's predicted home-win
# probabilities on the held-out Y == 18 matches (lower is better).
baseline_prob = result.predict(df18)
print(brier_score_loss(df18['Home_win'], baseline_prob))

0.2473559477379797


## 1.D

In [12]:
# 1.D - logistic regression of home win on each side's pre-match average goal
# difference, fit on the same rows as the baseline.
agd_model = smf.glm(
    formula='Home_win~ home_agd + away_agd',
    data=dfhw,
    family=sm.families.Binomial(),
)
result2 = agd_model.fit()
result2.summary()

0,1,2,3
Dep. Variable:,Home_win,No. Observations:,7304.0
Model:,GLM,Df Residuals:,7301.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4606.5
Date:,"Wed, 17 Feb 2021",Deviance:,9212.9
Time:,21:09:40,Pearson chi2:,7350.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1791,0.025,-7.183,0.000,-0.228,-0.130
home_agd,0.7853,0.039,20.128,0.000,0.709,0.862
away_agd,-0.7619,0.040,-19.082,0.000,-0.840,-0.684


In [13]:
# Brier score of the average-goal-difference model on the held-out Y == 18
# matches, for comparison with the baseline score above.
agd_prob = result2.predict(df18)
print(brier_score_loss(df18['Home_win'], agd_prob))

0.21726101075298784
