All data used here come from Sports Reference / FBref, which sources its xG figures from StatsBomb.

In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, \
LassoCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.decomposition import PCA

In [52]:
# Load the Premier League results table (one row per fixture) from the local data folder.
pl_results_2021 = pd.read_csv(filepath_or_buffer='./data/pl_results2021.csv')

In [53]:
# Binary target: 1 when the two scores add up to more than 2.5 goals, otherwise 0.
# NOTE(review): a missing score makes the comparison False, so any fixture
# without a result is labelled 0 here.
total_goals = pl_results_2021['home_score'] + pl_results_2021['away_score']
pl_results_2021['over/under'] = (total_goals > 2.5).astype(int)
pl_results_2021.head()

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,home_score,away_score,Venue,Referee,Match Report,Notes,over/under
0,1,Sat,2020-09-12,12:30 (07:30),Fulham,0.2,0–3,1.8,Arsenal,0.0,3.0,Craven Cottage,Chris Kavanagh,Match Report,,1
1,1,Sat,2020-09-12,15:00 (10:00),Crystal Palace,0.7,1–0,0.8,Southampton,1.0,0.0,Selhurst Park,Jonathan Moss,Match Report,,0
2,1,Sat,2020-09-12,17:30 (12:30),Liverpool,3.3,4–3,0.6,Leeds United,4.0,3.0,Anfield,Michael Oliver,Match Report,,1
3,1,Sat,2020-09-12,20:00 (15:00),West Ham,1.1,0–2,1.5,Newcastle Utd,0.0,2.0,London Stadium,Stuart Attwell,Match Report,,0
4,1,Sun,2020-09-13,14:00 (09:00),West Brom,0.5,0–3,2.2,Leicester City,0.0,3.0,The Hawthorns,Anthony Taylor,Match Report,,1


In [54]:
# The two expected-goals columns arrive as 'xG' (listed next to the home side)
# and 'xG.1' (next to the away side); give them unambiguous names.
xg_column_names = {'xG': 'home_xG', 'xG.1': 'away_xG'}
pl_results_2021 = pl_results_2021.rename(columns=xg_column_names)

In [55]:
# One-hot encode both team columns in a single pass. The text columns 'Home'
# and 'Away' are replaced by 'Home_<team>' indicators followed by
# 'Away_<team>' indicators, appended after the remaining columns.
pl_results_2021 = pd.get_dummies(pl_results_2021, columns=['Home', 'Away'])

In [56]:
# Rank columns by their correlation with the over/under target (top 20).
# The frame still holds text columns (Day, Date, Venue, ...), and recent pandas
# versions raise instead of silently dropping them inside DataFrame.corr(), so
# restrict the calculation to numeric and boolean columns explicitly.
# home_score / away_score define the target, so their high ranking is expected.
numeric_columns = pl_results_2021.select_dtypes(include=['number', 'bool'])
numeric_columns.corr()['over/under'].sort_values(ascending=False).head(20)

over/under             1.000000
away_score             0.557345
home_score             0.526173
away_xG                0.253113
home_xG                0.197142
Away_West Ham          0.091812
Away_Leeds United      0.091812
Home_Newcastle Utd     0.067651
Home_Manchester Utd    0.067651
Home_Leicester City    0.067651
Home_West Brom         0.043490
Away_Crystal Palace    0.043490
Home_Aston Villa       0.043490
Away_Leicester City    0.043490
Away_Manchester Utd    0.019329
Away_Liverpool         0.019329
Away_Arsenal           0.019329
Away_Aston Villa       0.019329
Home_West Ham          0.019329
Away_Tottenham         0.019329
Name: over/under, dtype: float64

We could include each team's average xG and average xGA, since both will be available before a match is played. These averages may need to be weighted by the strength of the opponents each team has faced so far. Doing this will require merging the teams' season stats into these results.

In [57]:
# Create the two average-xG feature columns that later cells fill in per team.
# Start from NaN rather than an empty string so the columns are float dtype
# from the outset, and a row that never receives a value shows up as missing
# instead of carrying a '' that cannot be converted to a number.
pl_results_2021['homexg'] = np.nan
pl_results_2021['awayxg'] = np.nan

In [58]:
# Inspect the column names to confirm the one-hot team indicators that exist.
pl_results_2021.columns

Index(['Wk', 'Day', 'Date', 'Time', 'home_xG', 'Score', 'away_xG',
       'home_score', 'away_score', 'Venue', 'Referee', 'Match Report', 'Notes',
       'over/under', 'Home_Arsenal', 'Home_Aston Villa', 'Home_Brighton',
       'Home_Burnley', 'Home_Chelsea', 'Home_Crystal Palace', 'Home_Everton',
       'Home_Fulham', 'Home_Leeds United', 'Home_Leicester City',
       'Home_Liverpool', 'Home_Manchester City', 'Home_Manchester Utd',
       'Home_Newcastle Utd', 'Home_Sheffield Utd', 'Home_Southampton',
       'Home_Tottenham', 'Home_West Brom', 'Home_West Ham', 'Home_Wolves',
       'Away_Arsenal', 'Away_Aston Villa', 'Away_Brighton', 'Away_Burnley',
       'Away_Chelsea', 'Away_Crystal Palace', 'Away_Everton', 'Away_Fulham',
       'Away_Leeds United', 'Away_Leicester City', 'Away_Liverpool',
       'Away_Manchester City', 'Away_Manchester Utd', 'Away_Newcastle Utd',
       'Away_Sheffield Utd', 'Away_Southampton', 'Away_Tottenham',
       'Away_West Brom', 'Away_West Ham', 'Away_Wolv

In [59]:
# Average xG per team (StatsBomb figures via FBref), keyed by the team name
# used in the one-hot column suffixes.
# NOTE(review): the same figure is applied whether the team plays home or away.
TEAM_AVG_XG = {
    'Arsenal': 1.41,
    'Aston Villa': 1.40,
    'Brighton': 1.36,
    'Burnley': 1.04,
    'Chelsea': 1.68,
    'Crystal Palace': 0.86,
    'Everton': 1.24,
    'Fulham': 1.07,
    'Leeds United': 1.51,
    'Leicester City': 1.47,
    'Liverpool': 1.91,
    'Manchester City': 1.93,
    'Manchester Utd': 1.58,
    'Newcastle Utd': 1.08,
    'Sheffield Utd': 0.83,
    'Southampton': 1.11,
    'Tottenham': 1.44,
    'West Brom': 0.89,
    'West Ham': 1.42,
    'Wolves': 1.05,
}

# For every team, write its average xG into 'homexg' on the rows where that
# team's 'Home_<team>' indicator is set.
for team, avg_xg in TEAM_AVG_XG.items():
    is_home_team = pl_results_2021[f'Home_{team}'] == 1
    pl_results_2021.loc[is_home_team, 'homexg'] = avg_xg

In [60]:
# Average xG per team (StatsBomb figures via FBref), keyed by the team name
# used in the one-hot column suffixes.
# NOTE(review): the same figure is applied whether the team plays home or away.
TEAM_AVG_XG = {
    'Arsenal': 1.41,
    'Aston Villa': 1.40,
    'Brighton': 1.36,
    'Burnley': 1.04,
    'Chelsea': 1.68,
    'Crystal Palace': 0.86,
    'Everton': 1.24,
    'Fulham': 1.07,
    'Leeds United': 1.51,
    'Leicester City': 1.47,
    'Liverpool': 1.91,
    'Manchester City': 1.93,
    'Manchester Utd': 1.58,
    'Newcastle Utd': 1.08,
    'Sheffield Utd': 0.83,
    'Southampton': 1.11,
    'Tottenham': 1.44,
    'West Brom': 0.89,
    'West Ham': 1.42,
    'Wolves': 1.05,
}

# For every team, write its average xG into 'awayxg' on the rows where that
# team's 'Away_<team>' indicator is set.
for team, avg_xg in TEAM_AVG_XG.items():
    is_away_team = pl_results_2021[f'Away_{team}'] == 1
    pl_results_2021.loc[is_away_team, 'awayxg'] = avg_xg

In [61]:
# Re-check the column names before selecting the model features.
pl_results_2021.columns

Index(['Wk', 'Day', 'Date', 'Time', 'home_xG', 'Score', 'away_xG',
       'home_score', 'away_score', 'Venue', 'Referee', 'Match Report', 'Notes',
       'over/under', 'Home_Arsenal', 'Home_Aston Villa', 'Home_Brighton',
       'Home_Burnley', 'Home_Chelsea', 'Home_Crystal Palace', 'Home_Everton',
       'Home_Fulham', 'Home_Leeds United', 'Home_Leicester City',
       'Home_Liverpool', 'Home_Manchester City', 'Home_Manchester Utd',
       'Home_Newcastle Utd', 'Home_Sheffield Utd', 'Home_Southampton',
       'Home_Tottenham', 'Home_West Brom', 'Home_West Ham', 'Home_Wolves',
       'Away_Arsenal', 'Away_Aston Villa', 'Away_Brighton', 'Away_Burnley',
       'Away_Chelsea', 'Away_Crystal Palace', 'Away_Everton', 'Away_Fulham',
       'Away_Leeds United', 'Away_Leicester City', 'Away_Liverpool',
       'Away_Manchester City', 'Away_Manchester Utd', 'Away_Newcastle Utd',
       'Away_Sheffield Utd', 'Away_Southampton', 'Away_Tottenham',
       'Away_West Brom', 'Away_West Ham', 'Away_Wolv

In [62]:
# Model inputs: every one-hot team indicator ('Home_<team>' then 'Away_<team>',
# in frame order) followed by the two average-xG columns. Deriving the names
# from the frame keeps the list in step with the encoded columns instead of
# relying on a hand-typed copy.
team_indicator_columns = [
    column for column in pl_results_2021.columns
    if column.startswith(('Home_', 'Away_'))
]
features = team_indicator_columns + ['homexg', 'awayxg']

# Only keep fixtures that have a final score. The over/under label is built
# from a comparison that yields 0 when a score is missing, so rows without a
# result would otherwise be trained on as if they were real "under" outcomes.
played = pl_results_2021[['home_score', 'away_score']].notna().all(axis=1)

# Cast to float so the indicator and xG columns share one numeric dtype.
X = pl_results_2021.loc[played, features].astype(float)
y = pl_results_2021.loc[played, 'over/under']

TODO: store each team's average stats in a dictionary, then use `.loc` to fill those averages in for every team.

TODO: find a way to merge the team-level stats into the results table so that every match row carries the full set of stats for both teams.

In [63]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [64]:
# Baseline classifier with scikit-learn's default settings.
logreg = LogisticRegression()

In [65]:
# Fit the baseline model on the training split only.
logreg.fit(X_train, y_train)

LogisticRegression()

In [66]:
# Mean accuracy on the training split.
logreg.score(X_train, y_train)

0.6541353383458647

In [67]:
# Mean accuracy on the held-out split; compare with the training score to gauge overfitting.
logreg.score(X_test, y_test)

0.49122807017543857

In [68]:
# Predicted over/under class (0 or 1) for each held-out match, using the
# feature columns in the same order the model was trained on.
test_inputs = X_test.loc[:, features]
preds = logreg.predict(test_inputs)

In [69]:
# Accuracy of the held-out predictions; this matches logreg.score(X_test, y_test).
accuracy_score(y_test, preds)

0.49122807017543857

In [70]:
# Display the predicted labels for the held-out matches.
preds

array([0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0])

In [71]:
# Display the true labels for the held-out matches (indexed by original row number).
y_test

266    0
261    0
265    0
39     1
33     0
      ..
137    1
332    1
351    1
310    1
167    0
Name: over/under, Length: 114, dtype: int32

ready to push

next steps