In [162]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, \
LassoCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA

In [163]:
champ_results_2021 = pd.read_csv('./data/Champ_results_2021.csv')
champ_results_2020 = pd.read_csv('./data/Champ_results_2020.csv')
champ_results_2019 = pd.read_csv('./data/Champ_results_2019.csv')
champ_results_2018 = pd.read_csv('./data/Champ_results_2018.csv')
champ_results_2017 = pd.read_csv('./data/Champ_results_2017.csv')

pl_results_2021 = pd.read_csv('./data/pl_results2021.csv')
pl_results_2020 = pd.read_csv('./data/pl_results2020.csv')
pl_results_2019 = pd.read_csv('./data/pl_results2019.csv')
pl_results_2018 = pd.read_csv('./data/pl_results2018.csv')
pl_results_2017 = pd.read_csv('./data/pl_results2017.csv')

All of the data used came from Sports Reference, or Football Reference

In [164]:
champ_results_2021 = champ_results_2021.drop(['Score'],axis=1)

In [165]:
champ_results_2020 = champ_results_2020.drop(['Score'],axis=1)

In [185]:
def rename_XG(season):
    season = season.rename(columns={'xG':'home_xG', 'xG.1':'away_xG'})
    

In [186]:
rename_XG(pl_results_2018)

In [187]:
pl_results_2018

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,home_score,away_score,Away,Attendance,Venue,Referee,Match Report,Notes,over/under
0,1,Fri,2017-08-11,19:45 (14:45),Arsenal,2.1,4–3,1.6,4,3,Leicester City,59387,Emirates Stadium,Mike Dean,Match Report,,1
1,1,Sat,2017-08-12,12:30 (07:30),Watford,2.0,3–3,3.0,3,3,Liverpool,20407,Vicarage Road Stadium,Anthony Taylor,Match Report,,1
2,1,Sat,2017-08-12,15:00 (10:00),West Brom,1.2,1–0,0.4,1,0,Bournemouth,25011,The Hawthorns,Robert Madley,Match Report,,0
3,1,Sat,2017-08-12,15:00 (10:00),Everton,0.4,1–0,0.3,1,0,Stoke City,39045,Goodison Park,Niel Swarbrick,Match Report,,0
4,1,Sat,2017-08-12,15:00 (10:00),Southampton,2.1,0–0,0.4,0,0,Swansea City,31447,St. Mary's Stadium,Mike Jones,Match Report,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,38,Sun,2018-05-13,15:00 (10:00),Southampton,0.5,0–1,1.2,0,1,Manchester City,31882,St. Mary's Stadium,Andre Marriner,Match Report,,0
376,38,Sun,2018-05-13,15:00 (10:00),Huddersfield,1.6,0–1,2.0,0,1,Arsenal,24122,The John Smith's Stadium,Michael Oliver,Match Report,,0
377,38,Sun,2018-05-13,15:00 (10:00),Tottenham,1.9,5–4,1.3,5,4,Leicester City,77841,Wembley Stadium,Craig Pawson,Match Report,,1
378,38,Sun,2018-05-13,15:00 (10:00),Manchester Utd,0.8,1–0,0.6,1,0,Watford,75049,Old Trafford,Lee Mason,Match Report,,0


I'd like to do a bit of analysis on if there still existed a home field advantage, even with no attendance

# Home vs Away EFL Championship Results

In [169]:
draw = champ_results_2021[champ_results_2021.home_score == champ_results_2021.away_score]
away_win = champ_results_2021[champ_results_2021.home_score < champ_results_2021.away_score]
home_win = champ_results_2021[champ_results_2021.home_score > champ_results_2021.away_score]
print(f'There were {len(draw)} draws in the 20-21 Championship season, or {(len(draw)/(len(champ_results_2021))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 20-21 Championship season, or {(len(away_win)/(len(champ_results_2021)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 20-21 Championship season, or {(len(home_win)/(len(champ_results_2021)))*100} percent of games')

There were 139 draws in the 20-21 Championship season, or 25.181159420289855 percent of games
The away team won 186 games in the 20-21 Championship season, or 33.69565217391305 percent of games
The home team won 227 games in the 20-21 Championship season, or 41.1231884057971 percent of games


In [170]:
draw = champ_results_2020[champ_results_2020.home_score == champ_results_2020.away_score]
away_win = champ_results_2020[champ_results_2020.home_score < champ_results_2020.away_score]
home_win = champ_results_2020[champ_results_2020.home_score > champ_results_2020.away_score]
print(f'There were {len(draw)} draws in the 19-20 Championship season, or {(len(draw)/(len(champ_results_2020))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 19-20 Championship season, or {(len(away_win)/(len(champ_results_2020)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 19-20 Championship season, or {(len(home_win)/(len(champ_results_2020)))*100} percent of games')

There were 149 draws in the 19-20 Championship season, or 26.992753623188403 percent of games
The away team won 173 games in the 19-20 Championship season, or 31.34057971014493 percent of games
The home team won 230 games in the 19-20 Championship season, or 41.66666666666667 percent of games


In [171]:
draw = champ_results_2019[champ_results_2019.home_score == champ_results_2019.away_score]
away_win = champ_results_2019[champ_results_2019.home_score < champ_results_2019.away_score]
home_win = champ_results_2019[champ_results_2019.home_score > champ_results_2019.away_score]
print(f'There were {len(draw)} draws in the 18-19 Championship season, or {(len(draw)/(len(champ_results_2019))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 18-19 Championship season, or {(len(away_win)/(len(champ_results_2019)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 18-19 Championship season, or {(len(home_win)/(len(champ_results_2019)))*100} percent of games')

There were 162 draws in the 18-19 Championship season, or 29.347826086956523 percent of games
The away team won 150 games in the 18-19 Championship season, or 27.173913043478258 percent of games
The home team won 240 games in the 18-19 Championship season, or 43.47826086956522 percent of games


In [172]:
draw = champ_results_2018[champ_results_2018.home_score == champ_results_2018.away_score]
away_win = champ_results_2018[champ_results_2018.home_score < champ_results_2018.away_score]
home_win = champ_results_2018[champ_results_2018.home_score > champ_results_2018.away_score]
print(f'There were {len(draw)} draws in the 17-18 Championship season, or {(len(draw)/(len(champ_results_2018))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 17-18 Championship season, or {(len(away_win)/(len(champ_results_2018)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 17-18 Championship season, or {(len(home_win)/(len(champ_results_2018)))*100} percent of games')

There were 148 draws in the 17-18 Championship season, or 26.811594202898554 percent of games
The away team won 166 games in the 17-18 Championship season, or 30.07246376811594 percent of games
The home team won 238 games in the 17-18 Championship season, or 43.11594202898551 percent of games


In [173]:
draw = champ_results_2017[champ_results_2017.home_score == champ_results_2017.away_score]
away_win = champ_results_2017[champ_results_2017.home_score < champ_results_2017.away_score]
home_win = champ_results_2017[champ_results_2017.home_score > champ_results_2017.away_score]
print(f'There were {len(draw)} draws in the 16-17 Championship season, or {(len(draw)/(len(champ_results_2017))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 16-17 Championship season, or {(len(away_win)/(len(champ_results_2017)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 16-17 Championship season, or {(len(home_win)/(len(champ_results_2017)))*100} percent of games')

There were 130 draws in the 16-17 Championship season, or 23.55072463768116 percent of games
The away team won 160 games in the 16-17 Championship season, or 28.985507246376812 percent of games
The home team won 262 games in the 16-17 Championship season, or 47.46376811594203 percent of games


# English Premier League results

In [174]:
draw = pl_results_2021[pl_results_2021.home_score == pl_results_2021.away_score]
away_win = pl_results_2021[pl_results_2021.home_score < pl_results_2021.away_score]
home_win = pl_results_2021[pl_results_2021.home_score > pl_results_2021.away_score]
print(f'There were {len(draw)} draws in the 20-21 PL season, or {(len(draw)/(len(pl_results_2021))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 20-21 PL season, or {(len(away_win)/(len(pl_results_2021)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 20-21 PL season, or {(len(home_win)/(len(pl_results_2021)))*100} percent of games')

#This needs to be updated to include final week

There were 83 draws in the 20-21 PL season, or 21.842105263157897 percent of games
The away team won 150 games in the 20-21 PL season, or 39.473684210526315 percent of games
The home team won 137 games in the 20-21 PL season, or 36.05263157894737 percent of games


In [175]:
draw = pl_results_2020[pl_results_2020.home_score == pl_results_2020.away_score]
away_win = pl_results_2020[pl_results_2020.home_score < pl_results_2020.away_score]
home_win = pl_results_2020[pl_results_2020.home_score > pl_results_2020.away_score]
print(f'There were {len(draw)} draws in the 19-20 PL season, or {(len(draw)/(len(pl_results_2020))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 19-20 PL season, or {(len(away_win)/(len(pl_results_2020)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 19-20 PL season, or {(len(home_win)/(len(pl_results_2020)))*100} percent of games')

There were 92 draws in the 19-20 PL season, or 24.210526315789473 percent of games
The away team won 116 games in the 19-20 PL season, or 30.526315789473685 percent of games
The home team won 172 games in the 19-20 PL season, or 45.26315789473684 percent of games


In [176]:
draw = pl_results_2019[pl_results_2019.home_score == pl_results_2019.away_score]
away_win = pl_results_2019[pl_results_2019.home_score < pl_results_2019.away_score]
home_win = pl_results_2019[pl_results_2019.home_score > pl_results_2019.away_score]
print(f'There were {len(draw)} draws in the 18-19 PL season, or {(len(draw)/(len(pl_results_2019))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 18-19 PL season, or {(len(away_win)/(len(pl_results_2019)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 18-19 PL season, or {(len(home_win)/(len(pl_results_2019)))*100} percent of games')

There were 71 draws in the 18-19 PL season, or 18.684210526315788 percent of games
The away team won 128 games in the 18-19 PL season, or 33.68421052631579 percent of games
The home team won 181 games in the 18-19 PL season, or 47.63157894736842 percent of games


In [177]:
draw = pl_results_2018[pl_results_2018.home_score == pl_results_2018.away_score]
away_win = pl_results_2018[pl_results_2018.home_score < pl_results_2018.away_score]
home_win = pl_results_2018[pl_results_2018.home_score > pl_results_2018.away_score]
print(f'There were {len(draw)} draws in the 17-18 PL season, or {(len(draw)/(len(pl_results_2018))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 17-18 PL season, or {(len(away_win)/(len(pl_results_2018)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 17-18 PL season, or {(len(home_win)/(len(pl_results_2018)))*100} percent of games')

There were 99 draws in the 17-18 PL season, or 26.052631578947366 percent of games
The away team won 108 games in the 17-18 PL season, or 28.421052631578945 percent of games
The home team won 173 games in the 17-18 PL season, or 45.526315789473685 percent of games


In [178]:
draw = pl_results_2017[pl_results_2017.home_score == pl_results_2017.away_score]
away_win = pl_results_2017[pl_results_2017.home_score < pl_results_2017.away_score]
home_win = pl_results_2017[pl_results_2017.home_score > pl_results_2017.away_score]
print(f'There were {len(draw)} draws in the 16-17 PL season, or {(len(draw)/(len(pl_results_2017))*100)} percent of games')
print(f'The away team won {len(away_win)} games in the 16-17 PL season, or {(len(away_win)/(len(pl_results_2017)))*100} percent of games')
print(f'The home team won {len(home_win)} games in the 16-17 PL season, or {(len(home_win)/(len(pl_results_2017)))*100} percent of games')

There were 84 draws in the 16-17 PL season, or 22.105263157894736 percent of games
The away team won 109 games in the 16-17 PL season, or 28.68421052631579 percent of games
The home team won 187 games in the 16-17 PL season, or 49.21052631578947 percent of games


In [179]:
pl_results_2017.columns

Index(['Wk', 'Day', 'Date', 'Time', 'Home', 'Score', 'Away', 'home_score',
       'away_score', 'Attendance', 'Venue', 'Referee', 'Match Report',
       'Notes'],
      dtype='object')

# How many goals are we seeing in these games?

In [180]:
total_goals = pl_results_2017['away_score'].sum() + pl_results_2017['home_score'].sum()
total_goals

1064

Adding over or under column, over will be 1, under will be 0

In [181]:
def add_ou(season):
    season['over/under'] = np.where(season['home_score'] + season['away_score'] > 2.5, 1, 0)
    return season

In [182]:
add_ou(pl_results_2017)
add_ou(pl_results_2018)
add_ou(pl_results_2019)
add_ou(pl_results_2020)
add_ou(pl_results_2021)
add_ou(champ_results_2017)
add_ou(champ_results_2018)
add_ou(champ_results_2019)
add_ou(champ_results_2020)
add_ou(champ_results_2021)

Unnamed: 0,Wk,Day,Date,Time,Home,Away,home_score,away_score,Venue,Referee,over/under
0,1,Fri,2020-09-11,19:45 (14:45),Watford,Middlesbrough,1,0,Vicarage Road Stadium,Keith Stroud,0
1,1,Sat,2020-09-12,12:30 (07:30),Wycombe,Rotherham Utd,0,1,Adams Park,James Linington,0
2,1,Sat,2020-09-12,12:30 (07:30),Birmingham City,Brentford,1,0,St Andrew's Trillion Trophy Stadium,Tony Harrington,0
3,1,Sat,2020-09-12,15:00 (10:00),Bournemouth,Blackburn,3,2,Vitality Stadium,Gavin Ward,1
4,1,Sat,2020-09-12,15:00 (10:00),Huddersfield,Norwich City,0,1,The John Smith's Stadium,Geoff Eltringham,0
...,...,...,...,...,...,...,...,...,...,...,...
547,46,Sat,2021-05-08,12:30 (07:30),Coventry City,Millwall,6,1,St Andrew's Trillion Trophy Stadium,Michael Salisbury,1
548,46,Sat,2021-05-08,12:30 (07:30),Cardiff City,Rotherham Utd,1,1,Cardiff City Stadium,Graham Scott,0
549,46,Sat,2021-05-08,12:30 (07:30),Middlesbrough,Wycombe,0,3,Riverside Stadium,Martin Atkinson,1
550,46,Sat,2021-05-08,12:30 (07:30),Bristol City,Brentford,1,3,Ashton Gate Stadium,James Linington,1


In [183]:
pl_results_2017['over/under'].value_counts()
pl_results_2018['over/under'].value_counts()
pl_results_2019['over/under'].value_counts()
pl_results_2020['over/under'].value_counts()
pl_results_2021['over/under'].value_counts()

champ_results_2017['over/under'].value_counts()
champ_results_2018['over/under'].value_counts()
champ_results_2019['over/under'].value_counts()
champ_results_2020['over/under'].value_counts()
champ_results_2021['over/under'].value_counts()

#can add some more to clean this up

0    326
1    226
Name: over/under, dtype: int64

In [184]:
pl_results_2018

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,home_score,away_score,Away,Attendance,Venue,Referee,Match Report,Notes,over/under
0,1,Fri,2017-08-11,19:45 (14:45),Arsenal,2.1,4–3,1.6,4,3,Leicester City,59387,Emirates Stadium,Mike Dean,Match Report,,1
1,1,Sat,2017-08-12,12:30 (07:30),Watford,2.0,3–3,3.0,3,3,Liverpool,20407,Vicarage Road Stadium,Anthony Taylor,Match Report,,1
2,1,Sat,2017-08-12,15:00 (10:00),West Brom,1.2,1–0,0.4,1,0,Bournemouth,25011,The Hawthorns,Robert Madley,Match Report,,0
3,1,Sat,2017-08-12,15:00 (10:00),Everton,0.4,1–0,0.3,1,0,Stoke City,39045,Goodison Park,Niel Swarbrick,Match Report,,0
4,1,Sat,2017-08-12,15:00 (10:00),Southampton,2.1,0–0,0.4,0,0,Swansea City,31447,St. Mary's Stadium,Mike Jones,Match Report,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,38,Sun,2018-05-13,15:00 (10:00),Southampton,0.5,0–1,1.2,0,1,Manchester City,31882,St. Mary's Stadium,Andre Marriner,Match Report,,0
376,38,Sun,2018-05-13,15:00 (10:00),Huddersfield,1.6,0–1,2.0,0,1,Arsenal,24122,The John Smith's Stadium,Michael Oliver,Match Report,,0
377,38,Sun,2018-05-13,15:00 (10:00),Tottenham,1.9,5–4,1.3,5,4,Leicester City,77841,Wembley Stadium,Craig Pawson,Match Report,,1
378,38,Sun,2018-05-13,15:00 (10:00),Manchester Utd,0.8,1–0,0.6,1,0,Watford,75049,Old Trafford,Lee Mason,Match Report,,0
