In [67]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [68]:
# Every CSV lives in the same folder and follows the naming pattern
# '<league-slug>-<teams|matches>-<season>-stats.csv', so the folder and the
# season are defined once here instead of being repeated in twelve paths.
DATA_DIR = '../../footy_data'
SEASON = '2021-to-2022'


def load_league(league_slug, data_dir=DATA_DIR, season=SEASON):
    """Read the team-level and match-level CSVs for one league.

    Parameters
    ----------
    league_slug : str
        Leading part of the file name, e.g. 'england-premier-league'.
    data_dir : str
        Folder that contains the CSV files.
    season : str
        Season tag embedded in the file names.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The '<slug>-teams-...' frame followed by the '<slug>-matches-...' frame.
    """
    team_stats = pd.read_csv(f'{data_dir}/{league_slug}-teams-{season}-stats.csv')
    games = pd.read_csv(f'{data_dir}/{league_slug}-matches-{season}-stats.csv')
    return team_stats, games


prem_team_stats, prem_games = load_league('england-premier-league')
champ_team_stats, champ_games = load_league('england-championship')
league_one_team_stats, league_one_games = load_league('england-efl-league-one')
league_two_team_stats, league_two_games = load_league('england-efl-league-two')
segunda_team_stats, segunda_games = load_league('spain-segunda-division')
laliga_team_stats, laliga_games = load_league('spain-la-liga')

# Data Cleaning and Additions

In [69]:
# Number of completed Premier League games, plus 10
int((prem_games['status'] == 'complete').sum()) + 10

130

In [70]:
# prem_games = prem_games.loc[prem_games['status'] == 'complete']
# champ_games = champ_games.loc[champ_games['status'] == 'complete']
# league_one_games = league_one_games.loc[league_one_games['status'] == 'complete']
# league_two_games = league_two_games.loc[league_two_games['status'] == 'complete']
# segunda_games = segunda_games.loc[segunda_games['status'] == 'complete']
# laliga_games = laliga_games.loc[laliga_games['status'] == 'complete']

# Not applying these filters yet: we need more than the completed games -- the unplayed ones are the games we want to predict.

In [71]:
# Setting up to fix xg and xga: add empty placeholder columns to every
# team-stats frame so the xg helper functions have a column to write into.
for team_stats_df in (
    champ_team_stats,
    league_one_team_stats,
    league_two_team_stats,
    segunda_team_stats,
    prem_team_stats,
    laliga_team_stats,
):
    team_stats_df['real_xg'] = ""
    team_stats_df['real_xga'] = ""

In [72]:
# This cell was a verbatim repeat of the placeholder setup in the cell above.
# It now only creates a placeholder column when it is missing, so running it
# (or re-running it later) can no longer wipe values that were already filled.
for team_stats_df in (
    champ_team_stats,
    league_one_team_stats,
    league_two_team_stats,
    segunda_team_stats,
    prem_team_stats,
    laliga_team_stats,
):
    for placeholder in ('real_xg', 'real_xga'):
        if placeholder not in team_stats_df.columns:
            team_stats_df[placeholder] = ""

In [73]:
def xg_fixer(team_name, team_stats, league_games):
    """Write one team's average xG per match into ``team_stats['real_xg']``.

    The xG total is the sum of ``team_a_xg`` over the games where the team is
    the home side plus the sum of ``team_b_xg`` over the games where it is the
    away side. That total is divided by the team's ``matches_played`` value.

    Parameters
    ----------
    team_name : value matched against ``common_name`` / ``home_team_name`` /
        ``away_team_name``.
    team_stats : DataFrame with ``common_name``, ``matches_played`` and an
        existing ``real_xg`` column. Modified in place.
    league_games : DataFrame with ``home_team_name``, ``away_team_name``,
        ``team_a_xg`` and ``team_b_xg``.

    Returns
    -------
    None
    """
    played_at_home = league_games['home_team_name'] == team_name
    played_away = league_games['away_team_name'] == team_name
    xg_sum = league_games.loc[played_at_home, 'team_a_xg'].sum()
    xg_sum = xg_sum + league_games.loc[played_away, 'team_b_xg'].sum()

    is_this_team = team_stats['common_name'] == team_name
    matches_played = team_stats.loc[is_this_team, 'matches_played'].iloc[0]
    xg_per_match = xg_sum / matches_played

    # Only this team's row changes; every other row keeps its current value.
    team_stats['real_xg'] = np.where(is_this_team, xg_per_match, team_stats['real_xg'])

In [74]:
def xg_against_fixer(team_name, team_stats, league_games):
    """Write one team's average xG conceded per match into ``team_stats['real_xga']``.

    Mirrors the xG calculation but reads the opponent's column: ``team_b_xg``
    for the games where the team is the home side and ``team_a_xg`` for the
    games where it is the away side. The total is divided by the team's
    ``matches_played`` value.

    Parameters
    ----------
    team_name : value matched against ``common_name`` / ``home_team_name`` /
        ``away_team_name``.
    team_stats : DataFrame with ``common_name``, ``matches_played`` and an
        existing ``real_xga`` column. Modified in place.
    league_games : DataFrame with ``home_team_name``, ``away_team_name``,
        ``team_a_xg`` and ``team_b_xg``.

    Returns
    -------
    None
    """
    played_at_home = league_games['home_team_name'] == team_name
    played_away = league_games['away_team_name'] == team_name
    # Opponent columns are used here because we want the xG conceded.
    xga_sum = league_games.loc[played_at_home, 'team_b_xg'].sum()
    xga_sum = xga_sum + league_games.loc[played_away, 'team_a_xg'].sum()

    is_this_team = team_stats['common_name'] == team_name
    matches_played = team_stats.loc[is_this_team, 'matches_played'].iloc[0]
    xga_per_match = xga_sum / matches_played

    # Only this team's row changes; every other row keeps its current value.
    team_stats['real_xga'] = np.where(is_this_team, xga_per_match, team_stats['real_xga'])

In [75]:
# Filling correct xg / xga for the Championship, League One and League Two.
# The loop walks over the team names themselves instead of range(0, 24), so it
# does not rely on each frame having exactly 24 rows or a default 0..n-1 index.
for league_team_stats, league_games_df in (
    (champ_team_stats, champ_games),
    (league_one_team_stats, league_one_games),
    (league_two_team_stats, league_two_games),
):
    # .tolist() snapshots the names before the helpers start writing to the frame.
    for team_name in league_team_stats['common_name'].tolist():
        xg_fixer(team_name, league_team_stats, league_games_df)
        xg_against_fixer(team_name, league_team_stats, league_games_df)

In [76]:
# Filling correct xg / xga for the Segunda Division.
# Iterating over the team names avoids hard-coding the league size (22) and
# does not depend on the frame having a default 0..n-1 index.
for team_name in segunda_team_stats['common_name'].tolist():
    xg_fixer(team_name, segunda_team_stats, segunda_games)
    xg_against_fixer(team_name, segunda_team_stats, segunda_games)

In [77]:
# Filling correct xg / xga for the Premier League and La Liga.
# The loop walks over the team names themselves instead of range(0, 20), so it
# does not rely on each frame having exactly 20 rows or a default 0..n-1 index.
for league_team_stats, league_games_df in (
    (prem_team_stats, prem_games),
    (laliga_team_stats, laliga_games),
):
    # .tolist() snapshots the names before the helpers start writing to the frame.
    for team_name in league_team_stats['common_name'].tolist():
        xg_fixer(team_name, league_team_stats, league_games_df)
        xg_against_fixer(team_name, league_team_stats, league_games_df)

In [78]:
def add_over_under(games_df):
    """Add an ``'over/under'`` column to ``games_df`` in place.

    The column holds 1 when ``home_team_goal_count + away_team_goal_count`` is
    greater than 2.5 and 0 otherwise. Nothing is returned.
    """
    goals_in_game = games_df['home_team_goal_count'] + games_df['away_team_goal_count']
    is_over = goals_in_game > 2.5
    games_df['over/under'] = np.where(is_over, 1, 0)

In [79]:
# Preview the Premier League games frame before the 'over/under' column is added
prem_games

Unnamed: 0,timestamp,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),...,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no,stadium_name
0,1628881200,Aug 13 2021 - 7:00pm,complete,16479.0,Brentford,Arsenal,Michael Oliver,1,0.00,0.00,...,3.90,3.40,2.05,1.43,2.20,3.75,7.25,1.95,2.00,Brentford Community Stadium (Brentford- Middle...
1,1628940600,Aug 14 2021 - 11:30am,complete,,Manchester United,Leeds United,Paul Tierney,1,0.00,0.00,...,1.62,4.15,5.25,1.25,1.69,2.60,4.55,1.71,2.25,Old Trafford (Manchester)
2,1628949600,Aug 14 2021 - 2:00pm,complete,,Burnley,Brighton & Hove Albion,David Coote,1,0.00,0.00,...,3.20,3.00,2.31,1.43,2.35,4.40,8.25,2.10,1.83,Turf Moor (Burnley)
3,1628949600,Aug 14 2021 - 2:00pm,complete,,Chelsea,Crystal Palace,Jonathan Moss,1,0.00,0.00,...,1.27,5.40,9.40,1.40,1.67,2.59,7.75,2.65,1.54,Stamford Bridge (London)
4,1628949600,Aug 14 2021 - 2:00pm,complete,,Everton,Southampton,Andy Madley,1,0.00,0.00,...,1.84,3.65,3.85,1.47,1.85,3.20,8.00,2.10,1.83,Goodison Park (Liverpool)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1653231600,May 22 2022 - 3:00pm,incomplete,,Crystal Palace,Manchester United,,38,1.67,1.67,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Selhurst Park (London)
376,1653231600,May 22 2022 - 3:00pm,incomplete,,Leicester City,Southampton,,38,1.17,0.83,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,King Power Stadium (Leicester- Leicestershire)
377,1653231600,May 22 2022 - 3:00pm,incomplete,,Liverpool,Wolverhampton Wanderers,,38,2.00,1.67,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Anfield (Liverpool)
378,1653231600,May 22 2022 - 3:00pm,incomplete,,Manchester City,Aston Villa,,38,2.17,0.50,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Etihad Stadium (Manchester)


In [80]:
# Filling every league's games frame with the over/under column
for league_games_df in (
    prem_games,
    champ_games,
    league_one_games,
    league_two_games,
    laliga_games,
    segunda_games,
):
    add_over_under(league_games_df)

In [81]:
# Display the frame again to check that 'over/under' now appears as the last column
prem_games

Unnamed: 0,timestamp,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),...,odds_ft_draw,odds_ft_away_team_win,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no,stadium_name,over/under
0,1628881200,Aug 13 2021 - 7:00pm,complete,16479.0,Brentford,Arsenal,Michael Oliver,1,0.00,0.00,...,3.40,2.05,1.43,2.20,3.75,7.25,1.95,2.00,Brentford Community Stadium (Brentford- Middle...,0
1,1628940600,Aug 14 2021 - 11:30am,complete,,Manchester United,Leeds United,Paul Tierney,1,0.00,0.00,...,4.15,5.25,1.25,1.69,2.60,4.55,1.71,2.25,Old Trafford (Manchester),1
2,1628949600,Aug 14 2021 - 2:00pm,complete,,Burnley,Brighton & Hove Albion,David Coote,1,0.00,0.00,...,3.00,2.31,1.43,2.35,4.40,8.25,2.10,1.83,Turf Moor (Burnley),1
3,1628949600,Aug 14 2021 - 2:00pm,complete,,Chelsea,Crystal Palace,Jonathan Moss,1,0.00,0.00,...,5.40,9.40,1.40,1.67,2.59,7.75,2.65,1.54,Stamford Bridge (London),1
4,1628949600,Aug 14 2021 - 2:00pm,complete,,Everton,Southampton,Andy Madley,1,0.00,0.00,...,3.65,3.85,1.47,1.85,3.20,8.00,2.10,1.83,Goodison Park (Liverpool),1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1653231600,May 22 2022 - 3:00pm,incomplete,,Crystal Palace,Manchester United,,38,1.67,1.67,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Selhurst Park (London),0
376,1653231600,May 22 2022 - 3:00pm,incomplete,,Leicester City,Southampton,,38,1.17,0.83,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,King Power Stadium (Leicester- Leicestershire),0
377,1653231600,May 22 2022 - 3:00pm,incomplete,,Liverpool,Wolverhampton Wanderers,,38,2.00,1.67,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Anfield (Liverpool),0
378,1653231600,May 22 2022 - 3:00pm,incomplete,,Manchester City,Aston Villa,,38,2.17,0.50,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Etihad Stadium (Manchester),0


# Fill Stats 

In [82]:
# Creating new (empty) columns where our xg and corner data will go -- these
# need to exist in every games frame before the per-team values are filled in.
game_stat_columns = (
    'home_xg',
    'away_xg',
    'home_xg_against',
    'away_xg_against',
    'home_total_corners',
    'away_total_corners',
)

for league_games_df in (
    champ_games,
    laliga_games,
    segunda_games,
    league_one_games,
    league_two_games,
    prem_games,
):
    for stat_column in game_stat_columns:
        league_games_df[stat_column] = ""

In [84]:
# Per-league lookup dictionaries: team common_name -> average stat.
# Dictionaries make it easy to look a team's value up inside fill_stats;
# the values eventually go into the matching *_games frame.
prem_xg_stats = dict(prem_team_stats[['common_name', 'real_xg']].values)
prem_corner_stats = dict(prem_team_stats[['common_name', 'corners_per_match']].values)
prem_xga_stats = dict(prem_team_stats[['common_name', 'real_xga']].values)

champ_xg_stats = dict(champ_team_stats[['common_name', 'real_xg']].values)
champ_corner_stats = dict(champ_team_stats[['common_name', 'corners_per_match']].values)
champ_xga_stats = dict(champ_team_stats[['common_name', 'real_xga']].values)

league_one_xg_stats = dict(league_one_team_stats[['common_name', 'real_xg']].values)
league_one_corner_stats = dict(league_one_team_stats[['common_name', 'corners_per_match']].values)
league_one_xga_stats = dict(league_one_team_stats[['common_name', 'real_xga']].values)

league_two_xg_stats = dict(league_two_team_stats[['common_name', 'real_xg']].values)
league_two_corner_stats = dict(league_two_team_stats[['common_name', 'corners_per_match']].values)
league_two_xga_stats = dict(league_two_team_stats[['common_name', 'real_xga']].values)

laliga_xg_stats = dict(laliga_team_stats[['common_name', 'real_xg']].values)
# Fixed: the corner and xga lookups for La Liga were both built from 'real_xg',
# which put xG values into the corner and xG-against columns of laliga_games.
laliga_corner_stats = dict(laliga_team_stats[['common_name', 'corners_per_match']].values)
laliga_xga_stats = dict(laliga_team_stats[['common_name', 'real_xga']].values)

segunda_xg_stats = dict(segunda_team_stats[['common_name', 'real_xg']].values)
segunda_corner_stats = dict(segunda_team_stats[['common_name', 'corners_per_match']].values)
segunda_xga_stats = dict(segunda_team_stats[['common_name', 'real_xga']].values)

In [85]:
def fill_stats(team, games, xg_stats, xga_stats, corner_stats):
    """Copy one team's stats into the rows of ``games`` where that team plays.

    For each of the three lookups, the team's value is written to the
    ``home_*`` column on rows where ``home_team_name == team`` and to the
    ``away_*`` column on rows where ``away_team_name == team``. All other rows
    keep their current values. ``games`` is modified in place.

    Parameters
    ----------
    team : key used in the three lookups and matched against the team-name columns.
    games : DataFrame that already has the six ``home_*`` / ``away_*`` stat columns.
    xg_stats : mapping team -> value for ``home_xg`` / ``away_xg``.
    xga_stats : mapping team -> value for ``home_xg_against`` / ``away_xg_against``.
    corner_stats : mapping team -> value for ``home_total_corners`` / ``away_total_corners``.

    Returns
    -------
    None
    """
    lookups_by_suffix = (
        ('xg', xg_stats),
        ('xg_against', xga_stats),
        ('total_corners', corner_stats),
    )
    for suffix, lookup in lookups_by_suffix:
        for side in ('home', 'away'):
            target_column = f'{side}_{suffix}'
            team_on_this_side = games[f'{side}_team_name'] == team
            games[target_column] = np.where(team_on_this_side, lookup[team], games[target_column])

In [86]:
# Copy every team's stats into the games frames of the Championship,
# League One and League Two. Iterating over the team names (rather than
# range(0, 24)) avoids hard-coding the league size and does not depend on the
# team-stats frames having a default 0..n-1 index.
for league_team_stats, league_games_df, league_xg, league_xga, league_corners in (
    (champ_team_stats, champ_games, champ_xg_stats, champ_xga_stats, champ_corner_stats),
    (league_one_team_stats, league_one_games, league_one_xg_stats, league_one_xga_stats, league_one_corner_stats),
    (league_two_team_stats, league_two_games, league_two_xg_stats, league_two_xga_stats, league_two_corner_stats),
):
    for team_name in league_team_stats['common_name'].tolist():
        fill_stats(team_name, league_games_df, league_xg, league_xga, league_corners)

22 Team Leagues

In [94]:
# Copy every Segunda Division team's stats into segunda_games. Iterating over
# the team names avoids hard-coding the league size (22) and does not depend
# on the team-stats frame having a default 0..n-1 index.
for team_name in segunda_team_stats['common_name'].tolist():
    fill_stats(team_name, segunda_games, segunda_xg_stats, segunda_xga_stats, segunda_corner_stats)

20 Team Leagues

In [95]:
# Copy every team's stats into the La Liga and Premier League games frames.
# Iterating over the team names (rather than range(0, 20)) avoids hard-coding
# the league size and does not depend on the team-stats frames having a
# default 0..n-1 index.
for league_team_stats, league_games_df, league_xg, league_xga, league_corners in (
    (laliga_team_stats, laliga_games, laliga_xg_stats, laliga_xga_stats, laliga_corner_stats),
    (prem_team_stats, prem_games, prem_xg_stats, prem_xga_stats, prem_corner_stats),
):
    for team_name in league_team_stats['common_name'].tolist():
        fill_stats(team_name, league_games_df, league_xg, league_xga, league_corners)

# Prem Modeling

In [110]:
# Using only the completed games in the model:
# X holds the four xG feature columns, y the over/under label.
features = ['home_xg', 'away_xg', 'home_xg_against', 'away_xg_against']
X = prem_games.loc[prem_games['status'] == 'complete', features]
y = prem_games.loc[prem_games['status'] == 'complete', 'over/under']

In [111]:
# Seeded random train/test split of the completed games selected above.
# NOTE(review): test_size = 0.9 holds out 90% of the rows and trains on only 10% -- confirm
# this is intended. The following cell also reassigns X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.9, random_state = 42)

In [99]:
features = ['home_xg', 'home_xg_against', 'away_xg', 'away_xg_against']
GAMES_PER_WEEK = 10  # 10 games/week -- the size of the batch we want to predict

# Train on the matches that have already been played and predict the next
# GAMES_PER_WEEK matches that have not.
#
# Rows are selected by their status instead of by position. The previous
# slices ([0:n_completed] and [n_completed:n_completed + 10]) are only correct
# when every completed game sits above every unplayed game in the frame; any
# unplayed row earlier in the file would end up in the training data.
is_complete = prem_games['status'] == 'complete'
played_games = prem_games.loc[is_complete]
upcoming_games = prem_games.loc[~is_complete].head(GAMES_PER_WEEK)

X_train = played_games[features]
y_train = played_games['over/under']
X_test = upcoming_games[features]
# NOTE(review): these labels come from the goal-count columns of games that are not
# complete, so they are presumably placeholders rather than real results -- confirm
# before scoring predictions against y_test.
y_test = upcoming_games['over/under']

In [102]:
# Candidate classifiers for the over/under target, all with default settings.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
# The tree is seeded so that repeated fits give the same model
# (42 matches the seed already used for train_test_split in this notebook).
dt = DecisionTreeClassifier(random_state=42)

In [103]:
# Inspect the labels of the test rows
y_test

120    0
121    0
122    0
123    0
124    0
125    0
126    0
127    0
128    0
129    0
Name: over/under, dtype: int32

In [104]:
# Fit the logistic regression on the training rows
logreg.fit(X_train, y_train)

LogisticRegression()

In [105]:
# Predict the over/under label (1 = over 2.5 goals, 0 = under) for the test rows
preds = logreg.predict(X_test)

In [106]:
# Share of predictions that equal y_test.
# NOTE(review): the y_test shown earlier is 0 for every row (120-129); those rows look like
# games that have not been played yet, so this score may only measure how often the model
# predicts "under" rather than real accuracy -- confirm.
accuracy_score(y_test, preds)

0.1

In [107]:
# Inspect the raw predictions (1 = over 2.5 goals, 0 = under)
preds

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1])