In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Location and season of the exported stats files; change these in one place
# to point the notebook at another folder or season.
DATA_DIR = '../../footy_data'
SEASON = '2021-to-2022'


def load_league(league_slug):
    """Read the team-level and match-level stats CSVs for one league.

    Parameters
    ----------
    league_slug : str
        File-name prefix of the league, e.g. ``'england-premier-league'``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(team_stats, games)`` read from
        ``<DATA_DIR>/<league_slug>-teams-<SEASON>-stats.csv`` and
        ``<DATA_DIR>/<league_slug>-matches-<SEASON>-stats.csv``.
    """
    team_stats = pd.read_csv(f'{DATA_DIR}/{league_slug}-teams-{SEASON}-stats.csv')
    games = pd.read_csv(f'{DATA_DIR}/{league_slug}-matches-{SEASON}-stats.csv')
    return team_stats, games


prem_team_stats, prem_games = load_league('england-premier-league')
champ_team_stats, champ_games = load_league('england-championship')
league_one_team_stats, league_one_games = load_league('england-efl-league-one')
league_two_team_stats, league_two_games = load_league('england-efl-league-two')
segunda_team_stats, segunda_games = load_league('spain-segunda-division')
laliga_team_stats, laliga_games = load_league('spain-la-liga')

# Data Cleaning and Additions

In [3]:
# Scratch check: number of completed Premier League games plus 10, i.e. the
# upper row bound used when slicing out the next 10 fixtures to predict.
int(prem_games['status'].eq('complete').sum()) + 10

200

In [4]:
# prem_games = prem_games.loc[prem_games['status'] == 'complete']
# champ_games = champ_games.loc[champ_games['status'] == 'complete']
# league_one_games = league_one_games.loc[league_one_games['status'] == 'complete']
# league_two_games = league_two_games.loc[league_two_games['status'] == 'complete']
# segunda_games = segunda_games.loc[segunda_games['status'] == 'complete']
# laliga_games = laliga_games.loc[laliga_games['status'] == 'complete']

# NOTE: this filter is deliberately left disabled for now -- the frames must keep the unplayed fixtures as well, because those are the games we want to predict.

In [5]:
# Set up the columns that will hold the corrected per-match xG / xGA averages.
# NaN is used as the placeholder (instead of an empty string) so the columns
# stay numeric once the averages are written into them.
for league_team_stats in (
    champ_team_stats,
    league_one_team_stats,
    league_two_team_stats,
    segunda_team_stats,
    prem_team_stats,
    laliga_team_stats,
):
    league_team_stats['real_xg'] = np.nan
    league_team_stats['real_xga'] = np.nan

In [6]:
# This cell repeats the placeholder initialisation of the previous cell; it
# simply resets both columns on every league's team-stats frame.
# NaN is used as the placeholder (instead of an empty string) so the columns
# stay numeric once the averages are written into them.
for league_team_stats in (
    champ_team_stats,
    league_one_team_stats,
    league_two_team_stats,
    segunda_team_stats,
    prem_team_stats,
    laliga_team_stats,
):
    league_team_stats['real_xg'] = np.nan
    league_team_stats['real_xga'] = np.nan

In [7]:
def xg_fixer(team_name, team_stats, league_games):
    """Write one team's average xG per match into ``team_stats['real_xg']``.

    The team's xG is summed from ``team_a_xg`` over the games where it is the
    home team and from ``team_b_xg`` over the games where it is the away team,
    then divided by the team's ``matches_played`` value.

    Parameters
    ----------
    team_name : str
        Value matched against ``common_name`` / ``home_team_name`` /
        ``away_team_name``.
    team_stats : pandas.DataFrame
        Team table; its ``real_xg`` column is overwritten in place (only the
        row(s) of ``team_name`` receive a new value).
    league_games : pandas.DataFrame
        Match table for the same league.

    Returns
    -------
    None
    """
    xg_at_home = league_games.loc[league_games['home_team_name'] == team_name, 'team_a_xg'].sum()
    xg_away = league_games.loc[league_games['away_team_name'] == team_name, 'team_b_xg'].sum()

    # First matching row supplies the number of matches; raises IndexError if
    # the team is not present in team_stats.
    games_played = team_stats.loc[team_stats['common_name'] == team_name, 'matches_played'].iloc[0]
    average_xg = (xg_at_home + xg_away) / games_played

    is_this_team = team_stats['common_name'] == team_name
    team_stats['real_xg'] = np.where(is_this_team, average_xg, team_stats['real_xg'])


In [8]:
def xg_against_fixer(team_name, team_stats, league_games):
    """Write one team's average xG conceded per match into ``team_stats['real_xga']``.

    Mirror image of the xG calculation: the opponent's column is read, i.e.
    ``team_b_xg`` for the games where the team is at home and ``team_a_xg``
    for the games where it is away. The sum is divided by the team's
    ``matches_played`` value.

    Parameters
    ----------
    team_name : str
        Value matched against ``common_name`` / ``home_team_name`` /
        ``away_team_name``.
    team_stats : pandas.DataFrame
        Team table; its ``real_xga`` column is overwritten in place (only the
        row(s) of ``team_name`` receive a new value).
    league_games : pandas.DataFrame
        Match table for the same league.

    Returns
    -------
    None
    """
    conceded_at_home = league_games.loc[league_games['home_team_name'] == team_name, 'team_b_xg'].sum()
    conceded_away = league_games.loc[league_games['away_team_name'] == team_name, 'team_a_xg'].sum()

    # First matching row supplies the number of matches; raises IndexError if
    # the team is not present in team_stats.
    games_played = team_stats.loc[team_stats['common_name'] == team_name, 'matches_played'].iloc[0]
    average_xga = (conceded_at_home + conceded_away) / games_played

    is_this_team = team_stats['common_name'] == team_name
    team_stats['real_xga'] = np.where(is_this_team, average_xga, team_stats['real_xga'])

In [9]:
# Fill the corrected xG / xGA averages for the 24-team leagues.
# Looping over the team names themselves avoids hard-coding the league size
# and does not rely on the team-stats frames having a 0..n-1 index.
for league_team_stats, league_games in (
    (champ_team_stats, champ_games),
    (league_one_team_stats, league_one_games),
    (league_two_team_stats, league_two_games),
):
    for team_name in league_team_stats['common_name'].tolist():
        xg_fixer(team_name, league_team_stats, league_games)
        xg_against_fixer(team_name, league_team_stats, league_games)

In [10]:
# Fill the corrected xG / xGA averages for the 22-team league (Segunda).
# Looping over the team names themselves avoids hard-coding the league size
# and does not rely on the team-stats frame having a 0..n-1 index.
for team_name in segunda_team_stats['common_name'].tolist():
    xg_fixer(team_name, segunda_team_stats, segunda_games)
    xg_against_fixer(team_name, segunda_team_stats, segunda_games)

In [11]:
# Fill the corrected xG / xGA averages for the 20-team leagues.
# Looping over the team names themselves avoids hard-coding the league size
# and does not rely on the team-stats frames having a 0..n-1 index.
for league_team_stats, league_games in (
    (prem_team_stats, prem_games),
    (laliga_team_stats, laliga_games),
):
    for team_name in league_team_stats['common_name'].tolist():
        xg_fixer(team_name, league_team_stats, league_games)
        xg_against_fixer(team_name, league_team_stats, league_games)

In [12]:
def add_over_under(games_df):
    """Add an ``over/under`` label column to a match table, in place.

    The label is 1 when ``home_team_goal_count + away_team_goal_count`` is
    greater than 2.5 goals and 0 otherwise.

    Parameters
    ----------
    games_df : pandas.DataFrame
        Match table with ``home_team_goal_count`` and ``away_team_goal_count``.

    Returns
    -------
    None
    """
    goals_in_game = games_df['home_team_goal_count'] + games_df['away_team_goal_count']
    went_over = goals_in_game.gt(2.5)
    # Assign the raw array so the labels are written positionally
    games_df['over/under'] = went_over.astype(int).to_numpy()

In [13]:
# Preview the first three Premier League fixtures before the over/under label is added
prem_games.head(3)

Unnamed: 0,timestamp,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),...,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no,stadium_name
0,1628881200,Aug 13 2021 - 7:00pm,complete,16479.0,Brentford,Arsenal,Michael Oliver,1,0.0,0.0,...,3.9,3.4,2.05,1.43,2.2,3.75,7.25,1.95,2.0,Brentford Community Stadium (Brentford- Middle...
1,1628940600,Aug 14 2021 - 11:30am,complete,,Manchester United,Leeds United,Paul Tierney,1,0.0,0.0,...,1.62,4.15,5.25,1.25,1.69,2.6,4.55,1.71,2.25,Old Trafford (Manchester)
2,1628949600,Aug 14 2021 - 2:00pm,complete,,Burnley,Brighton & Hove Albion,David Coote,1,0.0,0.0,...,3.2,3.0,2.31,1.43,2.35,4.4,8.25,2.1,1.83,Turf Moor (Burnley)


In [14]:
# Add the over/under 2.5 goals label column to every league's match table
for league_games in (
    prem_games,
    champ_games,
    league_one_games,
    league_two_games,
    laliga_games,
    segunda_games,
):
    add_over_under(league_games)

In [15]:
# Preview again to confirm the new 'over/under' column is present
prem_games.head(3)

Unnamed: 0,timestamp,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),...,odds_ft_draw,odds_ft_away_team_win,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no,stadium_name,over/under
0,1628881200,Aug 13 2021 - 7:00pm,complete,16479.0,Brentford,Arsenal,Michael Oliver,1,0.0,0.0,...,3.4,2.05,1.43,2.2,3.75,7.25,1.95,2.0,Brentford Community Stadium (Brentford- Middle...,0
1,1628940600,Aug 14 2021 - 11:30am,complete,,Manchester United,Leeds United,Paul Tierney,1,0.0,0.0,...,4.15,5.25,1.25,1.69,2.6,4.55,1.71,2.25,Old Trafford (Manchester),1
2,1628949600,Aug 14 2021 - 2:00pm,complete,,Burnley,Brighton & Hove Albion,David Coote,1,0.0,0.0,...,3.0,2.31,1.43,2.35,4.4,8.25,2.1,1.83,Turf Moor (Burnley),1


# Fill Stats

In [16]:
# Create the columns in each games frame that will receive the teams' xG,
# xG-against and corner averages.
# NaN is used as the placeholder (instead of an empty string) so the columns
# stay numeric once the team averages are copied in.
STAT_COLUMNS = [
    'home_xg',
    'away_xg',
    'home_xg_against',
    'away_xg_against',
    'home_total_corners',
    'away_total_corners',
]

for league_games in (
    champ_games,
    laliga_games,
    segunda_games,
    league_one_games,
    league_two_games,
    prem_games,
):
    for stat_column in STAT_COLUMNS:
        league_games[stat_column] = np.nan

In [17]:
def _team_stat_lookup(team_stats, stat_column):
    """Return a ``{common_name: value}`` dict for one column of a team-stats frame."""
    return dict(team_stats[['common_name', stat_column]].values)


# Per-league lookups of each team's average xG, xG against and corners per
# match. Dictionaries keyed by team name are easy to index from fill_stats.
prem_xg_stats = _team_stat_lookup(prem_team_stats, 'real_xg')
prem_corner_stats = _team_stat_lookup(prem_team_stats, 'corners_per_match')
prem_xga_stats = _team_stat_lookup(prem_team_stats, 'real_xga')

champ_xg_stats = _team_stat_lookup(champ_team_stats, 'real_xg')
champ_corner_stats = _team_stat_lookup(champ_team_stats, 'corners_per_match')
champ_xga_stats = _team_stat_lookup(champ_team_stats, 'real_xga')

league_one_xg_stats = _team_stat_lookup(league_one_team_stats, 'real_xg')
league_one_corner_stats = _team_stat_lookup(league_one_team_stats, 'corners_per_match')
league_one_xga_stats = _team_stat_lookup(league_one_team_stats, 'real_xga')

league_two_xg_stats = _team_stat_lookup(league_two_team_stats, 'real_xg')
league_two_corner_stats = _team_stat_lookup(league_two_team_stats, 'corners_per_match')
league_two_xga_stats = _team_stat_lookup(league_two_team_stats, 'real_xga')

# La Liga: corners and xGA must come from their own columns, not from real_xg
laliga_xg_stats = _team_stat_lookup(laliga_team_stats, 'real_xg')
laliga_corner_stats = _team_stat_lookup(laliga_team_stats, 'corners_per_match')
laliga_xga_stats = _team_stat_lookup(laliga_team_stats, 'real_xga')

segunda_xg_stats = _team_stat_lookup(segunda_team_stats, 'real_xg')
segunda_corner_stats = _team_stat_lookup(segunda_team_stats, 'corners_per_match')
segunda_xga_stats = _team_stat_lookup(segunda_team_stats, 'real_xga')

In [18]:
def fill_stats(team, games, xg_stats, xga_stats, corner_stats):
    """Copy one team's averages onto every fixture it appears in, in place.

    For rows where ``team`` is the home side the ``home_*`` columns are set,
    and for rows where it is the away side the ``away_*`` columns are set;
    all other rows keep their current values.

    Parameters
    ----------
    team : str
        Team name; must be a key of all three lookup dicts.
    games : pandas.DataFrame
        Match table that already contains the six ``home_*`` / ``away_*``
        stat columns.
    xg_stats, xga_stats, corner_stats : dict
        ``{team name: value}`` lookups for xG, xG against and corners.

    Returns
    -------
    None
    """
    # (column holding the team name, column to fill, lookup supplying the value)
    assignments = (
        ('home_team_name', 'home_xg', xg_stats),
        ('away_team_name', 'away_xg', xg_stats),
        ('home_team_name', 'home_xg_against', xga_stats),
        ('away_team_name', 'away_xg_against', xga_stats),
        ('home_team_name', 'home_total_corners', corner_stats),
        ('away_team_name', 'away_total_corners', corner_stats),
    )
    for name_column, stat_column, lookup in assignments:
        games[stat_column] = np.where(games[name_column] == team, lookup[team], games[stat_column])

In [None]:
# Copy the team averages into the games frames of the 24-team leagues.
# Looping over the team names themselves avoids hard-coding the league size
# and does not rely on the team-stats frames having a 0..n-1 index.
for team_name in champ_team_stats['common_name'].tolist():
    fill_stats(team_name, champ_games, champ_xg_stats, champ_xga_stats, champ_corner_stats)

for team_name in league_one_team_stats['common_name'].tolist():
    fill_stats(team_name, league_one_games, league_one_xg_stats, league_one_xga_stats, league_one_corner_stats)

for team_name in league_two_team_stats['common_name'].tolist():
    fill_stats(team_name, league_two_games, league_two_xg_stats, league_two_xga_stats, league_two_corner_stats)

22 Team Leagues

In [None]:
# Looping over the team names themselves avoids hard-coding the league size
# and does not rely on the team-stats frame having a 0..n-1 index.
for team_name in segunda_team_stats['common_name'].tolist():
    fill_stats(team_name, segunda_games, segunda_xg_stats, segunda_xga_stats, segunda_corner_stats)

20 Team Leagues

In [None]:
# Looping over the team names themselves avoids hard-coding the league size
# and does not rely on the team-stats frames having a 0..n-1 index.
for team_name in laliga_team_stats['common_name'].tolist():
    fill_stats(team_name, laliga_games, laliga_xg_stats, laliga_xga_stats, laliga_corner_stats)

for team_name in prem_team_stats['common_name'].tolist():
    fill_stats(team_name, prem_games, prem_xg_stats, prem_xga_stats, prem_corner_stats)

# Prem Modeling

In [None]:
features = ['home_xg', 'away_xg', 'home_xg_against', 'away_xg_against']
X = prem_games.loc[prem_games['status'] == 'complete'][features]
y = prem_games.loc[prem_games['status'] == 'complete']['over/under']
#Using only the completed games in the model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

For now, I am going to table trying to predict a "matchweek's" games. There are too many cancellations happening and at the moment it is so unpredictable that continuing to write code for a matchweek is not working. Instead I will split all of the complete games and try to optimize the previous results from this season.

In [None]:
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

In [None]:
logreg.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = logreg.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

In [None]:
knn.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = knn.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

In [None]:
dt.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = dt.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

# Champ Modeling

In [None]:
features = ['home_xg', 'away_xg', 'home_xg_against', 'away_xg_against']
X = champ_games.loc[champ_games['status'] == 'complete'][features]
y = champ_games.loc[champ_games['status'] == 'complete']['over/under']
#Using only the completed games in the model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

In [None]:
logreg.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = logreg.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

In [None]:
knn.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = knn.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

In [None]:
dt.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = dt.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

# League One Modeling

In [None]:
features = ['home_xg', 'away_xg', 'home_xg_against', 'away_xg_against']
X = league_one_games.loc[league_one_games['status'] == 'complete'][features]
y = league_one_games.loc[league_one_games['status'] == 'complete']['over/under']
#Using only the completed games in the model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

In [None]:
logreg.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = logreg.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

In [None]:
knn.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = knn.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

In [None]:
dt.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = dt.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

# League Two Modeling

In [None]:
features = ['home_xg', 'away_xg', 'home_xg_against', 'away_xg_against']
X = league_two_games.loc[league_two_games['status'] == 'complete'][features]
y = league_two_games.loc[league_two_games['status'] == 'complete']['over/under']
#Using only the completed games in the model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

In [None]:
logreg.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = logreg.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

In [None]:
knn.fit(X_train, y_train)

In [None]:
preds = knn.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

In [None]:
dt.fit(X_train, y_train)

In [None]:
preds = dt.predict(X_test)
#Get our predictions

In [None]:
accuracy_score(y_test, preds)

# La Liga Modeling

In [None]:
features = ['home_xg', 'away_xg', 'home_xg_against', 'away_xg_against']
X = champ_games.loc[champ_games['status'] == 'complete'][features]
y = champ_games.loc[champ_games['status'] == 'complete']['over/under']
#Using only the completed games in the model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Instantiate the three candidate classifiers with default settings.
# Only logreg is fitted in this section; knn and dt are created but not used here.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

In [None]:
logreg.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = logreg.predict(X_test)
#Get our predictions

In [None]:
# Accuracy of the logistic regression predictions on the held-out games
accuracy_score(y_test, preds)

# Segunda Modeling

In [None]:
features = ['home_xg', 'away_xg', 'home_xg_against', 'away_xg_against']
X = segunda_games.loc[segunda_games['status'] == 'complete'][features]
y = segunda_games.loc[segunda_games['status'] == 'complete']['over/under']
#Using only the completed games in the model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Instantiate the three candidate classifiers with default settings.
# Only logreg is fitted in this section; knn and dt are created but not used here.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

In [None]:
logreg.fit(X_train, y_train)
#Fit it on the data

In [None]:
preds = logreg.predict(X_test)
#Get our predictions

In [None]:
# Accuracy of the logistic regression predictions on the held-out games
accuracy_score(y_test, preds)

# Predicting Future Games

Because certain games are currently being postponed, I am putting this part of the project on hold. In the meantime I will be optimizing the models and trying different modeling techniques, like kNN and Decision Trees, to see which works best. The general idea of this modeling is to split the data myself so that the test data consists of the matchweek's upcoming games, because those are the ones we want to predict.

In [None]:
features = ['home_xg', 'home_xg_against', 'away_xg', 'away_xg_against']

# Train on the matches that have already been played and test on the next
# 10 rows, i.e. the upcoming matchweek (10 games per week).
# NOTE(review): the positional slices assume the completed games are the first
# rows of prem_games -- confirm the file is ordered that way.
n_played = len(prem_games.loc[prem_games['status'] == 'complete'])
played_rows = prem_games[0:n_played]
upcoming_rows = prem_games[n_played:n_played + 10]

X_train = played_rows[features]
X_test = upcoming_rows[features]
y_train = played_rows['over/under']
y_test = upcoming_rows['over/under']

I'm also planning some more model optimization, such as cross-validation, and trying new modeling techniques like Naive Bayes and Random Forest. After that, I'd like to move on to predicting which team will win, engineering more features, etc.