In [5]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [6]:
import os
os.chdir("/content/gdrive/My Drive/")

In [7]:
import pandas as pd
import numpy as np
import glob
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
import re
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report, recall_score
import math

from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel

from sklearn.compose import make_column_transformer

import warnings
warnings.filterwarnings('ignore')


Create a dataset that each row calculates per team rather than just home and away e.g one big dataset but if home team = liverpool HWins10 will be the number of home wins for liverpool in the last 10.

In [9]:
#Read all datasets .csv files into one pandas dataframe
path = r'/content/gdrive/My Drive/Datasets/' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

#Filter out the betting stats columns for now
#Filter out the in games stats like fouls and cards for now also
df = frame.loc[: , :"Referee"]

#Encode the dataframe (possibly should just do separate labelencoders per string column)
le = preprocessing.LabelEncoder()
df[["HomeTeam", "AwayTeam"]] = df[["HomeTeam", "AwayTeam"]].apply(le.fit_transform)

individual_team_df = df.copy()

home_goals = []
away_goals = []
rivals_home = []
rivals_away = []

for row_num, row in individual_team_df.iterrows():
  #calculate the rolling sum of the last 5 games to assess home and away scoring tendencies
  individual_team_df["HGS5"] = individual_team_df.loc[individual_team_df['HomeTeam'] == row['HomeTeam']]["FTHG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["AGS5"] = individual_team_df.loc[individual_team_df['AwayTeam'] == row['AwayTeam']]["FTAG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["RivalsHGS5"] = individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) & (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTHG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["RivalsAGS5"] = individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) & (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTAG"].rolling(min_periods=1, window=5).sum()

  home_goals.append(individual_team_df.loc[row_num,"HGS5"])
  away_goals.append(individual_team_df.loc[row_num,"AGS5"])
  rivals_home.append(individual_team_df.loc[row_num,"RivalsHGS5"])
  rivals_away.append(individual_team_df.loc[row_num,"RivalsAGS5"])

individual_team_df['HGS5'] = home_goals
individual_team_df['AGS5'] = away_goals
individual_team_df['RivalsHGS5'] = rivals_home
individual_team_df['RivalsAGS5'] = rivals_away

#Calculate the average goals scored by home and away in their last 5 games aswell as the last 5 times they played eachother
individual_team_df["AHGS5"] = individual_team_df["HGS5"]/5
individual_team_df["AAGS5"] = individual_team_df["AGS5"]/5
individual_team_df["ARivalsHGS5"] = individual_team_df["RivalsHGS5"]/5
individual_team_df["ARivalsAGS5"] = individual_team_df["RivalsAGS5"]/5

#Calculate percentage home wins, draws and away wins in the last 5 games
individual_team_df["HWins5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["H"]/5
individual_team_df["AWins5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["A"]/5
individual_team_df["Draws5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["D"]/5

#Drop any rows that have a NaN value in it.
feature_set = individual_team_df.rename(columns={"FTR": "Result"})
feature_set = feature_set[["HomeTeam", "AwayTeam","AHGS5", "AAGS5","ARivalsHGS5","ARivalsAGS5", "HWins5","AWins5","Draws5", "Result"]]
feature_set = feature_set.dropna()
y = np.array(feature_set.loc[:, "Result"])
X = np.array(feature_set.loc[:,:"Draws5"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# models to test
names = [
    "Nearest Neighbors",
    "SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "GradientBoostingClassifier",
    "LinearDiscriminantAnalysis",
    "ExtraTreesClassifier",
    "BernoulliNB"
]

#Parameters found by extracting best_params_ after GridSearchCV()
classifiers = [
    KNeighborsClassifier(),
    SVC(C=100, gamma=0.0001, kernel = 'rbf'),
    GaussianProcessClassifier(kernel = 1**2 * Matern(length_scale=1, nu=1.5)),
    DecisionTreeClassifier(criterion = 'gini', max_depth=None),
    RandomForestClassifier(criterion = 'entropy', max_depth=3, n_estimators=10, max_features='sqrt'),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(var_smoothing = 0.01),
    QuadraticDiscriminantAnalysis(reg_param = 0.1),
    GradientBoostingClassifier(),
    LinearDiscriminantAnalysis(),
    ExtraTreeClassifier(),
    BernoulliNB()
]
f1_scores = []
accuracies = []
precisions = []
recalls = []
for name, clf in zip(names, classifiers):
  clf.fit(X_train, y_train)
  prediction = clf.predict(X_test)
  f1_scores.append("{}".format(round(f1_score(y_test, prediction, average="macro"),2)))
  accuracies.append("{}%".format(round(accuracy_score(y_test, prediction)*100,2)))
  precisions.append("{}%".format(round(precision_score(y_test, prediction, average="macro")*100,2)))
  recalls.append("{}%".format(round(recall_score(y_test, prediction, average="macro")*100,2)))
scores_dataframe = pd.DataFrame({'Classifiers': names, 'F1-Scores': f1_scores, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls}).sort_values(by=["F1-Scores"], ascending=False)
scores_dataframe = scores_dataframe.sort_values(by=["F1-Scores"], ascending = False)
print(scores_dataframe.to_latex(index=False))
display(scores_dataframe) 


\begin{tabular}{lllll}
\toprule
               Classifiers & F1-Scores & Accuracy & Precision & Recall \\
\midrule
GradientBoostingClassifier &      0.78 &   80.25\% &    78.46\% &  77.8\% \\
                  AdaBoost &      0.75 &   77.52\% &     75.5\% & 75.06\% \\
          Gaussian Process &      0.72 &   74.66\% &    72.57\% & 71.37\% \\
LinearDiscriminantAnalysis &      0.71 &   73.66\% &    71.32\% & 71.04\% \\
                       SVM &       0.7 &   74.16\% &    72.28\% &  69.9\% \\
                Neural Net &      0.69 &    71.8\% &    70.73\% & 67.97\% \\
                       QDA &      0.67 &   70.68\% &    68.38\% & 66.39\% \\
             Decision Tree &      0.65 &   67.33\% &    65.03\% & 65.15\% \\
             Random Forest &      0.61 &   66.34\% &    67.59\% & 60.11\% \\
      ExtraTreesClassifier &       0.6 &   62.73\% &    60.21\% & 60.19\% \\
         Nearest Neighbors &      0.51 &   53.42\% &    50.58\% & 50.86\% \\
               Naive Bayes &      0.48

Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
9,GradientBoostingClassifier,0.78,80.25%,78.46%,77.8%
6,AdaBoost,0.75,77.52%,75.5%,75.06%
2,Gaussian Process,0.72,74.66%,72.57%,71.37%
10,LinearDiscriminantAnalysis,0.71,73.66%,71.32%,71.04%
1,SVM,0.7,74.16%,72.28%,69.9%
5,Neural Net,0.69,71.8%,70.73%,67.97%
8,QDA,0.67,70.68%,68.38%,66.39%
3,Decision Tree,0.65,67.33%,65.03%,65.15%
4,Random Forest,0.61,66.34%,67.59%,60.11%
11,ExtraTreesClassifier,0.6,62.73%,60.21%,60.19%


In [10]:
from sklearn.inspection import permutation_importance
my_model = AdaBoostClassifier().fit(X_train, y_train)
r = permutation_importance(my_model, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{feature_set.columns[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

ARivalsHGS50.170 +/- 0.012
ARivalsAGS50.156 +/- 0.011
AWins5  0.066 +/- 0.011
Draws5  0.053 +/- 0.010
HWins5  0.045 +/- 0.007
AAGS5   0.030 +/- 0.007
AHGS5   0.028 +/- 0.008


In [11]:
#Read all datasets .csv files into one pandas dataframe
path = r'/content/gdrive/My Drive/Datasets/' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

#Filter out the betting stats columns for now
#Filter out the in games stats like fouls and cards for now also
df = frame.loc[: , :"Referee"]

#Encode the dataframe (possibly should just do separate labelencoders per string column)
le = preprocessing.LabelEncoder()
df[["HomeTeam", "AwayTeam"]] = df[["HomeTeam", "AwayTeam"]].apply(le.fit_transform)

individual_team_df = df.copy()

home_goals = []
away_goals = []
rivals_home = []
rivals_away = []

for row_num, row in individual_team_df.iterrows():
  #calculate the rolling sum of the last 5 games to assess home and away scoring tendencies
  individual_team_df["HGS5"] = individual_team_df.loc[individual_team_df['HomeTeam'] == row['HomeTeam']]["FTHG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["AGS5"] = individual_team_df.loc[individual_team_df['AwayTeam'] == row['AwayTeam']]["FTAG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["RivalsHGS5"] = individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) & (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTHG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["RivalsAGS5"] = individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) & (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTAG"].rolling(min_periods=1, window=5).sum()

  home_goals.append(individual_team_df.loc[row_num,"HGS5"])
  away_goals.append(individual_team_df.loc[row_num,"AGS5"])
  rivals_home.append(individual_team_df.loc[row_num,"RivalsHGS5"])
  rivals_away.append(individual_team_df.loc[row_num,"RivalsAGS5"])

individual_team_df['HGS5'] = home_goals
individual_team_df['AGS5'] = away_goals
individual_team_df['RivalsHGS5'] = rivals_home
individual_team_df['RivalsAGS5'] = rivals_away

#Calculate the average goals scored by home and away in their last 5 games aswell as the last 5 times they played eachother
individual_team_df["AHGS5"] = individual_team_df["HGS5"]/5
individual_team_df["AAGS5"] = individual_team_df["AGS5"]/5
individual_team_df["ARivalsHGS5"] = individual_team_df["RivalsHGS5"]/5
individual_team_df["ARivalsAGS5"] = individual_team_df["RivalsAGS5"]/5

#Calculate percentage home wins, draws and away wins in the last 5 games
individual_team_df["HWins5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["H"]/5
individual_team_df["AWins5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["A"]/5
individual_team_df["Draws5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["D"]/5

#Drop any rows that have a NaN value in it.
feature_set = individual_team_df.rename(columns={"FTR": "Result"})
feature_set = feature_set[["AHGS5", "AAGS5", "ARivalsHGS5","ARivalsAGS5", "HWins5","AWins5","Draws5", "Result"]]
feature_set = feature_set.dropna()
y = np.array(feature_set.loc[:, "Result"])
X = np.array(feature_set.loc[:,:"Draws5"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# models to test
names = [
    "Nearest Neighbors",
    "SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "GradientBoostingClassifier",
    "LinearDiscriminantAnalysis",
    "ExtraTreesClassifier",
    "BernoulliNB"
]

#Parameters found by extracting best_params_ after GridSearchCV()
classifiers = [
    KNeighborsClassifier(),
    SVC(C=100, gamma=0.0001, kernel = 'rbf'),
    GaussianProcessClassifier(kernel = 1**2 * Matern(length_scale=1, nu=1.5)),
    DecisionTreeClassifier(criterion = 'gini', max_depth=None),
    RandomForestClassifier(criterion = 'entropy', max_depth=3, n_estimators=10, max_features='sqrt'),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(var_smoothing = 0.01),
    QuadraticDiscriminantAnalysis(reg_param = 0.1),
    GradientBoostingClassifier(),
    LinearDiscriminantAnalysis(),
    ExtraTreeClassifier(),
    BernoulliNB()
]

f1_scores = []
accuracies = []
precisions = []
recalls = []
for name, clf in zip(names, classifiers):
  clf.fit(X_train, y_train)
  prediction = clf.predict(X_test)
  f1_scores.append("{}".format(round(f1_score(y_test, prediction, average="macro"),2)))
  accuracies.append("{}%".format(round(accuracy_score(y_test, prediction)*100,2)))
  precisions.append("{}%".format(round(precision_score(y_test, prediction, average="macro")*100,2)))
  recalls.append("{}%".format(round(recall_score(y_test, prediction, average="macro")*100,2)))
scores_dataframe = pd.DataFrame({'Classifiers': names, 'F1-Scores': f1_scores, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls})
scores_dataframe = scores_dataframe.sort_values(by=["F1-Scores"], ascending = False)
print(scores_dataframe.to_latex(index=False))
display(scores_dataframe) 


\begin{tabular}{lllll}
\toprule
               Classifiers & F1-Scores & Accuracy & Precision & Recall \\
\midrule
GradientBoostingClassifier &      0.76 &   78.26\% &    76.61\% & 76.06\% \\
          Gaussian Process &      0.75 &   77.27\% &    75.68\% &  74.7\% \\
                  AdaBoost &      0.74 &    76.4\% &    74.66\% & 74.38\% \\
                Neural Net &      0.71 &   74.66\% &    73.23\% &  71.3\% \\
         Nearest Neighbors &       0.7 &    72.3\% &    70.04\% & 69.87\% \\
LinearDiscriminantAnalysis &       0.7 &    72.3\% &    70.18\% & 69.42\% \\
             Decision Tree &      0.68 &   69.57\% &    67.58\% &  68.1\% \\
               Naive Bayes &      0.68 &   70.56\% &    68.61\% & 68.06\% \\
                       SVM &      0.67 &   72.05\% &    71.25\% & 67.47\% \\
                       QDA &      0.66 &   70.68\% &    68.79\% & 66.19\% \\
      ExtraTreesClassifier &      0.62 &   64.97\% &    62.23\% & 62.44\% \\
             Random Forest &       0.6

Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
9,GradientBoostingClassifier,0.76,78.26%,76.61%,76.06%
2,Gaussian Process,0.75,77.27%,75.68%,74.7%
6,AdaBoost,0.74,76.4%,74.66%,74.38%
5,Neural Net,0.71,74.66%,73.23%,71.3%
0,Nearest Neighbors,0.7,72.3%,70.04%,69.87%
10,LinearDiscriminantAnalysis,0.7,72.3%,70.18%,69.42%
3,Decision Tree,0.68,69.57%,67.58%,68.1%
7,Naive Bayes,0.68,70.56%,68.61%,68.06%
1,SVM,0.67,72.05%,71.25%,67.47%
8,QDA,0.66,70.68%,68.79%,66.19%


We can remove HomeTeam and AwayTeam Features from the feature set based on this feature importance. When we do this I see an increase in performance in all models. AdaBoostClassifier is still the best achieving an accuracy score of ~76% and an F1-score increases from 0.69 to ~0.73. The GaussianProcessClassifier achieves an accuracy score of ~77% and also increases it's F1-score to ~0.75. MLPClassifier also saw an increase to 0.73 for F1-Score and ~77% for accuracy score.


Other models to note massive improvement in after this feature selection step is the KNearestNeighbor model and the Naive Bayes model where we see an increase of ~0.12 and ~0.19 in their respective F1-scores to. While the rest of the models show an increase of ~0.05 in F1-score.

**Below is commented out code if I want to change the feature calculation of Home Wins, Away Wins and Draws. After testing I get more or less the same results as without this and using the above code instead.**

In [None]:
"""for row_num, row in individual_team_df.iterrows():
  #calculate the rolling sum of the last 5 games to assess home and away scoring tendencies
  individual_team_df["HGS5"] = individual_team_df.loc[individual_team_df['HomeTeam'] == row['HomeTeam']]["FTHG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["AGS5"] = individual_team_df.loc[individual_team_df['AwayTeam'] == row['AwayTeam']]["FTAG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["RivalsHGS5"] = individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) & (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTHG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["RivalsAGS5"] = individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) & (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTAG"].rolling(min_periods=1, window=5).sum()
  
  #Calculate wins for home team while playing at home, wins for away team while playing away and draws in the last 5 games
  home_away_results = pd.get_dummies(individual_team_df.loc[individual_team_df['HomeTeam'] == row['HomeTeam']]["FTR"]).rolling(min_periods=1, window=5).sum()
  draw_results = pd.get_dummies(individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) | (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTR"]).rolling(min_periods=1, window=5).sum()
  if "H" in home_away_results.keys():
    individual_team_df["HWins5"] = pd.get_dummies(individual_team_df.loc[individual_team_df['HomeTeam'] == row['HomeTeam']]["FTR"]).rolling(min_periods=1, window=5).sum()["H"]/5
  else:
    individual_team_df["HWins5"] = 0
  if "A" in home_away_results.keys():
    individual_team_df["AWins5"] = pd.get_dummies(individual_team_df.loc[individual_team_df['AwayTeam'] == row['AwayTeam']]["FTR"]).rolling(min_periods=1, window=5).sum()["A"]/5
  else:
    individual_team_df["AWins5"] = 0
  if "D" in draw_results.keys():
    individual_team_df["Draws5"] = pd.get_dummies(individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) | (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTR"]).rolling(min_periods=1, window=5).sum()["D"]/5
  else:
    individual_team_df["Draws5"] = 0
  
  home_goals.append(individual_team_df.loc[row_num,"HGS5"])
  away_goals.append(individual_team_df.loc[row_num,"AGS5"])
  rivals_home.append(individual_team_df.loc[row_num,"RivalsHGS5"])
  rivals_away.append(individual_team_df.loc[row_num,"RivalsAGS5"])
  home_wins.append(individual_team_df.loc[row_num,"HWins5"])
  away_wins.append(individual_team_df.loc[row_num,"AWins5"])
  draws.append(individual_team_df.loc[row_num,"Draws5"])

individual_team_df['HGS5'] = home_goals
individual_team_df['AGS5'] = away_goals
individual_team_df['RivalsHGS5'] = rivals_home
individual_team_df['RivalsAGS5'] = rivals_away
individual_team_df['HWins5'] = home_wins
individual_team_df['AWins5'] = away_wins
individual_team_df['Draws5'] = draws"""