In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
import os
os.chdir("/content/gdrive/My Drive/")

In [4]:
import pandas as pd
import numpy as np
import glob
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
import re
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report, recall_score
import math

from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

#Variable to dictate what data should be considered for example 2021 would denote the 2021/2022 season and 2017 would denote the 2017/2018 season.
currentSeason = 2021


Create a dataset that each row calculates per team rather than just home and away e.g one big dataset but if home team = liverpool HWins10 will be the number of home wins for liverpool in the last 10.

In [9]:
#Read all datasets .csv files into one pandas dataframe
path = r'/content/gdrive/My Drive/Datasets/' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
seasonCount = 0
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
    seasonCount += 1

frame = pd.concat(li, axis=0, ignore_index=True)

#Filter out the betting stats columns for now
#Filter out the in games stats like fouls and cards for now also
df = frame.loc[: , :"Referee"]

#Encode the dataframe (possibly should just do separate labelencoders per string column)
le = preprocessing.LabelEncoder()
df[["HomeTeam", "AwayTeam"]] = df[["HomeTeam", "AwayTeam"]].apply(le.fit_transform)

#Keep only the seasons before the year we have provided as currentSeason
df = df.loc[df['Year'] <= currentSeason]

#Inversely weighting older seasons by putting 1/(currentSeason - (season of the instance + 1)) 
#therefore if currentSeason was 2021 and the instance year was 2019 the weighting would be 1/(2021-2019+1) = 1/3.
#df['Weighting'] = 1 / (currentSeason - df['Year'] + 1)
df['Weighting'] = (seasonCount - (currentSeason - df['Year']))/seasonCount
individual_team_df = df.copy()

home_goals = []
away_goals = []
rivals_home = []
rivals_away = []

for row_num, row in individual_team_df.iterrows():
  #calculate the rolling sum of the last 5 games to assess home and away scoring tendencies
  individual_team_df["HGS5"] = individual_team_df.loc[individual_team_df['HomeTeam'] == row['HomeTeam']]["FTHG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["AGS5"] = individual_team_df.loc[individual_team_df['AwayTeam'] == row['AwayTeam']]["FTAG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["RivalsHGS5"] = individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) & (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTHG"].rolling(min_periods=1, window=5).sum()
  individual_team_df["RivalsAGS5"] = individual_team_df.loc[(individual_team_df['AwayTeam'] == row['AwayTeam']) & (individual_team_df['HomeTeam'] == row['HomeTeam'])]["FTAG"].rolling(min_periods=1, window=5).sum()

  home_goals.append(individual_team_df.loc[row_num,"HGS5"])
  away_goals.append(individual_team_df.loc[row_num,"AGS5"])
  rivals_home.append(individual_team_df.loc[row_num,"RivalsHGS5"])
  rivals_away.append(individual_team_df.loc[row_num,"RivalsAGS5"])

individual_team_df['HGS5'] = home_goals
individual_team_df['AGS5'] = away_goals
individual_team_df['RivalsHGS5'] = rivals_home
individual_team_df['RivalsAGS5'] = rivals_away

#Calculate the average goals scored by home and away in their last 5 games aswell as the last 5 times they played eachother
individual_team_df["AHGS5"] = individual_team_df["HGS5"]/5
individual_team_df["AAGS5"] = individual_team_df["AGS5"]/5
individual_team_df["ARivalsHGS5"] = individual_team_df["RivalsHGS5"]/5
individual_team_df["ARivalsAGS5"] = individual_team_df["RivalsAGS5"]/5

#Calculate percentage home wins, draws and away wins in the last 5 games
individual_team_df["HWins5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["H"]/5
individual_team_df["AWins5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["A"]/5
individual_team_df["Draws5"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=5).sum()["D"]/5

#Drop any rows that have a NaN value in it.
feature_set = individual_team_df.rename(columns={"FTR": "Result"})

weights = feature_set['Weighting']
feature_set = feature_set[["HomeTeam", "AwayTeam", "ARivalsHGS5","ARivalsAGS5", "HWins5","AWins5","Draws5", "Result"]]

#Scale the feature data
scaler = MinMaxScaler()
scaled = scaler.fit_transform(feature_set.loc[:,:"Draws5"])

#Don't use the weightings they reduce the performance of all models significantly
#feature_set.loc[:,:"Draws5"] = feature_set.loc[:,:"Draws5"].multiply(weights,axis=0)
feature_set = feature_set.dropna()

y = np.array(feature_set.loc[:, "Result"])
X = np.array(feature_set.loc[:,:"Draws5"])

"""test_size = round(len(X)*0.2)
X_test = X[-test_size:]
y_test = y[-test_size:]
X_train = X[:-test_size]
y_train = y[:-test_size]"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# models to test
names = [
    "Nearest Neighbors",
    "SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "GradientBoostingClassifier",
    "LinearDiscriminantAnalysis",
    "ExtraTreesClassifier",
    "BernoulliNB"
]

#Parameters found by extracting best_params_ after GridSearchCV()
classifiers = [
    KNeighborsClassifier(),
    SVC(C=100, gamma=0.0001, kernel = 'rbf'),
    GaussianProcessClassifier(kernel = 1**2 * Matern(length_scale=1, nu=1.5)),
    DecisionTreeClassifier(criterion = 'gini', max_depth=None),
    RandomForestClassifier(criterion = 'entropy', max_depth=3, n_estimators=10, max_features='sqrt'),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(var_smoothing = 0.01),
    QuadraticDiscriminantAnalysis(reg_param = 0.1),
    GradientBoostingClassifier(),
    LinearDiscriminantAnalysis(),
    ExtraTreeClassifier(),
    BernoulliNB()
]

f1_scores = []
accuracies = []
precisions = []
recalls = []
for name, clf in zip(names, classifiers):
  clf.fit(X_train, y_train)
  prediction = clf.predict(X_test)
  f1_scores.append("{}".format(round(f1_score(y_test, prediction, average="weighted"),2)))
  accuracies.append("{}%".format(round(accuracy_score(y_test, prediction)*100,2)))
  precisions.append("{}%".format(round(precision_score(y_test, prediction, average="macro")*100,2)))
  recalls.append("{}%".format(round(recall_score(y_test, prediction, average="macro")*100,2)))
scores_dataframe = pd.DataFrame({'Classifiers': names, 'F1-Scores': f1_scores, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls})
display(scores_dataframe) 
print(scores_dataframe.to_latex(index=False))


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.5,50.07%,47.63%,47.85%
1,SVM,0.7,71.6%,70.17%,67.3%
2,Gaussian Process,0.67,69.96%,67.72%,64.48%
3,Decision Tree,0.68,68.31%,66.43%,66.23%
4,Random Forest,0.68,69.41%,68.39%,64.93%
5,Neural Net,0.72,72.84%,70.69%,69.43%
6,AdaBoost,0.75,75.03%,73.31%,72.96%
7,Naive Bayes,0.46,55.42%,39.92%,45.72%
8,QDA,0.68,69.41%,67.58%,65.15%
9,GradientBoostingClassifier,0.77,76.82%,75.22%,74.53%


\begin{tabular}{lllll}
\toprule
               Classifiers & F1-Scores & Accuracy & Precision & Recall \\
\midrule
         Nearest Neighbors &       0.5 &   50.07\% &    47.63\% & 47.85\% \\
                       SVM &       0.7 &    71.6\% &    70.17\% &  67.3\% \\
          Gaussian Process &      0.67 &   69.96\% &    67.72\% & 64.48\% \\
             Decision Tree &      0.68 &   68.31\% &    66.43\% & 66.23\% \\
             Random Forest &      0.68 &   69.41\% &    68.39\% & 64.93\% \\
                Neural Net &      0.72 &   72.84\% &    70.69\% & 69.43\% \\
                  AdaBoost &      0.75 &   75.03\% &    73.31\% & 72.96\% \\
               Naive Bayes &      0.46 &   55.42\% &    39.92\% & 45.72\% \\
                       QDA &      0.68 &   69.41\% &    67.58\% & 65.15\% \\
GradientBoostingClassifier &      0.77 &   76.82\% &    75.22\% & 74.53\% \\
LinearDiscriminantAnalysis &      0.69 &   68.59\% &    66.22\% & 66.17\% \\
      ExtraTreesClassifier &      0.62

In [7]:
from sklearn.inspection import permutation_importance
my_model = AdaBoostClassifier().fit(X_train, y_train)
r = permutation_importance(my_model, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{feature_set.columns[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

ARivalsAGS50.217 +/- 0.014
ARivalsHGS50.193 +/- 0.014
Draws5  0.074 +/- 0.010
HWins5  0.072 +/- 0.011
AWins5  0.022 +/- 0.009
