In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import os
os.chdir("/content/gdrive/My Drive/")

In [None]:
import pandas as pd
import numpy as np
import glob
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import re
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report, recall_score
import math

from sklearn.inspection import permutation_importance


In [None]:
#Read all datasets .csv files into one pandas dataframe
path = r'/content/gdrive/My Drive/Datasets/' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

#Filter out the betting stats columns for now
#Filter out the in games stats like fouls and cards for now also
df = frame.loc[: , :"Referee"]

#If i want to create a dataframe for a specific team this is how id access that.
"""
  df = df.where((df["HomeTeam"] == "Liverpool") | (df["AwayTeam"] == "Liverpool"))
  df = df.dropna()

  #How to calculate the rolling count of liverpool wins, loses and draws while playing at home and while playing away
  print(pd.get_dummies(df["FTR"].where(df["HomeTeam"] == "Liverpool")).rolling(min_periods=1, window=10).sum())
  print(pd.get_dummies(df["FTR"].where(df["AwayTeam"] == "Liverpool")).rolling(min_periods=1, window=10).sum())
"""


#calculate the rolling sum of the last 5 games to assess home and away scoring tendencies
df["HGS5"] = df["FTHG"].rolling(min_periods=1, window=5).sum()
df["AGS5"] = df["FTAG"].rolling(min_periods=1, window=5).sum()

#Calculate the average goals scored by home and away 
df["AHGS5"] = df["HGS5"]/5
df["AAGS5"] = df["AGS5"]/5

#Calculate number of wins for a home team, wins for the away team and draws in the last 10 games
df["HWins10"] = pd.get_dummies(df["FTR"]).rolling(min_periods=1, window=10).sum()["H"]
df["AWins10"] = pd.get_dummies(df["FTR"]).rolling(min_periods=1, window=10).sum()["A"]
df["Draws10"] = pd.get_dummies(df["FTR"]).rolling(min_periods=1, window=10).sum()["D"]

#Calculate day of the week from data
days_list = []
for i in range(len(df)):
  days_list.append(pd.to_datetime(df.loc[i, "Date"]).weekday())
df["Day"] = days_list

feature_set = df.drop(columns = ['Div', 'Date', 'FTHG', 'FTAG','HTHG', 'HTAG', 'HTR'])
feature_set = feature_set.rename(columns={"Time": "KickOffTime", "FTR": "Result"})

#Drop any rows that have a NaN value in it.
feature_set = feature_set[["Day","HWins10","AWins10", "AHGS5", "AAGS5","Draws10", "Result"]] #"AwayTeam", "Referee","HGS5", "AGS5"
feature_set = feature_set.dropna()




  
 

In [None]:
y = np.array(feature_set.loc[:, "Result"])
X = np.array(feature_set.loc[:,:"Draws10"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


In [None]:
# models to test
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    #"Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(15),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=3),
    RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]
f1_scores = []
accuracies = []
precisions = []
recalls = []
for name, clf in zip(names, classifiers):
  clf.fit(X_train, y_train)
  prediction = clf.predict(X_test)
  f1_scores.append("{}".format(round(f1_score(y_test, prediction, average="weighted"),2)))
  accuracies.append("{}%".format(round(accuracy_score(y_test, prediction)*100,2)))
  precisions.append("{}%".format(round(precision_score(y_test, prediction, average="macro")*100,2)))
  recalls.append("{}%".format(round(recall_score(y_test, prediction, average="macro")*100,2)))
scores_dataframe = pd.DataFrame({'Classifiers': names, 'F1-Scores': f1_scores, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls})
display(scores_dataframe)
print(scores_dataframe.to_latex(index=False))

Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.48,49.2%,47.2%,45.39%
1,Linear SVM,0.51,52.68%,51.27%,47.81%
2,RBF SVM,0.45,47.02%,45.23%,42.15%
3,Decision Tree,0.46,49.11%,48.78%,43.73%
4,Random Forest,0.47,51.59%,49.68%,43.92%
5,Neural Net,0.51,53.48%,52.26%,47.92%
6,AdaBoost,0.52,53.18%,51.76%,48.97%
7,Naive Bayes,0.53,53.38%,51.51%,50.21%
8,QDA,0.29,45.43%,70.64%,33.83%


\begin{tabular}{lllll}
\toprule
      Classifiers & F1-Scores & Accuracy & Precision & Recall \\
\midrule
Nearest Neighbors &      0.48 &    49.2\% &     47.2\% & 45.39\% \\
       Linear SVM &      0.51 &   52.68\% &    51.27\% & 47.81\% \\
          RBF SVM &      0.45 &   47.02\% &    45.23\% & 42.15\% \\
    Decision Tree &      0.46 &   49.11\% &    48.78\% & 43.73\% \\
    Random Forest &      0.47 &   51.59\% &    49.68\% & 43.92\% \\
       Neural Net &      0.51 &   53.48\% &    52.26\% & 47.92\% \\
         AdaBoost &      0.52 &   53.18\% &    51.76\% & 48.97\% \\
      Naive Bayes &      0.53 &   53.38\% &    51.51\% & 50.21\% \\
              QDA &      0.29 &   45.43\% &    70.64\% & 33.83\% \\
\bottomrule
\end{tabular}



In [None]:
#Edit the below code from sklearn.org to calcualte feature importance from my current feature, repeat this a number of times to achieve a better set of features

my_model = SVC(kernel="linear", C=0.025).fit(X_train, y_train)

#Calculate the feature importance for each feature in our feature set
from sklearn.inspection import permutation_importance
result = permutation_importance(my_model, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

#Loop through the result set
for i in r.importances_mean.argsort()[::-1]:
  #Filter out features that their permutation importance did not pass a certain threshold
  #Print out the feature name, feature's permutation importance score and the standard deviation per feature.
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{feature_set.columns[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

AHGS5   0.059 +/- 0.014
AAGS5   0.046 +/- 0.010
Draws10 0.039 +/- 0.010


In [None]:
"""
    To do:
          Increase the list of classifiers to check.
          Rinse and repeat till we have a list of ~10 classifiers with realistic accuracies.
          GridSearchCv for each of classifiers and find the best parameters to run them on
          Look up ways to inversely weight older seasons vs newer seasons e.g a season from 3 years ago will be multiplied by 1/3 and so on.
"""