In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
import os
os.chdir("/content/gdrive/My Drive/")

In [3]:
import pandas as pd
import numpy as np
import glob
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report, recall_score
import math

from sklearn.inspection import permutation_importance


In [13]:
#Read all datasets .csv files into one pandas dataframe
path = r'/content/gdrive/My Drive/Datasets/' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

#Filter out the betting stats columns for now
#Filter out the in games stats like fouls and cards for now also
df = frame.loc[: , :"Referee"]

#If i want to create a dataframe for a specific team this is how id access that.
"""
  df = df.where((df["HomeTeam"] == "Liverpool") | (df["AwayTeam"] == "Liverpool"))
  df = df.dropna()

  #How to calculate the rolling count of liverpool wins, loses and draws while playing at home and while playing away
  print(pd.get_dummies(df["FTR"].where(df["HomeTeam"] == "Liverpool")).rolling(min_periods=1, window=10).sum())
  print(pd.get_dummies(df["FTR"].where(df["AwayTeam"] == "Liverpool")).rolling(min_periods=1, window=10).sum())
"""


#calculate the rolling sum of the last 5 games to assess home and away scoring tendencies
df["HGS5"] = df["FTHG"].rolling(min_periods=1, window=5).sum()
df["AGS5"] = df["FTAG"].rolling(min_periods=1, window=5).sum()

#Calculate the average goals scored by home and away 
df["AHGS5"] = df["HGS5"]/5
df["AAGS5"] = df["AGS5"]/5

#Calculate the percentage of wins for a home team, wins for the away team and draws in the last 10 games
df["HWins10"] = pd.get_dummies(df["FTR"]).rolling(min_periods=1, window=10).sum()["H"]
df["AWins10"] = pd.get_dummies(df["FTR"]).rolling(min_periods=1, window=10).sum()["A"]
df["Draws10"] = pd.get_dummies(df["FTR"]).rolling(min_periods=1, window=10).sum()["D"]

#Calculate day of the week from data
days_list = []
for i in range(len(df)):
  days_list.append(pd.to_datetime(df.loc[i, "Date"]).weekday())
df["Day"] = days_list

feature_set = df.drop(columns = ['Div', 'Date', 'FTHG', 'FTAG','HTHG', 'HTAG', 'HTR'])
feature_set = feature_set.rename(columns={"Time": "KickOffTime", "FTR": "Result"})

#Drop any rows that have a NaN value in it.
feature_set = feature_set[["AHGS5", "AAGS5","HWins10","AWins10", "Result"]] #"Day", "HomeTeam","AwayTeam", "Referee","HGS5", "AGS5"
feature_set = feature_set.dropna()




  
 

In [18]:
y = np.array(feature_set.loc[:, "Result"])
X = np.array(feature_set.loc[:,:"AWins10"])

scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(scaled, y, test_size=0.25)


[[0.47368421 0.47058824 0.3        0.375     ]
 [0.31578947 0.35294118 0.3        0.5       ]
 [0.47368421 0.17647059 0.5        0.5       ]
 ...
 [0.47368421 0.47058824 0.6        0.375     ]
 [0.26315789 0.23529412 0.6        0.25      ]
 [0.63157895 0.11764706 0.8        0.125     ]]


In [16]:
# models to test
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(15),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=3),
    RandomForestClassifier(max_depth=3, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]
f1_scores = []
accuracies = []
precisions = []
recalls = []
for name, clf in zip(names, classifiers):
  clf.fit(X_train, y_train)
  prediction = clf.predict(X_test)
  f1_scores.append("{}".format(round(f1_score(y_test, prediction, average="micro"),2)))
  accuracies.append("{}%".format(round(accuracy_score(y_test, prediction)*100,2)))
  precisions.append("{}%".format(round(precision_score(y_test, prediction, average="micro")*100,2)))
  recalls.append("{}%".format(round(recall_score(y_test, prediction, average="micro")*100,2)))
scores_dataframe = pd.DataFrame({'Classifiers': names, 'F1-Scores': f1_scores, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls})
display(scores_dataframe)

Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.52,52.19%,52.19%,52.19%
1,Linear SVM,0.53,53.08%,53.08%,53.08%
2,RBF SVM,0.55,55.07%,55.07%,55.07%
3,Gaussian Process,0.56,55.67%,55.67%,55.67%
4,Decision Tree,0.51,50.99%,50.99%,50.99%
5,Random Forest,0.54,54.17%,54.17%,54.17%
6,Neural Net,0.55,55.17%,55.17%,55.17%
7,AdaBoost,0.54,53.78%,53.78%,53.78%
8,Naive Bayes,0.54,53.58%,53.58%,53.58%
9,QDA,0.55,54.67%,54.67%,54.67%


In [20]:
#Edit the below code from sklearn.org to calcualte feature importance from my current feature, repeat this a number of times to achieve a better set of features

my_model = SVC(kernel="linear", C=0.025).fit(X_train, y_train)
from sklearn.inspection import permutation_importance
r = permutation_importance(my_model, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

for i in r.importances_mean.argsort()[::-1]:
  print(f"{feature_set.columns[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

AWins10 0.025 +/- 0.006
AHGS5   0.020 +/- 0.005
HWins10 0.018 +/- 0.005
AAGS5   0.017 +/- 0.007


In [None]:
"""
    To do:
          GridSearchCv for each of classifiers and find the best parameters to run them on
          Look up ways to inversely weight older seasons vs newer seasons e.g a season from 3 years ago will be multiplied by 1/3 and so on.
"""