In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import os
os.chdir("/content/gdrive/My Drive/")

In [None]:
import pandas as pd
import numpy as np
import glob
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import re
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report, recall_score
import math

from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel

from sklearn.compose import make_column_transformer

import warnings
warnings.filterwarnings('ignore')


In [None]:
#Read all datasets .csv files into one pandas dataframe
path = r'/content/gdrive/My Drive/Datasets/' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)

#Filter out the betting stats columns for now
#Filter out the in games stats like fouls and cards for now also
df = frame.loc[: , :"Referee"]

#Encode the dataframe (possibly should just do separate labelencoders per string column)
le = preprocessing.LabelEncoder()
df[["HomeTeam", "AwayTeam"]] = df[["HomeTeam", "AwayTeam"]].apply(le.fit_transform)

#Withold sample of dataset for testing
test_subset = df.sample(round(len(df)*0.15))
test_subsets = []
df = df.drop(test_subset.index)

teams_list = df.HomeTeam.unique()
fitted_classifiers = {}
for team in teams_list:
  if not pd.isna(team):
    individual_team_df = df[(df["HomeTeam"]==team) | (df["AwayTeam"]==team)]
    individual_team_test_df = test_subset[(test_subset["HomeTeam"]==team) | (test_subset["AwayTeam"]==team)]

    #calculate the rolling sum of the last 5 games to assess home and away scoring tendencies
    individual_team_df["HGS5"] = individual_team_df["FTHG"].rolling(min_periods=1, window=5).sum()
    individual_team_df["AGS5"] = individual_team_df["FTAG"].rolling(min_periods=1, window=5).sum()

    individual_team_test_df["HGS5"] = individual_team_test_df["FTHG"].rolling(min_periods=1, window=5).sum()
    individual_team_test_df["AGS5"] = individual_team_test_df["FTAG"].rolling(min_periods=1, window=5).sum()

    #Calculate the average goals scored by home and away 
    individual_team_df["AHGS5"] = individual_team_df["HGS5"]/5
    individual_team_df["AAGS5"] = individual_team_df["AGS5"]/5

    individual_team_test_df["AHGS5"] = individual_team_test_df["HGS5"]/5
    individual_team_test_df["AAGS5"] = individual_team_test_df["AGS5"]/5

    #Calculate number of wins for a home team, wins for the away team and draws in the last 10 games
    individual_team_df = individual_team_df.dropna(thresh=10)
    if len(individual_team_df) > 0:
      individual_team_df["HWins10"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=10).sum()["H"]
      individual_team_df["AWins10"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=10).sum()["A"]
      individual_team_df["Draws10"] = pd.get_dummies(individual_team_df["FTR"]).rolling(min_periods=1, window=10).sum()["D"]

      results_sum = pd.get_dummies(individual_team_test_df["FTR"]).rolling(min_periods=1, window=10).sum()
      if "H" in results_sum:
        individual_team_test_df["HWins10"] = results_sum["H"]
      else:
        individual_team_test_df["HWins10"] = 0
      if "A" in results_sum:
        individual_team_test_df["AWins10"] = results_sum["A"]
      else:
        individual_team_test_df["AWins10"] = 0
      if "D" in results_sum:
        individual_team_test_df["Draws10"] = results_sum["D"]
      else:
        individual_team_test_df["Draws10"] = 0

      feature_set = individual_team_df.drop(columns = ['Div', 'Date', 'FTHG', 'FTAG','HTHG', 'HTAG', 'HTR'])
      feature_set = feature_set.rename(columns={"Time": "KickOffTime", "FTR": "Result"})

      test_feature_set = individual_team_test_df.drop(columns = ['Div', 'Date', 'FTHG', 'FTAG','HTHG', 'HTAG', 'HTR'])
      test_feature_set = test_feature_set.rename(columns={"Time": "KickOffTime", "FTR": "Result"})

      #Drop any rows that have a NaN value in it.
      feature_set = feature_set[["HomeTeam", "AwayTeam", "AHGS5", "AAGS5", "HWins10","AWins10","Draws10", "Result"]] #"HGS5", "AGS5"
      feature_set = feature_set.dropna()

      test_feature_set = test_feature_set[["HomeTeam", "AwayTeam", "AHGS5", "AAGS5", "HWins10","AWins10","Draws10", "Result"]] #"HGS5", "AGS5"
      test_feature_set = test_feature_set.dropna()

      test_subsets.append(test_feature_set)

      y = np.array(feature_set.loc[:, "Result"])
      X = np.array(feature_set.loc[:,:"Draws10"])

      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

      # models to test
      names = [
          "Nearest Neighbors",
          "SVM",
          "Gaussian Process",
          "Decision Tree",
          "Random Forest",
          "Neural Net",
          "AdaBoost",
          "Naive Bayes",
          "QDA",
      ]

      #Parameters found by extracting best_params_ after GridSearchCV()
      classifiers = [
          KNeighborsClassifier(10),
          SVC(C=100, gamma=0.0001, kernel = 'rbf'),
          GaussianProcessClassifier(kernel = 1**2 * Matern(length_scale=1, nu=1.5)),
          DecisionTreeClassifier(criterion = 'gini', max_depth=3),
          RandomForestClassifier(criterion = 'entropy', max_depth=3, n_estimators=10, max_features='sqrt'),
          MLPClassifier(alpha=1, max_iter=1000),
          AdaBoostClassifier(),
          GaussianNB(var_smoothing = 0.01),
          QuadraticDiscriminantAnalysis(reg_param = 0.1),
      ]
      f1_scores = []
      accuracies = []
      precisions = []
      recalls = []
      best_classifier = 0
      for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        prediction = clf.predict(X_test)
        f1_scores.append("{}".format(round(f1_score(y_test, prediction, average="micro"),2)))
        accuracies.append("{}%".format(round(accuracy_score(y_test, prediction)*100,2)))
        precisions.append("{}%".format(round(precision_score(y_test, prediction, average="micro")*100,2)))
        recalls.append("{}%".format(round(recall_score(y_test, prediction, average="micro")*100,2)))
        if round(accuracy_score(y_test, prediction)*100,2) > best_classifier:
          best_classifier = round(accuracy_score(y_test, prediction)*100,2)
          fitted_classifiers[team] = clf
      scores_dataframe = pd.DataFrame({'Classifiers': names, 'F1-Scores': f1_scores, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls})
      scores_dataframe = scores_dataframe[scores_dataframe['Accuracy']==scores_dataframe['Accuracy'].max()]
      print("*************** {} ***************".format(team))
      display(scores_dataframe) 
print(fitted_classifiers)

*************** 12 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.56,56.25%,56.25%,56.25%


*************** 10 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.58,58.18%,58.18%,58.18%


*************** 17 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
4,Random Forest,0.65,64.71%,64.71%,64.71%


*************** 32 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.55,54.72%,54.72%,54.72%


*************** 30 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.57,57.35%,57.35%,57.35%


*************** 6 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
8,QDA,0.7,70.0%,70.0%,70.0%


*************** 25 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.71,71.43%,71.43%,71.43%


*************** 11 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.55,55.22%,55.22%,55.22%


*************** 15 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
3,Decision Tree,0.3,30.0%,30.0%,30.0%
4,Random Forest,0.3,30.0%,30.0%,30.0%


*************** 19 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
4,Random Forest,0.64,64.29%,64.29%,64.29%


*************** 0 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.65,64.71%,64.71%,64.71%


*************** 26 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
1,SVM,0.61,60.66%,60.66%,60.66%


*************** 21 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
8,QDA,0.57,57.14%,57.14%,57.14%


*************** 9 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
5,Neural Net,0.63,62.86%,62.86%,62.86%


*************** 16 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
8,QDA,0.53,53.06%,53.06%,53.06%


*************** 1 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.48,48.0%,48.0%,48.0%


*************** 35 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
5,Neural Net,0.5,50.0%,50.0%,50.0%


*************** 7 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.54,53.66%,53.66%,53.66%
8,QDA,0.54,53.66%,53.66%,53.66%


*************** 18 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
3,Decision Tree,0.74,73.91%,73.91%,73.91%
6,AdaBoost,0.74,73.91%,73.91%,73.91%


*************** 33 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
6,AdaBoost,0.51,50.79%,50.79%,50.79%


*************** 4 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.56,56.25%,56.25%,56.25%


*************** 31 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
2,Gaussian Process,0.51,51.35%,51.35%,51.35%
3,Decision Tree,0.51,51.35%,51.35%,51.35%


*************** 22 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.58,57.89%,57.89%,57.89%


*************** 13 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
4,Random Forest,0.62,61.54%,61.54%,61.54%
7,Naive Bayes,0.62,61.54%,61.54%,61.54%


*************** 8 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.64,64.29%,64.29%,64.29%


*************** 27 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.61,60.87%,60.87%,60.87%


*************** 29 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
7,Naive Bayes,0.59,58.7%,58.7%,58.7%


*************** 14 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
4,Random Forest,0.6,60.0%,60.0%,60.0%


*************** 20 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
6,AdaBoost,0.67,66.67%,66.67%,66.67%


*************** 28 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
1,SVM,0.55,54.76%,54.76%,54.76%


*************** 5 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.6,60.0%,60.0%,60.0%


*************** 23 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
0,Nearest Neighbors,0.6,60.0%,60.0%,60.0%
1,SVM,0.6,60.0%,60.0%,60.0%
3,Decision Tree,0.6,60.0%,60.0%,60.0%
4,Random Forest,0.6,60.0%,60.0%,60.0%
7,Naive Bayes,0.6,60.0%,60.0%,60.0%


*************** 34 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
2,Gaussian Process,0.38,38.46%,38.46%,38.46%


*************** 24 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
3,Decision Tree,0.57,57.14%,57.14%,57.14%
4,Random Forest,0.57,57.14%,57.14%,57.14%
6,AdaBoost,0.57,57.14%,57.14%,57.14%


*************** 2 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
3,Decision Tree,0.83,83.33%,83.33%,83.33%


*************** 3 ***************


Unnamed: 0,Classifiers,F1-Scores,Accuracy,Precision,Recall
6,AdaBoost,0.86,85.71%,85.71%,85.71%


{12: KNeighborsClassifier(n_neighbors=10), 10: GaussianNB(var_smoothing=0.01), 17: RandomForestClassifier(criterion='entropy', max_depth=3, max_features='sqrt',
                       n_estimators=10), 32: KNeighborsClassifier(n_neighbors=10), 30: GaussianNB(var_smoothing=0.01), 6: QuadraticDiscriminantAnalysis(reg_param=0.1), 25: GaussianNB(var_smoothing=0.01), 11: GaussianNB(var_smoothing=0.01), 15: DecisionTreeClassifier(max_depth=3), 19: RandomForestClassifier(criterion='entropy', max_depth=3, max_features='sqrt',
                       n_estimators=10), 0: GaussianNB(var_smoothing=0.01), 26: SVC(C=100, gamma=0.0001), 21: QuadraticDiscriminantAnalysis(reg_param=0.1), 9: MLPClassifier(alpha=1, max_iter=1000), 16: QuadraticDiscriminantAnalysis(reg_param=0.1), 1: KNeighborsClassifier(n_neighbors=10), 35: MLPClassifier(alpha=1, max_iter=1000), 7: GaussianNB(var_smoothing=0.01), 18: DecisionTreeClassifier(max_depth=3), 33: AdaBoostClassifier(), 4: GaussianNB(var_smoothing=0.01), 31: Gau

In [None]:
test_dataset = pd.concat(test_subsets, axis=0, ignore_index=True)

test_dataset = test_dataset.drop_duplicates()
y_test_subset = np.array(test_dataset.loc[:, "Result"])
X_test_subset = np.array(test_dataset.loc[:,:"Draws10"])

print(X_test_subset.shape)
print(y_test_subset.shape)

predictions = []
for record in X_test_subset:
  home_team = record[0]
  away_team = record[1]

  clf_home = fitted_classifiers[home_team]
  clf_away = fitted_classifiers[away_team]
  #print(clf_home.predict([record]))
  #print(clf_away.predict([record]))
  predictions.append(clf_home.predict([record]))

print("Result: {}%".format(round(accuracy_score(y_test_subset, predictions)*100,2)))


Output hidden; open in https://colab.research.google.com to view.

In [None]:
    best_params = {}

    parameters = {
        "Nearest Neighbors" : {'n_neighbors':list(range(1, 31))},
        "SVM" : {'C': [0.025, 0.5, 0.1, 1, 10, 100, 1000],'gamma': [2, 1, 0.1, 0.01, 0.001, 0.0001],'kernel': ['rbf', 'linear']},
        "Gaussian Process" : {'kernel': [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()]},
        "Decision Tree" : {'criterion':['gini','entropy'],'max_depth':[3,4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]},
        "Random Forest" : { 'n_estimators': [10, 50, 100,200, 500],'max_features': ['auto', 'sqrt', 'log2'],'max_depth' : [3,4,5,6,7,8],'criterion' :['gini', 'entropy']},
        "Naive Bayes" : {'var_smoothing': np.logspace(0,-9, num=100)},
        "QDA" : {'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5]}
        }

    for name, clf in zip(names, classifiers):
      gs_clf = GridSearchCV(clf, parameters[name], scoring='accuracy', cv=5)
      estimator = gs_clf.fit(X_train, y_train)
      best_params[name] = estimator.best_params_
      print(best_params)

    #"Neural Net" : {'solver': ['lbfgs'], 'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ], 'alpha': 10.0 ** -np.arange(1, 10), 'hidden_layer_sizes':np.arange(10, 15), 'random_state':[0,1,2,3,4,5,6,7,8,9]},
    #"AdaBoost" : {"base_estimator__criterion" : ["gini", "entropy"],"base_estimator__splitter" : ["best", "random"],"n_estimators": [1, 2]},