# Scenario 1

4 seasons as train set. Trying to predict the last 2 seasons.

# Bundesliga

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\bundesliga.csv")

In [3]:
df['HomeTeam'].unique()

array(['Bayern Munich', 'Dortmund', 'Ein Frankfurt', 'FC Koln',
       'Hannover', 'Hertha', 'Hoffenheim', "M'gladbach", 'Paderborn',
       'Augsburg', 'Hamburg', 'Leverkusen', 'Schalke 04', 'Stuttgart',
       'Werder Bremen', 'Wolfsburg', 'Freiburg', 'Mainz', 'Darmstadt',
       'Ingolstadt', 'RB Leipzig', 'Fortuna Dusseldorf', 'Nurnberg',
       'Union Berlin'], dtype=object)

In [4]:
team_id = {'Bayern Munich':1, 'Dortmund':2, 'Ein Frankfurt':3, 'FC Koln':4,
           'Hannover':5, 'Hertha':6, 'Hoffenheim':7, "M'gladbach":8, 'Paderborn':9,
           'Augsburg':10, 'Hamburg':11, 'Leverkusen':12, 'Schalke 04':13, 'Stuttgart':14,
           'Werder Bremen':15, 'Wolfsburg':16, 'Freiburg':17, 'Mainz':18, 'Darmstadt':19,
           'Ingolstadt':20, 'RB Leipzig':21, 'Fortuna Dusseldorf':22, 'Nurnberg':23,
           'Union Berlin':24}

In [5]:
for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [6]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'season','HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'l5_ravg_HTgc', 'l5_ravg_ATgc', 'HomeTeamPoints','AwayTeamPoints',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [12]:
X = df[['AVGH',
 'AVGD',
 'AVGA',
 'HT_draws',
 'AT_draws',
 'HT_losses',
 'AT_losses',
 'l5_ravg_HTST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_ravg_HTdeep',
 'l5_htdiff',
 'avgHTP',
 'avgATP',
 'diff_points']]
y = df['FTR']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [14]:
scaler = MinMaxScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [15]:
first_model = KNeighborsClassifier()

first_model.fit(scaled_X_train,y_train)
first_model_pred = first_model.predict(scaled_X_test)

In [16]:
confusion_matrix(y_test,first_model_pred)

array([[ 75,  37,  31],
       [ 51,  22,  48],
       [ 87,  58, 108]], dtype=int64)

In [17]:
print(classification_report(y_test,first_model_pred))

              precision    recall  f1-score   support

           A       0.35      0.52      0.42       143
           D       0.19      0.18      0.18       121
           H       0.58      0.43      0.49       253

    accuracy                           0.40       517
   macro avg       0.37      0.38      0.37       517
weighted avg       0.42      0.40      0.40       517



**KNN has very low accuracy comparing to the other models but it can classify the draws better than the other models.**

Since we can't perform RFE with KNN we will only check if we can improve the model using **GridSearchCV**.

In [18]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

In [19]:
grid_param = {'leaf_size': leaf_size,
              'n_neighbors': n_neighbors,
              'p': p,
              'weights':weights}

In [20]:
model = KNeighborsClassifier()

In [21]:
grid_model = GridSearchCV(model,param_grid=grid_param)

In [22]:
grid_model.fit(scaled_X_train,y_train)

In [23]:
y_pred = grid_model.predict(scaled_X_test)

In [24]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 39, 'p': 3, 'weights': 'distance'}

In [25]:
c = confusion_matrix(y_test,y_pred)

In [26]:
c

array([[ 68,   5,  70],
       [ 35,   3,  83],
       [ 54,   8, 191]], dtype=int64)

In [27]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           A       0.43      0.48      0.45       143
           D       0.19      0.02      0.04       121
           H       0.56      0.75      0.64       253

    accuracy                           0.51       517
   macro avg       0.39      0.42      0.38       517
weighted avg       0.44      0.51      0.45       517



**Best accuracy we could get is 0.51.**

Features **All**.

# **Premier League**

In [28]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\premier_league.csv")

In [29]:
#We assign a unique value at every team of the league
team_id = {'Arsenal':1, 'Leicester':2, 'Man United':3, 'QPR':4, 'Stoke':5, 'West Brom':6,
           'West Ham':7, 'Liverpool':8, 'Newcastle':9, 'Burnley':10, 'Aston Villa':11,
           'Chelsea':12, 'Crystal Palace':13, 'Everton':14, 'Southampton':15, 'Swansea':16,
           'Hull':17, 'Sunderland':18, 'Tottenham':19, 'Man City':20, 'Bournemouth':21,
           'Norwich':22, 'Watford':23, 'Middlesbrough':24, 'Brighton':25, 'Huddersfield':26,
           'Fulham':27, 'Wolves':28, 'Cardiff':29, 'Sheffield United':30}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [30]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'season','HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'l5_ravg_HTgc', 'l5_ravg_ATgc', 'HomeTeamPoints','AwayTeamPoints',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [33]:
X = df[['AVGH',
 'AVGD',
 'AVGA',
 'HT_draws',
 'AT_draws',
 'HT_losses',
 'AT_losses',
 'l5_ravg_HTST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_atdiff',
 'avgHTP',
 'avgATP',
 'diff_points',
 'diff_MID']]
y = df['FTR']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [35]:
scaler = MinMaxScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [36]:
base = KNeighborsClassifier()

In [37]:
base.fit(scaled_X_train,y_train)

In [38]:
base_pred = base.predict(scaled_X_test)

In [39]:
confusion_matrix(y_test, base_pred)

array([[ 98,  28,  55],
       [ 68,  24,  68],
       [ 95,  45, 173]], dtype=int64)

In [40]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.38      0.54      0.44       181
           D       0.25      0.15      0.19       160
           H       0.58      0.55      0.57       313

    accuracy                           0.45       654
   macro avg       0.40      0.41      0.40       654
weighted avg       0.44      0.45      0.44       654



In [41]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

In [42]:
grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

In [43]:
grid_model = GridSearchCV(base,param_grid=grid_param,scoring='accuracy')

In [44]:
grid_model.fit(scaled_X_train,y_train)

In [45]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 39, 'p': 2, 'weights': 'uniform'}

In [46]:
grid_pred = grid_model.predict(scaled_X_test)

In [47]:
confusion_matrix(y_test,grid_pred)

array([[100,   7,  74],
       [ 43,   3, 114],
       [ 48,  11, 254]], dtype=int64)

In [48]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.52      0.55      0.54       181
           D       0.14      0.02      0.03       160
           H       0.57      0.81      0.67       313

    accuracy                           0.55       654
   macro avg       0.41      0.46      0.41       654
weighted avg       0.45      0.55      0.48       654



**Best accuracy 0.55**

Features: 
- 'AVGH'
- 'AVGD'
- 'AVGA'
- 'HT_draws'
- 'AT_draws'
- 'HT_losses'
- 'AT_losses'
- 'l5_ravg_HTST'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_HTxpts'
- 'l5_ravg_ATxpts'
- 'l5_atdiff'
- 'avgHTP'
- 'avgATP'
- 'diff_points'
- 'diff_MID'

**La Liga**

In [49]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\la_liga.csv")

In [50]:
#We assign a unique value at every team of the league
team_id = {'Almeria':1, 'Granada':2, 'Malaga':3, 'Sevilla':4, 'Barcelona':5, 'Celta':6,
           'Eibar':7, 'Levante':8, 'Real Madrid':9, 'Vallecano':10, 'Getafe':11,
           'Valencia':12, 'Ath Bilbao':13, 'Ath Madrid':14, 'Cordoba':15, 'Espanol':16,
           'Elche':17, 'La Coruna':18, 'Sociedad':19, 'Villarreal':20, 'Betis':21,
           'Sp Gijon':22, 'Las Palmas':23, 'Leganes':24, 'Osasuna':25, 'Alaves':26, 'Girona':27,
           'Valladolid':28, 'Huesca':29, 'Mallorca':30}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [51]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'season','HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'l5_ravg_HTgc', 'l5_ravg_ATgc', 'HomeTeamPoints','AwayTeamPoints',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [58]:
#We are training the model using the features selected by RFE having as core the Logistic Regression model
X = df[['AVGH', 'AVGD', 'AVGA','HT_wins', 'AT_wins', 'HT_draws', 'AT_draws', 'HT_losses', 'AT_losses','l5_ravg_ATST',
        'l5_ravg_HTCR','l5_ravg_ATCR', 'l5_ravg_HTxG', 'l5_ravg_ATxG', 'l5_ravg_HTxpts', 'l5_htdiff','l5_atdiff',
        'avgHTP', 'avgATP', 'l5_ravg_HTp', 'diff_points', 'diff_MID', 'diff_OVA']]
y = df['FTR']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [60]:
scaler = MinMaxScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [61]:
first_model = KNeighborsClassifier()

first_model.fit(scaled_X_train,y_train)
first_model_pred = first_model.predict(scaled_X_test)

In [62]:
confusion_matrix(y_test,first_model_pred)

array([[ 98,  30,  56],
       [ 61,  32,  66],
       [ 81,  64, 166]], dtype=int64)

In [63]:
print(classification_report(y_test,first_model_pred))

              precision    recall  f1-score   support

           A       0.41      0.53      0.46       184
           D       0.25      0.20      0.22       159
           H       0.58      0.53      0.55       311

    accuracy                           0.45       654
   macro avg       0.41      0.42      0.41       654
weighted avg       0.45      0.45      0.45       654



In [64]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

In [65]:
grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

In [66]:
grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

In [67]:
grid_model.fit(scaled_X_train,y_train)

In [68]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 22, 'p': 1, 'weights': 'distance'}

In [69]:
grid_pred = grid_model.predict(scaled_X_test)

In [70]:
confusion_matrix(y_test,grid_pred)

array([[ 76,  20,  88],
       [ 43,  24,  92],
       [ 54,  23, 234]], dtype=int64)

In [71]:
print(classification_report(y_test,first_model_pred))

              precision    recall  f1-score   support

           A       0.41      0.53      0.46       184
           D       0.25      0.20      0.22       159
           H       0.58      0.53      0.55       311

    accuracy                           0.45       654
   macro avg       0.41      0.42      0.41       654
weighted avg       0.45      0.45      0.45       654



**Best accuracy I could get is 0.45. Actually it didn't improved after GridSearchCV**

# Serie A

In [72]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\serie_a.csv")

In [73]:
#We assign a unique value at every team of the league
team_id = {'Chievo':1, 'Roma':2, 'Atalanta':3, 'Cesena':4, 'Genoa':5, 'Milan':6,
           'Palermo':7, 'Sassuolo':8, 'Torino':9, 'Udinese':10, 'Empoli':11, 'Juventus':12,
           'Cagliari':13, 'Fiorentina':14, 'Inter':15, 'Lazio':16, 'Napoli':17, 'Parma':18,
           'Sampdoria':19, 'Verona':20, 'Frosinone':21, 'Bologna':22, 'Carpi':23, 'Pescara':24,
           'Crotone':25, 'Benevento':26, 'Spal':27, 'Lecce':28, 'Brescia':29}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [74]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD','season',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints', 'AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'HTL', 'ATL', 'ATD', 'ATW','l5_ravg_HTgc', 'l5_ravg_ATgc','round'],axis=1)

df = df.dropna()

In [75]:
X = df[['AVGH', 'AVGD', 'AVGA', 'l5_ravg_ATCR', 'avgHTP', 'avgATP']]
y = df['FTR']

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [77]:
scaler = MinMaxScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [78]:
first_model = KNeighborsClassifier()

first_model.fit(scaled_X_train,y_train)
first_model_pred = first_model.predict(scaled_X_test)

In [79]:
confusion_matrix(y_test,first_model_pred)

array([[123,  30,  44],
       [ 73,  33,  70],
       [ 66,  55, 159]], dtype=int64)

In [80]:
print(classification_report(y_test,first_model_pred))

              precision    recall  f1-score   support

           A       0.47      0.62      0.54       197
           D       0.28      0.19      0.22       176
           H       0.58      0.57      0.58       280

    accuracy                           0.48       653
   macro avg       0.44      0.46      0.45       653
weighted avg       0.47      0.48      0.47       653



In [81]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

In [82]:
grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

In [83]:
grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

In [84]:
grid_model.fit(scaled_X_train,y_train)

In [85]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 37, 'p': 2, 'weights': 'distance'}

In [86]:
grid_pred = grid_model.predict(scaled_X_test)

In [87]:
confusion_matrix(y_test,grid_pred)

array([[115,  19,  63],
       [ 57,  14, 105],
       [ 43,  25, 212]], dtype=int64)

In [88]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.53      0.58      0.56       197
           D       0.24      0.08      0.12       176
           H       0.56      0.76      0.64       280

    accuracy                           0.52       653
   macro avg       0.44      0.47      0.44       653
weighted avg       0.47      0.52      0.48       653



**KNN actually worked really well for Seria A. It gave accuracy score of 0.52 and it also guessed right 30 draws.***

Features: **All**.

# Scenario 2

**Premier League**

In [12]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\premier_league.csv")

In [13]:
#We assign a unique value at every team of the league
team_id = {'Arsenal':1, 'Leicester':2, 'Man United':3, 'QPR':4, 'Stoke':5, 'West Brom':6,
           'West Ham':7, 'Liverpool':8, 'Newcastle':9, 'Burnley':10, 'Aston Villa':11,
           'Chelsea':12, 'Crystal Palace':13, 'Everton':14, 'Southampton':15, 'Swansea':16,
           'Hull':17, 'Sunderland':18, 'Tottenham':19, 'Man City':20, 'Bournemouth':21,
           'Norwich':22, 'Watford':23, 'Middlesbrough':24, 'Brighton':25, 'Huddersfield':26,
           'Fulham':27, 'Wolves':28, 'Cardiff':29, 'Sheffield United':30}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [14]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints','AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs','l5_ravg_HTgc', 'l5_ravg_ATgc',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [15]:
s1415 = df[df['season']=='2014/2015'].drop('season',axis=1)
s1516 = df[df['season']=='2015/2016'].drop('season',axis=1)
s1617 = df[df['season']=='2016/2017'].drop('season',axis=1)
s1718 = df[df['season']=='2017/2018'].drop('season',axis=1)
s1819 = df[df['season']=='2018/2019'].drop('season',axis=1)
s1920 = df[df['season']=='2019/2020'].drop('season',axis=1)

**Season 14/15**

In [16]:
X = s1415.drop('FTR',axis=1)
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [17]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [18]:
confusion_matrix(y_test,base_pred)

array([[35,  8, 16],
       [12,  8, 21],
       [29, 10, 51]], dtype=int64)

In [19]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.46      0.59      0.52        59
           D       0.31      0.20      0.24        41
           H       0.58      0.57      0.57        90

    accuracy                           0.49       190
   macro avg       0.45      0.45      0.44       190
weighted avg       0.48      0.49      0.48       190



We are going to implement GridSearchCV, since we can't perform RFE at KNN.

In [20]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

In [22]:
grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [23]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 34, 'p': 1, 'weights': 'uniform'}

In [24]:
confusion_matrix(y_test,grid_pred)

array([[32,  0, 27],
       [ 9,  0, 32],
       [18,  0, 72]], dtype=int64)

In [25]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.54      0.54      0.54        59
           D       0.00      0.00      0.00        41
           H       0.55      0.80      0.65        90

    accuracy                           0.55       190
   macro avg       0.36      0.45      0.40       190
weighted avg       0.43      0.55      0.48       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Season 15/16**

In [26]:
X = s1516.drop('FTR',axis=1)
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [27]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [28]:
confusion_matrix(y_test,base_pred)

array([[24,  9, 20],
       [21, 10, 22],
       [26, 20, 38]], dtype=int64)

In [29]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.34      0.45      0.39        53
           D       0.26      0.19      0.22        53
           H       0.47      0.45      0.46        84

    accuracy                           0.38       190
   macro avg       0.36      0.36      0.36       190
weighted avg       0.38      0.38      0.37       190



In [30]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

In [31]:
grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [32]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 20, 'p': 3, 'weights': 'uniform'}

In [33]:
confusion_matrix(y_test,grid_pred)

array([[21,  6, 26],
       [10,  5, 38],
       [ 9,  8, 67]], dtype=int64)

In [34]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.53      0.40      0.45        53
           D       0.26      0.09      0.14        53
           H       0.51      0.80      0.62        84

    accuracy                           0.49       190
   macro avg       0.43      0.43      0.40       190
weighted avg       0.45      0.49      0.44       190



Best accuracy we could get after GridSearchCV is 0,49.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 20 
- 'p': 3 
- 'weights': 'uniform'

**Season 16/17**

In [35]:
X = s1617.drop('FTR',axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [36]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [37]:
confusion_matrix(y_test,base_pred)

array([[44,  0,  9],
       [28,  0, 14],
       [35,  0, 60]], dtype=int64)

In [38]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.41      0.83      0.55        53
           D       0.00      0.00      0.00        42
           H       0.72      0.63      0.67        95

    accuracy                           0.55       190
   macro avg       0.38      0.49      0.41       190
weighted avg       0.48      0.55      0.49       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

In [40]:
grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [43]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 39, 'p': 1, 'weights': 'uniform'}

In [41]:
confusion_matrix(y_test,grid_pred)

array([[29,  0, 24],
       [11,  0, 31],
       [ 8,  0, 87]], dtype=int64)

In [42]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.60      0.55      0.57        53
           D       0.00      0.00      0.00        42
           H       0.61      0.92      0.73        95

    accuracy                           0.61       190
   macro avg       0.41      0.49      0.44       190
weighted avg       0.47      0.61      0.53       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0.61 after implementing GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 39 
- 'p': 1 
- 'weights': 'uniform'

**Season 17/18**

In [44]:
X = s1718.drop('FTR',axis=1)
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [45]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [46]:
confusion_matrix(y_test,base_pred)

array([[40,  7,  3],
       [31, 11, 10],
       [35, 15, 38]], dtype=int64)

In [47]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.38      0.80      0.51        50
           D       0.33      0.21      0.26        52
           H       0.75      0.43      0.55        88

    accuracy                           0.47       190
   macro avg       0.49      0.48      0.44       190
weighted avg       0.54      0.47      0.46       190



In [48]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [49]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 16, 'p': 3, 'weights': 'distance'}

In [50]:
confusion_matrix(y_test,grid_pred)

array([[36,  0, 14],
       [15,  6, 31],
       [23,  5, 60]], dtype=int64)

In [52]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.49      0.72      0.58        50
           D       0.55      0.12      0.19        52
           H       0.57      0.68      0.62        88

    accuracy                           0.54       190
   macro avg       0.53      0.51      0.46       190
weighted avg       0.54      0.54      0.49       190



The best accuracy we could get is 0,54 after implementing GridSearchCv.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 16 
- 'p': 3 
- 'weights': 'distance'

**Season 18/19**

In [53]:
X = s1819.drop('FTR',axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [54]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [55]:
confusion_matrix(y_test,base_pred)

array([[49,  4,  9],
       [19,  3, 11],
       [43,  5, 47]], dtype=int64)

In [56]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.44      0.79      0.57        62
           D       0.25      0.09      0.13        33
           H       0.70      0.49      0.58        95

    accuracy                           0.52       190
   macro avg       0.46      0.46      0.43       190
weighted avg       0.54      0.52      0.50       190



In [57]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [58]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 12, 'p': 2, 'weights': 'distance'}

In [59]:
confusion_matrix(y_test,grid_pred)

array([[47,  0, 15],
       [17,  1, 15],
       [36,  1, 58]], dtype=int64)

In [60]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.47      0.76      0.58        62
           D       0.50      0.03      0.06        33
           H       0.66      0.61      0.63        95

    accuracy                           0.56       190
   macro avg       0.54      0.47      0.42       190
weighted avg       0.57      0.56      0.52       190



Best accuracy we could get is 0,56 after GridSearchCV

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 12 
- 'p': 2 
- 'weights': 'distance'

**Season 19/20**

In [61]:
X = s1920.drop('FTR',axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [62]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [63]:
confusion_matrix(y_test,base_pred)

array([[37,  5, 13],
       [20,  7, 19],
       [39,  5, 45]], dtype=int64)

In [64]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.39      0.67      0.49        55
           D       0.41      0.15      0.22        46
           H       0.58      0.51      0.54        89

    accuracy                           0.47       190
   macro avg       0.46      0.44      0.42       190
weighted avg       0.49      0.47      0.45       190



In [65]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [66]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 12, 'p': 3, 'weights': 'uniform'}

In [67]:
confusion_matrix(y_test,grid_pred)

array([[30,  0, 25],
       [13,  1, 32],
       [22,  1, 66]], dtype=int64)

In [68]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.46      0.55      0.50        55
           D       0.50      0.02      0.04        46
           H       0.54      0.74      0.62        89

    accuracy                           0.51       190
   macro avg       0.50      0.44      0.39       190
weighted avg       0.51      0.51      0.45       190



Best accuracy we could get is 0.51 after implementing GridSearchCV.

Hyperparameters:
- 'leaf_size': 1, 
- 'n_neighbors': 12, 
- 'p': 3, 
- 'weights': 'uniform'

**Bundesliga**

In [11]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\bundesliga.csv")

In [12]:
#We assign a unique value at every team of the league
team_id = {'Bayern Munich':1, 'Dortmund':2, 'Ein Frankfurt':3, 'FC Koln':4,
           'Hannover':5, 'Hertha':6, 'Hoffenheim':7, "M'gladbach":8, 'Paderborn':9,
           'Augsburg':10, 'Hamburg':11, 'Leverkusen':12, 'Schalke 04':13, 'Stuttgart':14,
           'Werder Bremen':15, 'Wolfsburg':16, 'Freiburg':17, 'Mainz':18, 'Darmstadt':19,
           'Ingolstadt':20, 'RB Leipzig':21, 'Fortuna Dusseldorf':22, 'Nurnberg':23,
           'Union Berlin':24}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [13]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints', 'AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'HTL', 'ATL', 'ATD', 'ATW','l5_ravg_HTgc', 'l5_ravg_ATgc'],axis=1)

df = df.dropna()

In [14]:
s1415 = df[df['season']=='2014/2015'].drop('season',axis=1)
s1516 = df[df['season']=='2015/2016'].drop('season',axis=1)
s1617 = df[df['season']=='2016/2017'].drop('season',axis=1)
s1718 = df[df['season']=='2017/2018'].drop('season',axis=1)
s1819 = df[df['season']=='2018/2019'].drop('season',axis=1)
s1920 = df[df['season']=='2019/2020'].drop('season',axis=1)

**Season 14/15**

In [15]:
X = s1415.drop('FTR',axis=1)
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [16]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [17]:
confusion_matrix(y_test,base_pred)

array([[28,  8,  4],
       [22, 10,  7],
       [18, 43, 13]], dtype=int64)

In [18]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.41      0.70      0.52        40
           D       0.16      0.26      0.20        39
           H       0.54      0.18      0.27        74

    accuracy                           0.33       153
   macro avg       0.37      0.38      0.33       153
weighted avg       0.41      0.33      0.31       153



In [19]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy',n_jobs=-1)

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [20]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 25, 'p': 3, 'weights': 'uniform'}

In [21]:
confusion_matrix(y_test,grid_pred)

array([[14,  4, 22],
       [ 6,  4, 29],
       [ 6, 17, 51]], dtype=int64)

In [22]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.54      0.35      0.42        40
           D       0.16      0.10      0.12        39
           H       0.50      0.69      0.58        74

    accuracy                           0.45       153
   macro avg       0.40      0.38      0.38       153
weighted avg       0.42      0.45      0.42       153



The best accuracy we could get is 0.45 after the implementation of GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 25
- 'p': 3 
- 'weights': 'uniform'

**Season 15/16**

In [82]:
X = s1516.drop('FTR',axis=1)
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [83]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [84]:
confusion_matrix(y_test,base_pred)

array([[19,  6, 21],
       [17,  6, 15],
       [20,  4, 45]], dtype=int64)

In [85]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.34      0.41      0.37        46
           D       0.38      0.16      0.22        38
           H       0.56      0.65      0.60        69

    accuracy                           0.46       153
   macro avg       0.42      0.41      0.40       153
weighted avg       0.45      0.46      0.44       153



In [86]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [89]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 27, 'p': 2, 'weights': 'uniform'}

In [87]:
confusion_matrix(y_test,grid_pred)

array([[20,  1, 25],
       [ 7,  0, 31],
       [ 8,  3, 58]], dtype=int64)

In [88]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.57      0.43      0.49        46
           D       0.00      0.00      0.00        38
           H       0.51      0.84      0.63        69

    accuracy                           0.51       153
   macro avg       0.36      0.43      0.38       153
weighted avg       0.40      0.51      0.43       153



Best accuracy we could get is 0.51 after implementing GridSearchCV.

Hyperparameters:
- 'leaf_size': 1
- 'n_neighbors': 27
- 'p': 2
- 'weights': 'uniform'

**Season 16/17**

In [90]:
X = s1617.drop('FTR',axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [91]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [92]:
confusion_matrix(y_test,base_pred)

array([[18, 11, 12],
       [ 9,  9, 17],
       [13, 30, 34]], dtype=int64)

In [93]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.45      0.44      0.44        41
           D       0.18      0.26      0.21        35
           H       0.54      0.44      0.49        77

    accuracy                           0.40       153
   macro avg       0.39      0.38      0.38       153
weighted avg       0.43      0.40      0.41       153



In [94]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [95]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 32, 'p': 1, 'weights': 'uniform'}

In [96]:
confusion_matrix(y_test,grid_pred)

array([[ 7,  0, 34],
       [ 3,  0, 32],
       [ 4,  0, 73]], dtype=int64)

In [97]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.50      0.17      0.25        41
           D       0.00      0.00      0.00        35
           H       0.53      0.95      0.68        77

    accuracy                           0.52       153
   macro avg       0.34      0.37      0.31       153
weighted avg       0.40      0.52      0.41       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best accuracy we could get is 0,52 after implementing GridSearchCV.

Hyperparameters:
- 'leaf_size': 1  
- 'n_neighbors': 32
- 'p': 1 
- 'weights': 'uniform'

**Season 17/18**

In [26]:
X = s1718.drop('FTR',axis=1)
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [27]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [28]:
confusion_matrix(y_test,base_pred)

array([[13, 13, 18],
       [12,  7, 19],
       [ 9, 23, 39]], dtype=int64)

In [29]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.38      0.30      0.33        44
           D       0.16      0.18      0.17        38
           H       0.51      0.55      0.53        71

    accuracy                           0.39       153
   macro avg       0.35      0.34      0.35       153
weighted avg       0.39      0.39      0.39       153



In [30]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [31]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 31, 'p': 1, 'weights': 'uniform'}

In [32]:
confusion_matrix(y_test,grid_pred)

array([[ 9,  8, 27],
       [ 5,  6, 27],
       [ 3, 15, 53]], dtype=int64)

In [34]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.53      0.20      0.30        44
           D       0.21      0.16      0.18        38
           H       0.50      0.75      0.60        71

    accuracy                           0.44       153
   macro avg       0.41      0.37      0.36       153
weighted avg       0.43      0.44      0.41       153



The best accuracy we could get is 0,44 after implementing GridSearchCV.

Hyperparameter:
- 'leaf_size': 1 
- 'n_neighbors': 31 
- 'p': 1 
- 'weights': 'uniform'

**Season 18/19**

In [35]:
X = s1819.drop('FTR',axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [36]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [37]:
confusion_matrix(y_test,base_pred)

array([[24, 13, 12],
       [13,  9, 13],
       [20, 13, 36]], dtype=int64)

In [38]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.42      0.49      0.45        49
           D       0.26      0.26      0.26        35
           H       0.59      0.52      0.55        69

    accuracy                           0.45       153
   macro avg       0.42      0.42      0.42       153
weighted avg       0.46      0.45      0.45       153



In [41]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [42]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 23, 'p': 3, 'weights': 'uniform'}

In [43]:
confusion_matrix(y_test,grid_pred)

array([[18,  5, 26],
       [ 6,  4, 25],
       [ 8,  3, 58]], dtype=int64)

In [44]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.56      0.37      0.44        49
           D       0.33      0.11      0.17        35
           H       0.53      0.84      0.65        69

    accuracy                           0.52       153
   macro avg       0.48      0.44      0.42       153
weighted avg       0.50      0.52      0.48       153



The best accuracy we could get is 0,52 after implementing GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 23
- 'p': 3
- 'weights': 'uniform'

**Season 19/20**

In [45]:
X = s1920.drop('FTR',axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [46]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [47]:
confusion_matrix(y_test,base_pred)

array([[29,  9, 26],
       [11, 10, 14],
       [18,  8, 28]], dtype=int64)

In [48]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.50      0.45      0.48        64
           D       0.37      0.29      0.32        35
           H       0.41      0.52      0.46        54

    accuracy                           0.44       153
   macro avg       0.43      0.42      0.42       153
weighted avg       0.44      0.44      0.43       153



In [49]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [50]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 36, 'p': 1, 'weights': 'uniform'}

In [51]:
confusion_matrix(y_test,grid_pred)

array([[22,  0, 42],
       [ 2,  0, 33],
       [ 3,  0, 51]], dtype=int64)

In [52]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.81      0.34      0.48        64
           D       0.00      0.00      0.00        35
           H       0.40      0.94      0.57        54

    accuracy                           0.48       153
   macro avg       0.41      0.43      0.35       153
weighted avg       0.48      0.48      0.40       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,48 after GridSearchCV

Hyperparameter:
- 'leaf_size': 1 
- 'n_neighbors': 36 
- 'p': 1 
- 'weights': 'uniform'

**Serie A**

In [11]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\serie_a.csv")

In [12]:
#We assign a unique value at every team of the league
team_id = {'Chievo':1, 'Roma':2, 'Atalanta':3, 'Cesena':4, 'Genoa':5, 'Milan':6,
           'Palermo':7, 'Sassuolo':8, 'Torino':9, 'Udinese':10, 'Empoli':11, 'Juventus':12,
           'Cagliari':13, 'Fiorentina':14, 'Inter':15, 'Lazio':16, 'Napoli':17, 'Parma':18,
           'Sampdoria':19, 'Verona':20, 'Frosinone':21, 'Bologna':22, 'Carpi':23, 'Pescara':24,
           'Crotone':25, 'Benevento':26, 'Spal':27, 'Lecce':28, 'Brescia':29}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [13]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints', 'AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'HTL', 'ATL', 'ATD', 'ATW','l5_ravg_HTgc', 'l5_ravg_ATgc'],axis=1)

df = df.dropna()

In [14]:
s1415 = df[df['season']=='2014/2015'].drop('season',axis=1)
s1516 = df[df['season']=='2015/2016'].drop('season',axis=1)
s1617 = df[df['season']=='2016/2017'].drop('season',axis=1)
s1718 = df[df['season']=='2017/2018'].drop('season',axis=1)
s1819 = df[df['season']=='2018/2019'].drop('season',axis=1)
s1920 = df[df['season']=='2019/2020'].drop('season',axis=1)

**Season 14/15**

In [58]:
X = s1415.drop('FTR',axis=1)
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [59]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [60]:
confusion_matrix(y_test,base_pred)

array([[25, 26,  2],
       [19, 25, 10],
       [26, 43, 14]], dtype=int64)

In [61]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.36      0.47      0.41        53
           D       0.27      0.46      0.34        54
           H       0.54      0.17      0.26        83

    accuracy                           0.34       190
   macro avg       0.39      0.37      0.33       190
weighted avg       0.41      0.34      0.32       190



In [63]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [64]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 30, 'p': 1, 'weights': 'distance'}

In [65]:
confusion_matrix(y_test,grid_pred)

array([[14, 35,  4],
       [ 9, 40,  5],
       [ 6, 65, 12]], dtype=int64)

In [66]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.48      0.26      0.34        53
           D       0.29      0.74      0.41        54
           H       0.57      0.14      0.23        83

    accuracy                           0.35       190
   macro avg       0.45      0.38      0.33       190
weighted avg       0.47      0.35      0.31       190



Τhe best accuracy we could get is 0,35 after GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 30 
- 'p': 1 
- 'weights': 'distance'

**Season 15/16**

In [67]:
X = s1516.drop('FTR',axis=1)
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [68]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [69]:
confusion_matrix(y_test,base_pred)

array([[45,  3,  0],
       [41, 10,  2],
       [60, 13, 16]], dtype=int64)

In [70]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.31      0.94      0.46        48
           D       0.38      0.19      0.25        53
           H       0.89      0.18      0.30        89

    accuracy                           0.37       190
   macro avg       0.53      0.44      0.34       190
weighted avg       0.60      0.37      0.33       190



In [71]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [72]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 32, 'p': 3, 'weights': 'uniform'}

In [73]:
confusion_matrix(y_test,grid_pred)

array([[46,  0,  2],
       [38,  0, 15],
       [50,  0, 39]], dtype=int64)

In [74]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.34      0.96      0.51        48
           D       0.00      0.00      0.00        53
           H       0.70      0.44      0.54        89

    accuracy                           0.45       190
   macro avg       0.35      0.47      0.35       190
weighted avg       0.41      0.45      0.38       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,45 after GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 32 
- 'p': 3 
- 'weights': 'uniform'

**Season 16/17**

In [75]:
X = s1617.drop('FTR',axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [76]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [77]:
confusion_matrix(y_test,base_pred)

array([[36,  0, 28],
       [18,  1, 23],
       [18,  1, 65]], dtype=int64)

In [78]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.50      0.56      0.53        64
           D       0.50      0.02      0.05        42
           H       0.56      0.77      0.65        84

    accuracy                           0.54       190
   macro avg       0.52      0.45      0.41       190
weighted avg       0.53      0.54      0.48       190



In [79]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [80]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 18, 'p': 2, 'weights': 'uniform'}

In [81]:
confusion_matrix(y_test,grid_pred)

array([[35,  0, 29],
       [14,  0, 28],
       [11,  0, 73]], dtype=int64)

In [82]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.58      0.55      0.56        64
           D       0.00      0.00      0.00        42
           H       0.56      0.87      0.68        84

    accuracy                           0.57       190
   macro avg       0.38      0.47      0.42       190
weighted avg       0.44      0.57      0.49       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best accuracy we could get is 0,57 after GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 18 
- 'p': 2 
- 'weights': 'uniform'

**Season 17/18**

In [83]:
X = s1718.drop('FTR',axis=1)
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [84]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [85]:
confusion_matrix(y_test,base_pred)

array([[35, 22,  6],
       [19, 16,  6],
       [30, 26, 30]], dtype=int64)

In [86]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.42      0.56      0.48        63
           D       0.25      0.39      0.30        41
           H       0.71      0.35      0.47        86

    accuracy                           0.43       190
   macro avg       0.46      0.43      0.42       190
weighted avg       0.52      0.43      0.44       190



In [87]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [88]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}

In [89]:
confusion_matrix(y_test,grid_pred)

array([[39, 16,  8],
       [22, 12,  7],
       [25, 18, 43]], dtype=int64)

In [90]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.45      0.62      0.52        63
           D       0.26      0.29      0.28        41
           H       0.74      0.50      0.60        86

    accuracy                           0.49       190
   macro avg       0.49      0.47      0.47       190
weighted avg       0.54      0.49      0.50       190



Best accuracy we could get is 0,49 after GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 9 
- 'p': 1 
- 'weights': 'uniform'

**Season 18/19**

In [91]:
X = s1819.drop('FTR',axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [92]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [93]:
confusion_matrix(y_test,base_pred)

array([[15, 24, 16],
       [12, 15, 25],
       [20, 16, 47]], dtype=int64)

In [94]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.32      0.27      0.29        55
           D       0.27      0.29      0.28        52
           H       0.53      0.57      0.55        83

    accuracy                           0.41       190
   macro avg       0.38      0.38      0.37       190
weighted avg       0.40      0.41      0.40       190



In [95]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [96]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 15, 'p': 1, 'weights': 'distance'}

In [97]:
confusion_matrix(y_test,grid_pred)

array([[16, 29, 10],
       [ 5, 23, 24],
       [ 6, 23, 54]], dtype=int64)

In [98]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.59      0.29      0.39        55
           D       0.31      0.44      0.36        52
           H       0.61      0.65      0.63        83

    accuracy                           0.49       190
   macro avg       0.50      0.46      0.46       190
weighted avg       0.52      0.49      0.49       190



The best accuracy we could get is 0,49 after implementing GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 9 
- 'p': 1 
- 'weights': 'uniform'

**Season 19/20**

In [99]:
X = s1920.drop('FTR',axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [100]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [101]:
confusion_matrix(y_test,base_pred)

array([[44,  3, 20],
       [23,  4, 16],
       [25,  5, 50]], dtype=int64)

In [102]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.48      0.66      0.55        67
           D       0.33      0.09      0.15        43
           H       0.58      0.62      0.60        80

    accuracy                           0.52       190
   macro avg       0.46      0.46      0.43       190
weighted avg       0.49      0.52      0.48       190



In [103]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [104]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 35, 'p': 1, 'weights': 'distance'}

In [105]:
confusion_matrix(y_test,grid_pred)

array([[40,  0, 27],
       [20,  0, 23],
       [21,  0, 59]], dtype=int64)

In [106]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.49      0.60      0.54        67
           D       0.00      0.00      0.00        43
           H       0.54      0.74      0.62        80

    accuracy                           0.52       190
   macro avg       0.35      0.44      0.39       190
weighted avg       0.40      0.52      0.45       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,52 after implementing GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 35 
- 'p': 1 
- 'weights': 'distance'

**La Liga**

In [12]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\la_liga.csv")

In [13]:
#We assign a unique value at every team of the league
team_id = {'Almeria':1, 'Granada':2, 'Malaga':3, 'Sevilla':4, 'Barcelona':5, 'Celta':6,
           'Eibar':7, 'Levante':8, 'Real Madrid':9, 'Vallecano':10, 'Getafe':11,
           'Valencia':12, 'Ath Bilbao':13, 'Ath Madrid':14, 'Cordoba':15, 'Espanol':16,
           'Elche':17, 'La Coruna':18, 'Sociedad':19, 'Villarreal':20, 'Betis':21,
           'Sp Gijon':22, 'Las Palmas':23, 'Leganes':24, 'Osasuna':25, 'Alaves':26, 'Girona':27,
           'Valladolid':28, 'Huesca':29, 'Mallorca':30}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [14]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints','AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs','l5_ravg_HTgc', 'l5_ravg_ATgc',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [15]:
s1415 = df[df['season']=='2014/2015'].drop('season',axis=1)
s1516 = df[df['season']=='2015/2016'].drop('season',axis=1)
s1617 = df[df['season']=='2016/2017'].drop('season',axis=1)
s1718 = df[df['season']=='2017/2018'].drop('season',axis=1)
s1819 = df[df['season']=='2018/2019'].drop('season',axis=1)
s1920 = df[df['season']=='2019/2020'].drop('season',axis=1)

**Season 14/15**

In [111]:
X = s1415.drop('FTR',axis=1)
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [112]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [113]:
confusion_matrix(y_test,base_pred)

array([[33,  9, 14],
       [15, 13, 16],
       [24, 10, 56]], dtype=int64)

In [114]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.46      0.59      0.52        56
           D       0.41      0.30      0.34        44
           H       0.65      0.62      0.64        90

    accuracy                           0.54       190
   macro avg       0.51      0.50      0.50       190
weighted avg       0.54      0.54      0.53       190



In [115]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [116]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 39, 'p': 2, 'weights': 'distance'}

In [117]:
confusion_matrix(y_test,grid_pred)

array([[38,  5, 13],
       [17,  2, 25],
       [17,  2, 71]], dtype=int64)

In [118]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.53      0.68      0.59        56
           D       0.22      0.05      0.08        44
           H       0.65      0.79      0.71        90

    accuracy                           0.58       190
   macro avg       0.47      0.50      0.46       190
weighted avg       0.52      0.58      0.53       190



The best accuracy we could get is 0,58 after implementing GridSearchCV

Hyperparameters:
- 'leaf_size': 1
- 'n_neighbors': 39 
- 'p': 2 
- 'weights': 'distance'

**Season 15/16**

In [119]:
X = s1516.drop('FTR',axis=1)
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [120]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [121]:
confusion_matrix(y_test,base_pred)

array([[15, 15, 23],
       [11,  7, 27],
       [18,  3, 71]], dtype=int64)

In [122]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.34      0.28      0.31        53
           D       0.28      0.16      0.20        45
           H       0.59      0.77      0.67        92

    accuracy                           0.49       190
   macro avg       0.40      0.40      0.39       190
weighted avg       0.45      0.49      0.46       190



In [123]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [124]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 9, 'p': 2, 'weights': 'distance'}

In [125]:
confusion_matrix(y_test,grid_pred)

array([[20,  7, 26],
       [ 8,  3, 34],
       [10,  2, 80]], dtype=int64)

In [126]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.53      0.38      0.44        53
           D       0.25      0.07      0.11        45
           H       0.57      0.87      0.69        92

    accuracy                           0.54       190
   macro avg       0.45      0.44      0.41       190
weighted avg       0.48      0.54      0.48       190



Best accuracy we could get is 0.54 after implementing GridSearchCV.

Hyperparameters:
- 'leaf_size': 1 
- 'n_neighbors': 9 
- 'p': 2 
- 'weights': 'distance'

**Season 16/17**

In [16]:
X = s1617.drop('FTR',axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [17]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [18]:
confusion_matrix(y_test,base_pred)

array([[32, 25,  4],
       [ 5, 18, 12],
       [10, 42, 42]], dtype=int64)

In [19]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.68      0.52      0.59        61
           D       0.21      0.51      0.30        35
           H       0.72      0.45      0.55        94

    accuracy                           0.48       190
   macro avg       0.54      0.50      0.48       190
weighted avg       0.62      0.48      0.52       190



In [20]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [21]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}

In [22]:
confusion_matrix(y_test,grid_pred)

array([[27, 23, 11],
       [ 6, 12, 17],
       [ 8, 35, 51]], dtype=int64)

In [23]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.66      0.44      0.53        61
           D       0.17      0.34      0.23        35
           H       0.65      0.54      0.59        94

    accuracy                           0.47       190
   macro avg       0.49      0.44      0.45       190
weighted avg       0.56      0.47      0.50       190



Best accuracy we could get is 0,47 after implementing GridSearchCV

Hyperparameters:
- 'leaf_size': 1
- 'n_neighbors': 10
- 'p': 2
- 'weights': 'uniform'

**Season 17/18**

In [46]:
X = s1718.drop('FTR',axis=1)
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [47]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [48]:
confusion_matrix(y_test,base_pred)

array([[34,  9, 10],
       [24,  9, 11],
       [44, 10, 39]], dtype=int64)

In [49]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.33      0.64      0.44        53
           D       0.32      0.20      0.25        44
           H       0.65      0.42      0.51        93

    accuracy                           0.43       190
   macro avg       0.43      0.42      0.40       190
weighted avg       0.49      0.43      0.43       190



In [50]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [51]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 29, 'p': 1, 'weights': 'uniform'}

In [52]:
confusion_matrix(y_test,grid_pred)

array([[25,  0, 28],
       [18,  0, 26],
       [20,  0, 73]], dtype=int64)

In [53]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.40      0.47      0.43        53
           D       0.00      0.00      0.00        44
           H       0.57      0.78      0.66        93

    accuracy                           0.52       190
   macro avg       0.32      0.42      0.36       190
weighted avg       0.39      0.52      0.45       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best accuracy we could get is 0,52 after implementing GridSearchCV

Hyperparameters:
- 'leaf_size': 1
- 'n_neighbors': 29
- 'p': 1
- 'weights': 'uniform'

**Season 18/19**

In [54]:
X = s1819.drop('FTR',axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [55]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [56]:
confusion_matrix(y_test,base_pred)

array([[24, 15, 11],
       [18, 12, 18],
       [28, 27, 37]], dtype=int64)

In [57]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.34      0.48      0.40        50
           D       0.22      0.25      0.24        48
           H       0.56      0.40      0.47        92

    accuracy                           0.38       190
   macro avg       0.38      0.38      0.37       190
weighted avg       0.42      0.38      0.39       190



In [58]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [59]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 10, 'p': 3, 'weights': 'distance'}

In [60]:
confusion_matrix(y_test,grid_pred)

array([[11, 22, 17],
       [ 8, 18, 22],
       [15, 22, 55]], dtype=int64)

In [61]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.32      0.22      0.26        50
           D       0.29      0.38      0.33        48
           H       0.59      0.60      0.59        92

    accuracy                           0.44       190
   macro avg       0.40      0.40      0.39       190
weighted avg       0.44      0.44      0.44       190



Best accuracy we could get is 0,44 after implementing Grid Search.

Hyperparameters:
- 'leaf_size': 1
- 'n_neighbors': 10
- 'p': 3
- 'weights': 'distance

**Season 19/20**

In [38]:
X = s1920.drop('FTR',axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [39]:
first_model.fit(scaled_X_train,y_train.values.ravel())
base_pred =first_model.predict(scaled_X_test)

In [40]:
confusion_matrix(y_test,base_pred)

array([[21, 25,  6],
       [20, 22,  8],
       [19, 46, 23]], dtype=int64)

In [41]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.35      0.40      0.38        52
           D       0.24      0.44      0.31        50
           H       0.62      0.26      0.37        88

    accuracy                           0.35       190
   macro avg       0.40      0.37      0.35       190
weighted avg       0.45      0.35      0.35       190



In [42]:
leaf_size = list(range(1,30))
n_neighbors = list(range(1,40))
p=[1,2,3]
weights = ['uniform','distance']

grid_param = {'leaf_size':leaf_size,
              'n_neighbors':n_neighbors,
              'p': p,
              'weights': weights}

grid_model = GridSearchCV(first_model,param_grid=grid_param,scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [45]:
grid_model.best_params_

{'leaf_size': 1, 'n_neighbors': 24, 'p': 2, 'weights': 'uniform'}

In [43]:
confusion_matrix(y_test,grid_pred)

array([[15, 13, 24],
       [12, 14, 24],
       [12, 16, 60]], dtype=int64)

In [44]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.38      0.29      0.33        52
           D       0.33      0.28      0.30        50
           H       0.56      0.68      0.61        88

    accuracy                           0.47       190
   macro avg       0.42      0.42      0.41       190
weighted avg       0.45      0.47      0.45       190



Best accuracy we could get is 0,47 after implementing GridSearchCV

Hyperparameters:
- 'leaf_size': 1
- 'n_neighbors': 24
- 'p': 2
- 'weights': 'uniform' 