In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\bundesliga.csv")

In [3]:
team_id = {'Bayern Munich':1, 'Dortmund':2, 'Ein Frankfurt':3, 'FC Koln':4,
           'Hannover':5, 'Hertha':6, 'Hoffenheim':7, "M'gladbach":8, 'Paderborn':9,
           'Augsburg':10, 'Hamburg':11, 'Leverkusen':12, 'Schalke 04':13, 'Stuttgart':14,
           'Werder Bremen':15, 'Wolfsburg':16, 'Freiburg':17, 'Mainz':18, 'Darmstadt':19,
           'Ingolstadt':20, 'RB Leipzig':21, 'Fortuna Dusseldorf':22, 'Nurnberg':23,
           'Union Berlin':24}

In [4]:
for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [5]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'season','HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'l5_ravg_HTgc', 'l5_ravg_ATgc', 'HomeTeamPoints','AwayTeamPoints',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [6]:
X = df.drop(['FTR'],axis=1)
y = df['FTR']

In [7]:
X.columns

Index(['HomeTeam', 'AwayTeam', 'round', 'AVGH', 'AVGD', 'AVGA', 'HT_wins',
       'AT_wins', 'HT_draws', 'AT_draws', 'HT_losses', 'AT_losses',
       'l5_ravg_HTST', 'l5_ravg_ATST', 'l5_ravg_HTCR', 'l5_ravg_ATCR',
       'HToveral', 'AToveral', 'l5_ravg_HTxG', 'l5_ravg_ATxG',
       'l5_ravg_HTxpts', 'l5_ravg_ATxpts', 'l5_ravg_HTdeep', 'l5_ravg_ATdeep',
       'l5_ravg_HTppda', 'l5_ravg_ATppda', 'l5_htdiff', 'l5_atdiff', 'avgHTP',
       'avgATP', 'l5_ravg_HTp', 'l3_ravg_ATp', 'diff_points', 'diff_ATT',
       'diff_MID', 'diff_DEF', 'diff_OVA'],
      dtype='object')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [9]:
scaler = MinMaxScaler()

In [10]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [11]:
model = SVC()

In [13]:
model.fit(scaled_X_train,y_train)

In [14]:
pred = model.predict(scaled_X_test)

In [16]:
c = confusion_matrix(y_test,pred)

In [17]:
c

array([[ 72,   0,  71],
       [ 38,   0,  83],
       [ 46,   0, 207]], dtype=int64)

In [18]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           A       0.46      0.50      0.48       143
           D       0.00      0.00      0.00       121
           H       0.57      0.82      0.67       253

    accuracy                           0.54       517
   macro avg       0.34      0.44      0.39       517
weighted avg       0.41      0.54      0.46       517



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y)
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True False  True False False  True False False False False
 False False  True False False False False False False False False False
 False False False False  True False False False False False False False]
Feature ranking [31 29  1  7  1 20 17  1  2  4  3 13 28  6  1 32 24 12  8 10  9 15 21 30
 27 14 19 11  1 18 25  5 26 16 23 22]


In [20]:
featured_columns = pd.DataFrame(selector.support_,
                            index = X.columns,
                            columns=['is_in'])

In [21]:
featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()

In [22]:
featured_columns

['AVGH', 'AVGA', 'HT_draws', 'l5_ravg_ATCR', 'avgATP']

In [23]:
X = df[featured_columns]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [25]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [26]:
model.fit(scaled_X_train,y_train)

In [27]:
pred = model.predict(scaled_X_test)

In [28]:
confusion_matrix(y_test,pred)

array([[ 58,   0,  85],
       [ 15,   0, 106],
       [ 22,   0, 231]], dtype=int64)

In [29]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           A       0.61      0.41      0.49       143
           D       0.00      0.00      0.00       121
           H       0.55      0.91      0.68       253

    accuracy                           0.56       517
   macro avg       0.39      0.44      0.39       517
weighted avg       0.44      0.56      0.47       517



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5]
gamma = [0.01,0.02,0.03,0.05,0.1,0.2]

In [31]:
param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

In [33]:
grid_model = GridSearchCV(model,param_grid=param_grid)

In [34]:
grid_model.fit(scaled_X_train,y_train)

In [35]:
grid_model.best_params_

{'C': 5, 'gamma': 0.01, 'kernel': 'linear'}

In [36]:
y_pred = grid_model.predict(scaled_X_test)

In [37]:
c = confusion_matrix(y_test,y_pred)

In [38]:
c

array([[ 39,   0, 104],
       [ 10,   0, 111],
       [ 13,   0, 240]], dtype=int64)

In [39]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           A       0.63      0.27      0.38       143
           D       0.00      0.00      0.00       121
           H       0.53      0.95      0.68       253

    accuracy                           0.54       517
   macro avg       0.39      0.41      0.35       517
weighted avg       0.43      0.54      0.44       517



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**We didn't manage to improve the model using GridSearchCV.**

**Premier League**

In [40]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\premier_league.csv")

In [41]:
#We assign a unique value at every team of the league
team_id = {'Arsenal':1, 'Leicester':2, 'Man United':3, 'QPR':4, 'Stoke':5, 'West Brom':6,
           'West Ham':7, 'Liverpool':8, 'Newcastle':9, 'Burnley':10, 'Aston Villa':11,
           'Chelsea':12, 'Crystal Palace':13, 'Everton':14, 'Southampton':15, 'Swansea':16,
           'Hull':17, 'Sunderland':18, 'Tottenham':19, 'Man City':20, 'Bournemouth':21,
           'Norwich':22, 'Watford':23, 'Middlesbrough':24, 'Brighton':25, 'Huddersfield':26,
           'Fulham':27, 'Wolves':28, 'Cardiff':29, 'Sheffield United':30}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [42]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'season','HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'l5_ravg_HTgc', 'l5_ravg_ATgc', 'HomeTeamPoints','AwayTeamPoints',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [43]:
X = df.drop('FTR',axis=1)
y = df['FTR']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [45]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [46]:
model = SVC()

In [47]:
model.fit(scaled_X_train,y_train)

In [48]:
pred = model.predict(scaled_X_test)

In [49]:
confusion_matrix(y_test,pred)

array([[ 85,   0,  96],
       [ 32,   0, 128],
       [ 37,   1, 275]], dtype=int64)

In [50]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           A       0.55      0.47      0.51       181
           D       0.00      0.00      0.00       160
           H       0.55      0.88      0.68       313

    accuracy                           0.55       654
   macro avg       0.37      0.45      0.39       654
weighted avg       0.42      0.55      0.46       654



In [51]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y)
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True False False  True False False False  True
 False False  True False False  True  True  True  True False False False
 False False  True  True  True False False False False  True  True False]
Feature ranking [16 22  1  1  1  6 20  1  3  2  4  1 10 17  1 15 21  1  1  1  1 11  9 12
 13  7  1  1  1  8 14  5 18  1  1 19]


In [52]:
featured_columns = pd.DataFrame(selector.support_,
                            index = X.columns,
                            columns=['is_in'])

In [53]:
featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()

In [54]:
featured_columns

['AVGH',
 'AVGD',
 'AVGA',
 'HT_draws',
 'l5_ravg_HTST',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_atdiff',
 'avgHTP',
 'avgATP',
 'diff_MID',
 'diff_DEF']

In [55]:
X = df[featured_columns]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [57]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [58]:
model.fit(scaled_X_train,y_train)

In [59]:
rfe_pred = model.predict(scaled_X_test)

In [60]:
confusion_matrix(y_test,rfe_pred)

array([[ 90,   0,  91],
       [ 37,   0, 123],
       [ 36,   2, 275]], dtype=int64)

In [61]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.55      0.50      0.52       181
           D       0.00      0.00      0.00       160
           H       0.56      0.88      0.69       313

    accuracy                           0.56       654
   macro avg       0.37      0.46      0.40       654
weighted avg       0.42      0.56      0.47       654



In [62]:
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5]
gamma = [0.01,0.02,0.03,0.05,0.1,0.2,'scale','auto']

In [63]:
param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

In [64]:
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

In [65]:
grid_model.fit(scaled_X_train,y_train)

In [66]:
grid_model.best_params_

{'C': 5, 'gamma': 0.2, 'kernel': 'rbf'}

In [67]:
grid_pred = grid_model.predict(scaled_X_test)

In [68]:
confusion_matrix(y_test,grid_pred)

array([[ 86,   0,  95],
       [ 32,   0, 128],
       [ 31,   0, 282]], dtype=int64)

We will try one more time

In [86]:
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

In [87]:
param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

In [88]:
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

In [89]:
grid_model.fit(scaled_X_train,y_train)

In [90]:
grid_model.best_params_

{'C': 4, 'gamma': 0.15, 'kernel': 'sigmoid'}

In [91]:
grid_pred = grid_model.predict(scaled_X_test)

In [92]:
confusion_matrix(y_test,grid_pred)

array([[ 91,   1,  89],
       [ 31,   0, 129],
       [ 38,   0, 275]], dtype=int64)

In [93]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.57      0.50      0.53       181
           D       0.00      0.00      0.00       160
           H       0.56      0.88      0.68       313

    accuracy                           0.56       654
   macro avg       0.38      0.46      0.41       654
weighted avg       0.42      0.56      0.47       654



**We can't improve the model even further. 0.56 accuracy is the best I could get.**

Features:
- AVGH
- AVGD
- AVGA
- HT_draws
- l5_ravg_HTST
- l5_ravg_ATCR
- l5_ravg_HTxG
- l5_ravg_ATxG
- l5_ravg_HTxpts
- l5_ravg_ATxpts
- l5_atdiff
- avgHTP
- avgATP
- diff_MID
- diff_DEF

**La Liga**

In [47]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\la_liga.csv")

In [48]:
#We assign a unique value at every team of the league
team_id = {'Almeria':1, 'Granada':2, 'Malaga':3, 'Sevilla':4, 'Barcelona':5, 'Celta':6,
           'Eibar':7, 'Levante':8, 'Real Madrid':9, 'Vallecano':10, 'Getafe':11,
           'Valencia':12, 'Ath Bilbao':13, 'Ath Madrid':14, 'Cordoba':15, 'Espanol':16,
           'Elche':17, 'La Coruna':18, 'Sociedad':19, 'Villarreal':20, 'Betis':21,
           'Sp Gijon':22, 'Las Palmas':23, 'Leganes':24, 'Osasuna':25, 'Alaves':26, 'Girona':27,
           'Valladolid':28, 'Huesca':29, 'Mallorca':30}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [49]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'season','HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'l5_ravg_HTgc', 'l5_ravg_ATgc', 'HomeTeamPoints','AwayTeamPoints',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [50]:
X = df.drop(['FTR'],axis=1)
y = df['FTR']

In [51]:
X.shape

(1979, 36)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [38]:
scaler = MinMaxScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [8]:
first_model = SVC()

first_model.fit(scaled_X_train,y_train)
first_model_pred = first_model.predict(scaled_X_test)

In [9]:
confusion_matrix(y_test,first_model_pred)

array([[ 74,   3, 107],
       [ 48,   4, 107],
       [ 50,   0, 261]], dtype=int64)

In [10]:
print(classification_report(y_test,first_model_pred))

              precision    recall  f1-score   support

           A       0.43      0.40      0.42       184
           D       0.57      0.03      0.05       159
           H       0.55      0.84      0.66       311

    accuracy                           0.52       654
   macro avg       0.52      0.42      0.38       654
weighted avg       0.52      0.52      0.44       654



In [11]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y)
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True  True  True  True  True  True  True False
  True  True  True False False  True  True  True  True False False False
 False  True  True  True  True  True False  True  True  True False  True]
Feature ranking [12  8  1  1  1  1  1  1  1  1  1  9  1  1  1  2 11  1  1  1  1  3  6  4
  5  1  1  1  1  1  7  1  1  1 10  1]


In [12]:
featured_columns = pd.DataFrame(selector.support_,
                            index = X.columns,
                            columns=['is_in'])

In [13]:
featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()

In [14]:
featured_columns

['AVGH',
 'AVGD',
 'AVGA',
 'HT_wins',
 'AT_wins',
 'HT_draws',
 'AT_draws',
 'HT_losses',
 'AT_losses',
 'l5_ravg_ATST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_htdiff',
 'l5_atdiff',
 'avgHTP',
 'avgATP',
 'l5_ravg_HTp',
 'diff_points',
 'diff_ATT',
 'diff_MID',
 'diff_OVA']

In [15]:
X = df[featured_columns]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [17]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [18]:
first_model.fit(scaled_X_train,y_train)

In [20]:
rfe_pred = first_model.predict(scaled_X_test)

In [21]:
confusion_matrix(y_test,rfe_pred)

array([[ 65,   2, 117],
       [ 46,   1, 112],
       [ 44,   0, 267]], dtype=int64)

In [22]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.42      0.35      0.38       184
           D       0.33      0.01      0.01       159
           H       0.54      0.86      0.66       311

    accuracy                           0.51       654
   macro avg       0.43      0.41      0.35       654
weighted avg       0.46      0.51      0.43       654



In [39]:
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

In [40]:
param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

In [41]:
grid_model = GridSearchCV(first_model, param_grid=param_grid, scoring='accuracy')

In [42]:
grid_model.fit(scaled_X_train,y_train)

In [43]:
grid_model.best_params_

{'C': 4, 'gamma': 0.15, 'kernel': 'rbf'}

In [44]:
grid_pred = grid_model.predict(scaled_X_test)

In [45]:
confusion_matrix(y_test,grid_pred)

array([[ 68,   2, 114],
       [ 42,   2, 115],
       [ 48,   1, 262]], dtype=int64)

In [46]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.43      0.37      0.40       184
           D       0.40      0.01      0.02       159
           H       0.53      0.84      0.65       311

    accuracy                           0.51       654
   macro avg       0.45      0.41      0.36       654
weighted avg       0.47      0.51      0.43       654



# Serie A

In [2]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\serie_a.csv")

In [3]:
#We assign a unique value at every team of the league
team_id = {'Chievo':1, 'Roma':2, 'Atalanta':3, 'Cesena':4, 'Genoa':5, 'Milan':6,
           'Palermo':7, 'Sassuolo':8, 'Torino':9, 'Udinese':10, 'Empoli':11, 'Juventus':12,
           'Cagliari':13, 'Fiorentina':14, 'Inter':15, 'Lazio':16, 'Napoli':17, 'Parma':18,
           'Sampdoria':19, 'Verona':20, 'Frosinone':21, 'Bologna':22, 'Carpi':23, 'Pescara':24,
           'Crotone':25, 'Benevento':26, 'Spal':27, 'Lecce':28, 'Brescia':29}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [4]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD','season',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints', 'AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs',
              'HTL', 'ATL', 'ATD', 'ATW','l5_ravg_HTgc', 'l5_ravg_ATgc','round'],axis=1)

df = df.dropna()

In [5]:
X = df.drop('FTR',axis=1)
y = df['FTR']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [7]:
scaler = MinMaxScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [8]:
model = SVC()

model.fit(scaled_X_train,y_train)
model_pred = model.predict(scaled_X_test)

In [9]:
confusion_matrix(y_test,model_pred)

array([[118,   5,  74],
       [ 68,   0, 108],
       [ 43,   2, 235]], dtype=int64)

In [10]:
print(classification_report(y_test,model_pred))

              precision    recall  f1-score   support

           A       0.52      0.60      0.55       197
           D       0.00      0.00      0.00       176
           H       0.56      0.84      0.67       280

    accuracy                           0.54       653
   macro avg       0.36      0.48      0.41       653
weighted avg       0.40      0.54      0.46       653



In [11]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y)
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True  True  True  True  True  True  True  True
  True  True  True False  True  True  True  True  True False  True False
 False  True  True  True  True  True False  True  True  True  True  True]
Feature ranking [8 7 1 1 1 1 1 1 1 1 1 1 1 1 1 6 1 1 1 1 1 5 1 3 2 1 1 1 1 1 4 1 1 1 1 1]


In [12]:
featured_columns = pd.DataFrame(selector.support_,
                            index = X.columns,
                            columns=['is_in'])

In [13]:
featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()

In [14]:
featured_columns

['AVGH',
 'AVGD',
 'AVGA',
 'HT_wins',
 'AT_wins',
 'HT_draws',
 'AT_draws',
 'HT_losses',
 'AT_losses',
 'l5_ravg_HTST',
 'l5_ravg_ATST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'AToveral',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_ravg_ATdeep',
 'l5_htdiff',
 'l5_atdiff',
 'avgHTP',
 'avgATP',
 'l5_ravg_HTp',
 'diff_points',
 'diff_ATT',
 'diff_MID',
 'diff_DEF',
 'diff_OVA']

In [15]:
X = df[featured_columns]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [17]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [19]:
model.fit(scaled_X_train,y_train)

In [20]:
rfe_pred = model.predict(scaled_X_test)

In [21]:
confusion_matrix(y_test,rfe_pred)

array([[121,   1,  75],
       [ 60,   2, 114],
       [ 39,   1, 240]], dtype=int64)

In [22]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.55      0.61      0.58       197
           D       0.50      0.01      0.02       176
           H       0.56      0.86      0.68       280

    accuracy                           0.56       653
   macro avg       0.54      0.49      0.43       653
weighted avg       0.54      0.56      0.47       653



In [23]:
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

In [24]:
param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

In [26]:
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

In [27]:
grid_model.fit(scaled_X_train,y_train)

In [28]:
grid_pred = grid_model.predict(scaled_X_test)

In [29]:
confusion_matrix(y_test,grid_pred)

array([[119,   1,  77],
       [ 62,   2, 112],
       [ 46,   5, 229]], dtype=int64)

In [30]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.52      0.60      0.56       197
           D       0.25      0.01      0.02       176
           H       0.55      0.82      0.66       280

    accuracy                           0.54       653
   macro avg       0.44      0.48      0.41       653
weighted avg       0.46      0.54      0.46       653



**Best accuracy we could get is 0.56 with the default hyperparameters after implementing RFECV.**

Features:
- AVGH
- AVGD
- AVGA
- HT_wins
- AT_wins
- HT_draws
- AT_draws
- HT_losses
- AT_losses
- l5_ravg_HTST
- l5_ravg_ATST
- l5_ravg_HTCR
- l5_ravg_ATCR
- AToveral
- l5_ravg_HTxG
- l5_ravg_ATxG
- l5_ravg_HTxpts
- l5_ravg_ATxpts
- l5_ravg_ATdeep
- l5_htdiff
- l5_atdiff
- avgHTP
- avgATP
- l5_ravg_HTp
- diff_points
- diff_ATT
- diff_MID
- diff_DEF
- diff_OVA

# Scenario 2

**Premier League**

In [274]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\premier_league.csv")

In [275]:
#We assign a unique value at every team of the league
team_id = {'Arsenal':1, 'Leicester':2, 'Man United':3, 'QPR':4, 'Stoke':5, 'West Brom':6,
           'West Ham':7, 'Liverpool':8, 'Newcastle':9, 'Burnley':10, 'Aston Villa':11,
           'Chelsea':12, 'Crystal Palace':13, 'Everton':14, 'Southampton':15, 'Swansea':16,
           'Hull':17, 'Sunderland':18, 'Tottenham':19, 'Man City':20, 'Bournemouth':21,
           'Norwich':22, 'Watford':23, 'Middlesbrough':24, 'Brighton':25, 'Huddersfield':26,
           'Fulham':27, 'Wolves':28, 'Cardiff':29, 'Sheffield United':30}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [276]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints','AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs','l5_ravg_HTgc', 'l5_ravg_ATgc',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [277]:
s1415 = df[df['season']=='2014/2015'].drop('season',axis=1)
s1516 = df[df['season']=='2015/2016'].drop('season',axis=1)
s1617 = df[df['season']=='2016/2017'].drop('season',axis=1)
s1718 = df[df['season']=='2017/2018'].drop('season',axis=1)
s1819 = df[df['season']=='2018/2019'].drop('season',axis=1)
s1920 = df[df['season']=='2019/2020'].drop('season',axis=1)

**Season 14/15**

In [278]:
X = s1415.drop('FTR',axis=1)
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [279]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred =model.predict(scaled_X_test)

In [280]:
confusion_matrix(y_test,base_pred)

array([[43,  0, 16],
       [17,  0, 24],
       [35,  0, 55]], dtype=int64)

In [281]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.45      0.73      0.56        59
           D       0.00      0.00      0.00        41
           H       0.58      0.61      0.59        90

    accuracy                           0.52       190
   macro avg       0.34      0.45      0.38       190
weighted avg       0.41      0.52      0.46       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [282]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True False False False False False False False False
 False False False  True False False False  True False False False False
 False False False  True  True False False False False False False False
 False]
Feature ranking [27 32  1  1  2  3 25 22  5 10  9  8  7 14 19  1 30 26  6  1 21 12 24 13
 31 29 20  1  1  4 23 28 11 16 17 15 18]


In [283]:
featured_columns = pd.DataFrame(selector.support_,
                            index = X.columns,
                            columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round', 'AVGH', 'l5_ravg_ATCR', 'l5_ravg_ATxG', 'l5_atdiff', 'avgHTP']

In [284]:
X = s1415[featured_columns]
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [285]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred =model.predict(scaled_X_test)

In [286]:
confusion_matrix(y_test,rfe_pred)

array([[23,  2, 34],
       [ 5,  2, 34],
       [10,  3, 77]], dtype=int64)

In [287]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.61      0.39      0.47        59
           D       0.29      0.05      0.08        41
           H       0.53      0.86      0.66        90

    accuracy                           0.54       190
   macro avg       0.47      0.43      0.40       190
weighted avg       0.50      0.54      0.48       190



In [288]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [289]:
grid_model.best_params_

{'C': 5, 'gamma': 0.1, 'kernel': 'linear'}

In [290]:
confusion_matrix(y_test,grid_pred)


array([[23,  1, 35],
       [ 5,  0, 36],
       [ 9,  0, 81]], dtype=int64)

In [291]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.62      0.39      0.48        59
           D       0.00      0.00      0.00        41
           H       0.53      0.90      0.67        90

    accuracy                           0.55       190
   macro avg       0.38      0.43      0.38       190
weighted avg       0.45      0.55      0.47       190



The best accuracy we could get is 0,55 after implementing RFE and GridSearchCV.

Hyperparameters:
- 'C': 5
- 'gamma': 0.1
- 'kernel': 'linear'

Features:
- 'AVGH'
- 'l5_ravg_ATCR'
- 'l5_ravg_ATxG'
- 'l5_atdiff'
- 'avgHTP'

**Season 15/16**

In [292]:
X = s1516.drop('FTR',axis=1)
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [293]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred =model.predict(scaled_X_test)

In [294]:
confusion_matrix(y_test,base_pred)

array([[ 4,  0, 49],
       [ 3,  0, 50],
       [ 2,  0, 82]], dtype=int64)

In [295]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.44      0.08      0.13        53
           D       0.00      0.00      0.00        53
           H       0.45      0.98      0.62        84

    accuracy                           0.45       190
   macro avg       0.30      0.35      0.25       190
weighted avg       0.32      0.45      0.31       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [296]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True False False False False False False False
  True  True  True  True False False  True  True  True  True False False
 False False False  True  True  True False False False False False False
 False]
Feature ranking [18 22  1  1  1 12  7  9  4  5  8 10  1  1  1  1 24 19  1  1  1  1 20 13
 17 15 14  1  1  1 16 11 21 23  2  3  6]


In [297]:
featured_columns = pd.DataFrame(selector.support_,
                            index = X.columns,
                            columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'l5_ravg_HTST',
 'l5_ravg_ATST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_atdiff',
 'avgHTP',
 'avgATP']

In [298]:
X = s1516[featured_columns]
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [299]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred =model.predict(scaled_X_test)

In [300]:
confusion_matrix(y_test,rfe_pred)

array([[35,  0, 18],
       [22,  1, 30],
       [23,  1, 60]], dtype=int64)

In [301]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.44      0.66      0.53        53
           D       0.50      0.02      0.04        53
           H       0.56      0.71      0.63        84

    accuracy                           0.51       190
   macro avg       0.50      0.46      0.40       190
weighted avg       0.51      0.51      0.43       190



In [302]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [304]:
grid_model.best_params_

{'C': 4, 'gamma': 0.16, 'kernel': 'sigmoid'}

In [303]:
confusion_matrix(y_test,grid_pred)

array([[30,  0, 23],
       [14,  0, 39],
       [18,  0, 66]], dtype=int64)

In [305]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.48      0.57      0.52        53
           D       0.00      0.00      0.00        53
           H       0.52      0.79      0.62        84

    accuracy                           0.51       190
   macro avg       0.33      0.45      0.38       190
weighted avg       0.36      0.51      0.42       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best accuracy we could get is 0,51 from the base model after RFE.

Features:
- 'AVGH'
- 'AVGD'
- 'l5_ravg_HTST'
- 'l5_ravg_ATST'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_HTxpts'
- 'l5_ravg_ATxpts'
- 'l5_atdiff'
- 'avgHTP'
- 'avgATP'

**Season 16/17**

In [306]:
X = s1617.drop('FTR',axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [307]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [308]:
confusion_matrix(y_test,base_pred)

array([[12,  0, 41],
       [ 6,  0, 36],
       [ 6,  0, 89]], dtype=int64)

In [309]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.50      0.23      0.31        53
           D       0.00      0.00      0.00        42
           H       0.54      0.94      0.68        95

    accuracy                           0.53       190
   macro avg       0.35      0.39      0.33       190
weighted avg       0.41      0.53      0.43       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [310]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False False  True  True False False False False False False False
 False False  True  True False False  True False  True False False False
 False False False  True  True  True False False False False False  True
 False]
Feature ranking [26 24  3  1  1  5 16 15  4  7  8  6 11 14  1  1 20 23  1 10  1 12 27 21
 13 19  2  1  1  1 25 17  9 22 18  1 28]


In [311]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['AVGH',
 'AVGD',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_HTxpts',
 'l5_atdiff',
 'avgHTP',
 'avgATP',
 'diff_DEF']

In [312]:
X = pd.concat([s1617[featured_columns],s1617['round']],axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [313]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [314]:
confusion_matrix(y_test,rfe_pred)

array([[26,  0, 27],
       [ 6,  0, 36],
       [ 5,  0, 90]], dtype=int64)

In [315]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.70      0.49      0.58        53
           D       0.00      0.00      0.00        42
           H       0.59      0.95      0.73        95

    accuracy                           0.61       190
   macro avg       0.43      0.48      0.43       190
weighted avg       0.49      0.61      0.52       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [316]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [317]:
grid_model.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [318]:
confusion_matrix(y_test,grid_pred)


array([[18,  0, 35],
       [ 5,  0, 37],
       [ 3,  0, 92]], dtype=int64)

In [319]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.69      0.34      0.46        53
           D       0.00      0.00      0.00        42
           H       0.56      0.97      0.71        95

    accuracy                           0.58       190
   macro avg       0.42      0.44      0.39       190
weighted avg       0.47      0.58      0.48       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,58 after implementing RFE.

Features:
- 'AVGH'
- 'AVGD'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_HTxpts'
- 'l5_atdiff'
- 'avgHTP'
- 'avgATP'
- 'diff_DEF'

**Season 17/18**

In [320]:
X = s1718.drop('FTR',axis=1)
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [321]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [322]:
confusion_matrix(y_test,base_pred)

array([[19,  1, 30],
       [ 7,  2, 43],
       [12,  1, 75]], dtype=int64)

In [323]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.50      0.38      0.43        50
           D       0.50      0.04      0.07        52
           H       0.51      0.85      0.64        88

    accuracy                           0.51       190
   macro avg       0.50      0.42      0.38       190
weighted avg       0.50      0.51      0.43       190



In [324]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True False False False  True False False False False False
 False False  True False False False  True  True  True  True False False
 False False  True  True  True False False False False False False False
  True]
Feature ranking [23 27  1  6  5 18  1 20  4 26  7  3 17  2  1 14 25 19  1  1  1  1 10 11
 16 15  1  1  1 12 24  8  9 21 13 22  1]


In [325]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'HT_wins',
 'l5_ravg_HTCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_htdiff',
 'l5_atdiff',
 'avgHTP',
 'diff_OVA']

In [326]:
X = s1718[featured_columns]
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [327]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [328]:
confusion_matrix(y_test,rfe_pred)

array([[25,  6, 19],
       [11,  0, 41],
       [15,  2, 71]], dtype=int64)

In [329]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.49      0.50      0.50        50
           D       0.00      0.00      0.00        52
           H       0.54      0.81      0.65        88

    accuracy                           0.51       190
   macro avg       0.34      0.44      0.38       190
weighted avg       0.38      0.51      0.43       190



In [330]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [332]:
grid_model.best_params_

{'C': 5, 'gamma': 0.1, 'kernel': 'linear'}

In [331]:
confusion_matrix(y_test,grid_pred)

array([[27,  1, 22],
       [15,  0, 37],
       [22,  0, 66]], dtype=int64)

In [333]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.42      0.54      0.47        50
           D       0.00      0.00      0.00        52
           H       0.53      0.75      0.62        88

    accuracy                           0.49       190
   macro avg       0.32      0.43      0.36       190
weighted avg       0.36      0.49      0.41       190



The best accuracy we could get is 0,51 from the base model.

**Season 18/19**

In [334]:
X = s1819.drop('FTR',axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [335]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [336]:
confusion_matrix(y_test,base_pred)

array([[27,  0, 35],
       [ 7,  0, 26],
       [23,  0, 72]], dtype=int64)

In [337]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.47      0.44      0.45        62
           D       0.00      0.00      0.00        33
           H       0.54      0.76      0.63        95

    accuracy                           0.52       190
   macro avg       0.34      0.40      0.36       190
weighted avg       0.43      0.52      0.46       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [338]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False False False  True False False False False False False False
 False False  True  True False False  True  True False False False  True
 False False  True  True  True  True False False False False False  True
  True]
Feature ranking [25 22  2 18  1 23 16 15  7  8 10  9  3  4  1  1 24 19  1  1  6  5 14  1
 26 12  1  1  1  1 17 20 11 21 13  1  1]


In [339]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['AVGD',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_ATdeep',
 'l5_htdiff',
 'l5_atdiff',
 'avgHTP',
 'avgATP',
 'diff_DEF',
 'diff_OVA']

In [340]:
X = pd.concat([s1819[featured_columns],s1819['round']],axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [341]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [342]:
confusion_matrix(y_test,rfe_pred)

array([[30,  0, 32],
       [ 5,  0, 28],
       [16,  0, 79]], dtype=int64)

In [343]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.59      0.48      0.53        62
           D       0.00      0.00      0.00        33
           H       0.57      0.83      0.68        95

    accuracy                           0.57       190
   macro avg       0.39      0.44      0.40       190
weighted avg       0.48      0.57      0.51       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [344]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [345]:
grid_model.best_params_

{'C': 2, 'gamma': 'scale', 'kernel': 'rbf'}

In [346]:
confusion_matrix(y_test,grid_pred)

array([[27,  5, 30],
       [ 3,  1, 29],
       [11,  4, 80]], dtype=int64)

In [347]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.66      0.44      0.52        62
           D       0.10      0.03      0.05        33
           H       0.58      0.84      0.68        95

    accuracy                           0.57       190
   macro avg       0.44      0.44      0.42       190
weighted avg       0.52      0.57      0.52       190



The best accuracy we could get is 0,57 after implementing RFE and Grid Search.

Hyperparameters:
- 'C': 2
- 'gamma': 'scale'
- 'kernel': 'rbf'

Features:
- 'AVGD'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_ATdeep'
- 'l5_htdiff'
- 'l5_atdiff'
- 'avgHTP'
- 'avgATP'
- 'diff_DEF'
- 'diff_OVA'

**Season 19/20**

In [348]:
X = s1920.drop('FTR',axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [349]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [350]:
confusion_matrix(y_test,base_pred)

array([[20,  0, 35],
       [13,  0, 33],
       [15,  0, 74]], dtype=int64)

In [351]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.42      0.36      0.39        55
           D       0.00      0.00      0.00        46
           H       0.52      0.83      0.64        89

    accuracy                           0.49       190
   macro avg       0.31      0.40      0.34       190
weighted avg       0.36      0.49      0.41       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [352]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True False False False False False  True False
  True False  True  True False False  True  True  True False False False
 False False  True False  True False False False False False False  True
 False]
Feature ranking [24 25  1  1  1 14  4  3 11  9  1  5  1  7  1  1 20 22  1  1  1 15 19 13
 23 18  1  8  1  2 12 10  6 17 21  1 16]


In [353]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'HT_losses',
 'l5_ravg_HTST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_htdiff',
 'avgHTP',
 'diff_DEF']

In [354]:
X = s1920[featured_columns]
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [355]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [356]:
confusion_matrix(y_test,rfe_pred)

array([[40,  0, 15],
       [26,  0, 20],
       [39,  0, 50]], dtype=int64)

In [357]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.38      0.73      0.50        55
           D       0.00      0.00      0.00        46
           H       0.59      0.56      0.57        89

    accuracy                           0.47       190
   macro avg       0.32      0.43      0.36       190
weighted avg       0.39      0.47      0.41       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [360]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [361]:
grid_model.best_params_

{'C': 3, 'gamma': 0.3, 'kernel': 'sigmoid'}

In [362]:
confusion_matrix(y_test,grid_pred)

array([[44,  0, 11],
       [27,  0, 19],
       [38,  0, 51]], dtype=int64)

In [363]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.38      0.73      0.50        55
           D       0.00      0.00      0.00        46
           H       0.59      0.56      0.57        89

    accuracy                           0.47       190
   macro avg       0.32      0.43      0.36       190
weighted avg       0.39      0.47      0.41       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,49 at base model.

**Serie A**

In [364]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\serie_a.csv")

In [365]:
#We assign a unique value at every team of the league
team_id = {'Chievo':1, 'Roma':2, 'Atalanta':3, 'Cesena':4, 'Genoa':5, 'Milan':6,
           'Palermo':7, 'Sassuolo':8, 'Torino':9, 'Udinese':10, 'Empoli':11, 'Juventus':12,
           'Cagliari':13, 'Fiorentina':14, 'Inter':15, 'Lazio':16, 'Napoli':17, 'Parma':18,
           'Sampdoria':19, 'Verona':20, 'Frosinone':21, 'Bologna':22, 'Carpi':23, 'Pescara':24,
           'Crotone':25, 'Benevento':26, 'Spal':27, 'Lecce':28, 'Brescia':29}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [366]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints','AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs','l5_ravg_HTgc', 'l5_ravg_ATgc',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [367]:
s1415 = df[df['season']=='2014/2015'].drop('season',axis=1)
s1516 = df[df['season']=='2015/2016'].drop('season',axis=1)
s1617 = df[df['season']=='2016/2017'].drop('season',axis=1)
s1718 = df[df['season']=='2017/2018'].drop('season',axis=1)
s1819 = df[df['season']=='2018/2019'].drop('season',axis=1)
s1920 = df[df['season']=='2019/2020'].drop('season',axis=1)

**Season 1415**

In [368]:
X = s1415.drop('FTR',axis=1)
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [369]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [370]:
confusion_matrix(y_test,base_pred)

array([[11, 16, 26],
       [11, 16, 27],
       [ 6, 33, 44]], dtype=int64)

In [371]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.39      0.21      0.27        53
           D       0.25      0.30      0.27        54
           H       0.45      0.53      0.49        83

    accuracy                           0.37       190
   macro avg       0.36      0.34      0.34       190
weighted avg       0.38      0.37      0.37       190



In [372]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True False False False False False False False
 False False  True  True False False  True  True  True  True False False
 False False False False  True  True False False False False False False
  True]
Feature ranking [24 26  1  1  1 19  5  2  3 12  6  7 16 15  1  1 17 18  1  1  1  1 20 25
 21  8 14 10  1  1  9  4 13 23 22 11  1]


In [373]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'avgHTP',
 'avgATP',
 'diff_OVA']

In [374]:
X = s1415[featured_columns]
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [375]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [376]:
confusion_matrix(y_test,base_pred)

array([[11, 16, 26],
       [11, 16, 27],
       [ 6, 33, 44]], dtype=int64)

In [377]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.39      0.21      0.27        53
           D       0.25      0.30      0.27        54
           H       0.45      0.53      0.49        83

    accuracy                           0.37       190
   macro avg       0.36      0.34      0.34       190
weighted avg       0.38      0.37      0.37       190



In [378]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [379]:
grid_model.best_params_

{'C': 5, 'gamma': 0.3, 'kernel': 'sigmoid'}

In [380]:
confusion_matrix(y_test,grid_pred)

array([[ 6, 42,  5],
       [ 7, 32, 15],
       [ 3, 45, 35]], dtype=int64)

In [381]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.38      0.11      0.17        53
           D       0.27      0.59      0.37        54
           H       0.64      0.42      0.51        83

    accuracy                           0.38       190
   macro avg       0.43      0.38      0.35       190
weighted avg       0.46      0.38      0.38       190



The best accuracy we could get is 0,39 after RFE.

Features:
- 'AVGH'
- 'AVGD'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_HTxpts'
- 'l5_ravg_ATxpts'
- 'avgHTP'
- 'avgATP'
- 'diff_OVA'

**Season 15/16**

In [139]:
X = s1516.drop('FTR',axis=1)
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [141]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [142]:
confusion_matrix(y_test,base_pred)

array([[48,  0,  0],
       [48,  0,  5],
       [73,  0, 16]], dtype=int64)

In [143]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.28      1.00      0.44        48
           D       0.00      0.00      0.00        53
           H       0.76      0.18      0.29        89

    accuracy                           0.34       190
   macro avg       0.35      0.39      0.24       190
weighted avg       0.43      0.34      0.25       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [144]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True  True False False False  True False False
  True  True  True  True False False  True  True  True  True False False
  True False  True False  True  True False False False False False False
 False]
Feature ranking [18 21  1  1  1  1  3  2  7  1 10  9  1  1  1  1 15 16  1  1  1  1 17 12
  1  6  1 13  1  1  4 11 19 14  5 20  8]


In [145]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'AVGA',
 'AT_draws',
 'l5_ravg_HTST',
 'l5_ravg_ATST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_ravg_HTppda',
 'l5_htdiff',
 'avgHTP',
 'avgATP']

In [146]:
X = s1516[featured_columns]
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [148]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [149]:
confusion_matrix(y_test,rfe_pred)

array([[41,  0,  7],
       [30,  0, 23],
       [36,  0, 53]], dtype=int64)

In [150]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.38      0.85      0.53        48
           D       0.00      0.00      0.00        53
           H       0.64      0.60      0.62        89

    accuracy                           0.49       190
   macro avg       0.34      0.48      0.38       190
weighted avg       0.40      0.49      0.42       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [151]:
#Define the parameters
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [154]:
grid_model.best_params_

{'C': 8, 'gamma': 0.3, 'kernel': 'sigmoid'}

In [152]:
confusion_matrix(y_test,grid_pred)

array([[40,  0,  8],
       [27,  2, 24],
       [29,  8, 52]], dtype=int64)

In [153]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.42      0.83      0.56        48
           D       0.20      0.04      0.06        53
           H       0.62      0.58      0.60        89

    accuracy                           0.49       190
   macro avg       0.41      0.49      0.41       190
weighted avg       0.45      0.49      0.44       190



The best accuracy we could get is 0,49 after implementing RFE and GridSearch.

Hyperparameters:
- 'C': 8
- 'gamma': 0.3
- 'kernel': 'sigmoid'

Features:
- 'AVGH'
- 'AVGD'
- 'AVGA'
- 'AT_draws'
- 'l5_ravg_HTST'
- 'l5_ravg_ATST'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_HTxpts'
- 'l5_ravg_ATxpts'
- 'l5_ravg_HTppda'
- 'l5_htdiff'
- 'avgHTP'
- 'avgATP'

**Season 16/17**

In [382]:
X = s1617.drop('FTR',axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [383]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [384]:
confusion_matrix(y_test,base_pred)

array([[21,  0, 43],
       [ 8,  0, 34],
       [ 4,  0, 80]], dtype=int64)

In [385]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.64      0.33      0.43        64
           D       0.00      0.00      0.00        42
           H       0.51      0.95      0.66        84

    accuracy                           0.53       190
   macro avg       0.38      0.43      0.37       190
weighted avg       0.44      0.53      0.44       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [386]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False False  True  True False False False False False False False
 False False  True  True False False  True False  True False False False
 False False  True False  True  True False False False False False False
 False]
Feature ranking [29 27  5  1  1 24 12  9 14 19 17 23 21 16  1  1 25 18  1  6  1 11 22  2
 28 20  1  7  1  1 13 10 26  8  3 15  4]


In [387]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['AVGH',
 'AVGD',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_HTxpts',
 'l5_htdiff',
 'avgHTP',
 'avgATP']

In [388]:
X = pd.concat([s1617[featured_columns],s1617['round']],axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [389]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [390]:
confusion_matrix(y_test,rfe_pred)

array([[23,  0, 41],
       [ 5,  0, 37],
       [ 6,  0, 78]], dtype=int64)

In [391]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.68      0.36      0.47        64
           D       0.00      0.00      0.00        42
           H       0.50      0.93      0.65        84

    accuracy                           0.53       190
   macro avg       0.39      0.43      0.37       190
weighted avg       0.45      0.53      0.45       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [392]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [393]:
grid_model.best_params_

{'C': 7, 'gamma': 0.1, 'kernel': 'linear'}

In [394]:
confusion_matrix(y_test,grid_pred)

array([[30,  0, 34],
       [11,  0, 31],
       [ 9,  0, 75]], dtype=int64)

In [395]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.60      0.47      0.53        64
           D       0.00      0.00      0.00        42
           H       0.54      0.89      0.67        84

    accuracy                           0.55       190
   macro avg       0.38      0.45      0.40       190
weighted avg       0.44      0.55      0.47       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best accuracy we could get is 0,55.

Hyperparameters:
- 'C': 7
- 'gamma': 0.1
- 'kernel': 'linear'

Features:
- 'AVGH'
- 'AVGD'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_HTxpts'
- 'l5_htdiff'
- 'avgHTP'
- 'avgATP'

**Season 17/18**

In [16]:
X = s1718.drop('FTR',axis=1)
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [17]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [18]:
confusion_matrix(y_test,base_pred)

array([[41, 17,  5],
       [22, 10,  9],
       [39, 13, 34]], dtype=int64)

In [19]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.40      0.65      0.50        63
           D       0.25      0.24      0.25        41
           H       0.71      0.40      0.51        86

    accuracy                           0.45       190
   macro avg       0.45      0.43      0.42       190
weighted avg       0.51      0.45      0.45       190



In [20]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True  True False False False False False False
  True False  True  True False False  True  True  True  True False False
 False False False False  True  True False False False False False False
 False]
Feature ranking [23 25  1  1  1  1  8  6 13  9 11 12  1 18  1  1 22 24  1  1  1  1 19 16
 21 20  2 14  1  1 10 15 17  5  7  3  4]


In [21]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'AVGA',
 'l5_ravg_HTST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'avgHTP',
 'avgATP']

In [22]:
X = s1718[featured_columns]
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [23]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [24]:
confusion_matrix(y_test,rfe_pred)

array([[51,  2, 10],
       [25,  7,  9],
       [29,  2, 55]], dtype=int64)

In [25]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.49      0.81      0.61        63
           D       0.64      0.17      0.27        41
           H       0.74      0.64      0.69        86

    accuracy                           0.59       190
   macro avg       0.62      0.54      0.52       190
weighted avg       0.63      0.59      0.57       190



In [26]:
#Define the parameters
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [27]:
confusion_matrix(y_test,grid_pred)

array([[53,  2,  8],
       [27,  6,  8],
       [39,  2, 45]], dtype=int64)

In [28]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.45      0.84      0.58        63
           D       0.60      0.15      0.24        41
           H       0.74      0.52      0.61        86

    accuracy                           0.55       190
   macro avg       0.59      0.50      0.48       190
weighted avg       0.61      0.55      0.52       190



The best accuracy we could get is 0,59 after implementing RFE at the base model

Features:
- 'AVGH'
- 'AVGD'
- 'AVGA'
- 'l5_ravg_HTST'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_HTxpts'
- 'l5_ravg_ATxpts'
- 'avgHTP'
- 'avgATP'

**Season 18/19**

In [42]:
X = s1819.drop('FTR',axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [30]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [31]:
confusion_matrix(y_test,base_pred)

array([[16,  7, 32],
       [ 7,  8, 37],
       [ 7,  3, 73]], dtype=int64)

In [32]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.53      0.29      0.38        55
           D       0.44      0.15      0.23        52
           H       0.51      0.88      0.65        83

    accuracy                           0.51       190
   macro avg       0.50      0.44      0.42       190
weighted avg       0.50      0.51      0.45       190



In [33]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True False False False False False False False False
 False False  True  True False False False False  True False False False
 False False False False  True  True False False False False False False
  True]
Feature ranking [29 28  1  1  4 25 12 11 14 13 15 16 26  6  1  1 27 30  7  5  1  3 20 17
 23 21 22  8  1  1 19 18 24  9 10  2  1]


In [34]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxpts',
 'avgHTP',
 'avgATP',
 'diff_OVA']

In [35]:
X = s1819[featured_columns]
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [36]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [37]:
confusion_matrix(y_test,rfe_pred)

array([[14, 27, 14],
       [ 4, 25, 23],
       [ 6, 33, 44]], dtype=int64)

In [38]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.58      0.25      0.35        55
           D       0.29      0.48      0.36        52
           H       0.54      0.53      0.54        83

    accuracy                           0.44       190
   macro avg       0.47      0.42      0.42       190
weighted avg       0.49      0.44      0.44       190



After RFE the accuracy of the model fell. So we are going to implement Grid Search using all features.

In [43]:
#Define the parameters
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [44]:
confusion_matrix(y_test,grid_pred)

array([[28,  7, 20],
       [13,  3, 36],
       [17,  5, 61]], dtype=int64)

In [45]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.48      0.51      0.50        55
           D       0.20      0.06      0.09        52
           H       0.52      0.73      0.61        83

    accuracy                           0.48       190
   macro avg       0.40      0.43      0.40       190
weighted avg       0.42      0.48      0.43       190



The best accuyracy we could get is 0,51 at the base model.

**Season 19/20**

In [396]:
X = s1920.drop('FTR',axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [397]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [398]:
confusion_matrix(y_test,base_pred)

array([[31,  0, 36],
       [16,  0, 27],
       [16,  1, 63]], dtype=int64)

In [399]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.49      0.46      0.48        67
           D       0.00      0.00      0.00        43
           H       0.50      0.79      0.61        80

    accuracy                           0.49       190
   macro avg       0.33      0.42      0.36       190
weighted avg       0.38      0.49      0.43       190



In [400]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False False  True  True False False False False False False False
 False False  True  True False False  True  True  True False False False
 False False False False  True  True False False False False False False
 False]
Feature ranking [28 29  3  1  1 19 14 16  6  5  8  7 21 18  1  1 25 17  1  1  1  4 24 26
 23 27 11  2  1  1 15 22  9 20 10 13 12]


In [401]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['AVGH',
 'AVGD',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'avgHTP',
 'avgATP']

In [402]:
X = pd.concat([s1920[featured_columns],s1920['round']],axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [403]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [404]:
confusion_matrix(y_test,rfe_pred)

array([[44,  1, 22],
       [16,  3, 24],
       [15,  1, 64]], dtype=int64)

In [405]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.59      0.66      0.62        67
           D       0.60      0.07      0.12        43
           H       0.58      0.80      0.67        80

    accuracy                           0.58       190
   macro avg       0.59      0.51      0.47       190
weighted avg       0.59      0.58      0.53       190



In [406]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [407]:
grid_model.best_params_

{'C': 5, 'gamma': 0.1, 'kernel': 'linear'}

In [408]:
confusion_matrix(y_test,grid_pred)

array([[45,  0, 22],
       [19,  2, 22],
       [18,  1, 61]], dtype=int64)

In [409]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.55      0.67      0.60        67
           D       0.67      0.05      0.09        43
           H       0.58      0.76      0.66        80

    accuracy                           0.57       190
   macro avg       0.60      0.49      0.45       190
weighted avg       0.59      0.57      0.51       190



The best accuracy we could get is 0,58 at the base model after implementing RFE.

Features:
- 'AVGH'
- 'AVGD'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_HTxpts'
- 'avgHTP'
- 'avgATP'

**La Liga**

In [31]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\la_liga.csv")

In [32]:
#We assign a unique value at every team of the league
team_id = {'Almeria':1, 'Granada':2, 'Malaga':3, 'Sevilla':4, 'Barcelona':5, 'Celta':6,
           'Eibar':7, 'Levante':8, 'Real Madrid':9, 'Vallecano':10, 'Getafe':11,
           'Valencia':12, 'Ath Bilbao':13, 'Ath Madrid':14, 'Cordoba':15, 'Espanol':16,
           'Elche':17, 'La Coruna':18, 'Sociedad':19, 'Villarreal':20, 'Betis':21,
           'Sp Gijon':22, 'Las Palmas':23, 'Leganes':24, 'Osasuna':25, 'Alaves':26, 'Girona':27,
           'Valladolid':28, 'Huesca':29, 'Mallorca':30}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [33]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints','AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs','l5_ravg_HTgc', 'l5_ravg_ATgc',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [34]:
s1415 = df[df['season']=='2014/2015'].drop('season',axis=1)
s1516 = df[df['season']=='2015/2016'].drop('season',axis=1)
s1617 = df[df['season']=='2016/2017'].drop('season',axis=1)
s1718 = df[df['season']=='2017/2018'].drop('season',axis=1)
s1819 = df[df['season']=='2018/2019'].drop('season',axis=1)
s1920 = df[df['season']=='2019/2020'].drop('season',axis=1)

**Season 14/15**

In [64]:
X = s1415.drop('FTR',axis=1)
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [65]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [66]:
confusion_matrix(y_test,base_pred)

array([[22,  1, 33],
       [ 7,  2, 35],
       [15,  2, 73]], dtype=int64)

In [67]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.50      0.39      0.44        56
           D       0.40      0.05      0.08        44
           H       0.52      0.81      0.63        90

    accuracy                           0.51       190
   macro avg       0.47      0.42      0.38       190
weighted avg       0.49      0.51      0.45       190



In [72]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False False False False False False False False False False False
 False False  True False False False  True  True False False False False
 False False False False False  True False False False False False False
 False]
Feature ranking [29 21  6  2  5 28 23 13 27 14 22 25 17 10  1  7 33 34  1  1  8 16 31 26
 24 18  4 12  3  1 19 15 20 32 11 30  9]


In [73]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['l5_ravg_HTCR', 'l5_ravg_HTxG', 'l5_ravg_ATxG', 'avgATP']

In [75]:
X = pd.concat([s1415[featured_columns],s1415['round']],axis=1)
y = s1415[['FTR','round']]
          
X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [76]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [77]:
confusion_matrix(y_test,rfe_pred)

array([[26, 13, 17],
       [ 4, 15, 25],
       [ 9, 24, 57]], dtype=int64)

In [78]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.67      0.46      0.55        56
           D       0.29      0.34      0.31        44
           H       0.58      0.63      0.60        90

    accuracy                           0.52       190
   macro avg       0.51      0.48      0.49       190
weighted avg       0.54      0.52      0.52       190



In [79]:
#Define the parameters
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [80]:
grid_model.best_params_

{'C': 2, 'gamma': 0.2, 'kernel': 'sigmoid'}

In [81]:
confusion_matrix(y_test,grid_pred)

array([[21,  0, 35],
       [ 2,  0, 42],
       [ 3,  0, 87]], dtype=int64)

In [82]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.81      0.38      0.51        56
           D       0.00      0.00      0.00        44
           H       0.53      0.97      0.69        90

    accuracy                           0.57       190
   macro avg       0.45      0.45      0.40       190
weighted avg       0.49      0.57      0.48       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,57 after implemented Grid Search.

Hyperparameters:
- 'C': 2
- 'gamma': 0.2
- 'kernel': 'sigmoid'

Features:
- 'l5_ravg_HTCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'avgATP'

**Season 15/16**

In [83]:
X = s1516.drop('FTR',axis=1)
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [85]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [86]:
confusion_matrix(y_test,base_pred)

array([[ 2,  3, 48],
       [ 0,  2, 43],
       [ 0,  0, 92]], dtype=int64)

In [87]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       1.00      0.04      0.07        53
           D       0.40      0.04      0.08        45
           H       0.50      1.00      0.67        92

    accuracy                           0.51       190
   macro avg       0.63      0.36      0.27       190
weighted avg       0.62      0.51      0.36       190



In [88]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True]
Feature ranking [5 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [89]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'AVGA',
 'HT_wins',
 'AT_wins',
 'HT_draws',
 'AT_draws',
 'HT_losses',
 'AT_losses',
 'l5_ravg_HTST',
 'l5_ravg_ATST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'AToveral',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_ravg_HTdeep',
 'l5_ravg_HTppda',
 'l5_ravg_ATppda',
 'l5_htdiff',
 'l5_atdiff',
 'avgHTP',
 'avgATP',
 'l5_ravg_HTp',
 'l3_ravg_ATp',
 'diff_points',
 'diff_ATT',
 'diff_MID',
 'diff_DEF',
 'diff_OVA']

In [90]:
X = s1516[featured_columns]
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [91]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [92]:
confusion_matrix(y_test,rfe_pred)

array([[ 2,  3, 48],
       [ 0,  2, 43],
       [ 0,  0, 92]], dtype=int64)

In [93]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       1.00      0.04      0.07        53
           D       0.40      0.04      0.08        45
           H       0.50      1.00      0.67        92

    accuracy                           0.51       190
   macro avg       0.63      0.36      0.27       190
weighted avg       0.62      0.51      0.36       190



In [94]:
#Define the parameters
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [97]:
grid_model.best_params_

{'C': 6, 'gamma': 0.16, 'kernel': 'sigmoid'}

In [95]:
confusion_matrix(y_test,grid_pred)

array([[28,  0, 25],
       [10,  0, 35],
       [26,  0, 66]], dtype=int64)

In [96]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.44      0.53      0.48        53
           D       0.00      0.00      0.00        45
           H       0.52      0.72      0.61        92

    accuracy                           0.49       190
   macro avg       0.32      0.42      0.36       190
weighted avg       0.38      0.49      0.43       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,52 after implementing RFE. But the model almost predicted everything to be a home win. That tactic probably won't give any profit.

**Season 16/17**

In [98]:
X = s1617.drop('FTR',axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [100]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [101]:
confusion_matrix(y_test,base_pred)

array([[11, 11, 39],
       [ 4,  4, 27],
       [ 3, 10, 81]], dtype=int64)

In [102]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.61      0.18      0.28        61
           D       0.16      0.11      0.13        35
           H       0.55      0.86      0.67        94

    accuracy                           0.51       190
   macro avg       0.44      0.39      0.36       190
weighted avg       0.50      0.51      0.45       190



In [103]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True False False False False False False False False False
 False False False False False False False  True  True False False False
 False False False False  True  True False False False False False False
 False]
Feature ranking [28 31  1  4  3 24 13 22  9 14 12 11  7 20  6 26 30 32  2  1  1 10 29 19
  8 21 17  5  1  1 25 15 23 27 18 33 16]


In [104]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round', 'l5_ravg_ATxG', 'l5_ravg_HTxpts', 'avgHTP', 'avgATP']

In [105]:
X = s1617[featured_columns]
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [106]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [107]:
confusion_matrix(y_test,rfe_pred)

array([[25,  9, 27],
       [ 6,  3, 26],
       [11,  6, 77]], dtype=int64)

In [108]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.60      0.41      0.49        61
           D       0.17      0.09      0.11        35
           H       0.59      0.82      0.69        94

    accuracy                           0.55       190
   macro avg       0.45      0.44      0.43       190
weighted avg       0.51      0.55      0.52       190



In [109]:
#Define the parameters
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [112]:
grid_model.best_params_

{'C': 7, 'gamma': 0.3, 'kernel': 'sigmoid'}

In [110]:
confusion_matrix(y_test,grid_pred)

array([[25,  0, 36],
       [ 5,  0, 30],
       [10,  0, 84]], dtype=int64)

In [111]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.62      0.41      0.50        61
           D       0.00      0.00      0.00        35
           H       0.56      0.89      0.69        94

    accuracy                           0.57       190
   macro avg       0.40      0.43      0.39       190
weighted avg       0.48      0.57      0.50       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best accuracy we could get is 0,57 after implementing Grid Search.

Hyperparameters:
- 'C': 7 
- 'gamma': 0.3 
- 'kernel': 'sigmoid'

Features:
- 'l5_ravg_ATxG' 
- 'l5_ravg_HTxpts' 
- 'avgHTP'
- 'avgATP'

**Season 17/18**

In [113]:
X = s1718.drop('FTR',axis=1)
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [114]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [115]:
confusion_matrix(y_test,base_pred)

array([[40,  0, 13],
       [28,  0, 16],
       [46,  0, 47]], dtype=int64)

In [116]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.35      0.75      0.48        53
           D       0.00      0.00      0.00        44
           H       0.62      0.51      0.56        93

    accuracy                           0.46       190
   macro avg       0.32      0.42      0.35       190
weighted avg       0.40      0.46      0.41       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True False False False  True  True  True  True
  True False  True  True False False  True  True  True  True False False
 False False  True  True  True  True False False  True False  True False
 False]
Feature ranking [17 15  1  1  1 10  3  5  1  1  1  1  1  7  1  1 18 14  1  1  1  1 13  8
 11 16  1  1  1  1  6  4  1  9  1 12  2]


In [123]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'HT_draws',
 'AT_draws',
 'HT_losses',
 'AT_losses',
 'l5_ravg_HTST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_htdiff',
 'l5_atdiff',
 'avgHTP',
 'avgATP',
 'diff_points',
 'diff_MID']

In [124]:
X = s1718[featured_columns]
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [125]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [126]:
confusion_matrix(y_test,rfe_pred)

array([[41,  0, 12],
       [19,  0, 25],
       [35,  0, 58]], dtype=int64)

In [127]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.43      0.77      0.55        53
           D       0.00      0.00      0.00        44
           H       0.61      0.62      0.62        93

    accuracy                           0.52       190
   macro avg       0.35      0.47      0.39       190
weighted avg       0.42      0.52      0.46       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [128]:
#Define the parameters
kernel = ['rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [129]:
confusion_matrix(y_test,grid_pred)

array([[34,  0, 19],
       [18,  0, 26],
       [40,  3, 50]], dtype=int64)

In [130]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.37      0.64      0.47        53
           D       0.00      0.00      0.00        44
           H       0.53      0.54      0.53        93

    accuracy                           0.44       190
   macro avg       0.30      0.39      0.33       190
weighted avg       0.36      0.44      0.39       190



The best accuracy we could get is 0,52 after implementing RFE.

Features:
- 'AVGH'
- 'AVGD'
- 'HT_draws'
- 'AT_draws'
- 'HT_losses'
- 'AT_losses'
- 'l5_ravg_HTST'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_HTxpts'
- 'l5_ravg_ATxpts'
- 'l5_htdiff'
- 'l5_atdiff'
- 'avgHTP'
- 'avgATP'
- 'diff_points'
- 'diff_MID'

**Season 18/19**

In [152]:
X = s1819.drop('FTR',axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [153]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [154]:
confusion_matrix(y_test,base_pred)

array([[ 3,  1, 46],
       [ 0,  4, 44],
       [ 0,  5, 87]], dtype=int64)

In [135]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       1.00      0.06      0.11        50
           D       0.40      0.08      0.14        48
           H       0.49      0.95      0.65        92

    accuracy                           0.49       190
   macro avg       0.63      0.36      0.30       190
weighted avg       0.60      0.49      0.38       190



In [139]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=3)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True False False False False False False False
 False False False  True False False  True  True  True  True False False
 False False False False  True False False False False False  True False
  True]
Feature ranking [27 26  1  1  1 15 11 10  6  5  8  7 16 20  3  1 21 24  1  1  1  1 23 14
 22 25 12  2  1  4 13 18  9 19  1 17  1]


In [140]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'avgHTP',
 'diff_MID',
 'diff_OVA']

In [141]:
X = s1819[featured_columns]
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [143]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [144]:
confusion_matrix(y_test,rfe_pred)

array([[ 4, 30, 16],
       [ 1, 25, 22],
       [ 1, 42, 49]], dtype=int64)

In [145]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.67      0.08      0.14        50
           D       0.26      0.52      0.34        48
           H       0.56      0.53      0.55        92

    accuracy                           0.41       190
   macro avg       0.50      0.38      0.35       190
weighted avg       0.51      0.41      0.39       190



In [155]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [156]:
grid_model.best_params_

{'C': 1, 'gamma': 0.3, 'kernel': 'rbf'}

In [157]:
confusion_matrix(y_test,grid_pred)

array([[ 4, 12, 34],
       [ 1,  6, 41],
       [ 0,  9, 83]], dtype=int64)

In [158]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.80      0.08      0.15        50
           D       0.22      0.12      0.16        48
           H       0.53      0.90      0.66        92

    accuracy                           0.49       190
   macro avg       0.52      0.37      0.32       190
weighted avg       0.52      0.49      0.40       190



The best accuracy we could get is 0,49 after implementing GridSearchCV.

Hyperparameters:
- 'C': 1
- 'gamma': 0.3 
- 'kernel': 'rbf'

**Season 19/20**

In [159]:
X = s1920.drop('FTR',axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [161]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [162]:
confusion_matrix(y_test,base_pred)

array([[ 1, 24, 27],
       [ 0, 19, 31],
       [ 3, 33, 52]], dtype=int64)

In [163]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.25      0.02      0.04        52
           D       0.25      0.38      0.30        50
           H       0.47      0.59      0.53        88

    accuracy                           0.38       190
   macro avg       0.32      0.33      0.29       190
weighted avg       0.35      0.38      0.33       190



In [166]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False False  True  True False False False False False False False
 False False  True False False False  True  True False  True False False
 False False  True False  True  True False False False False False False
 False]
Feature ranking [28 29  8  1  1  2 10  9 12 19 20 13  6  4  1  3 21 16  1  1  5  1 25 26
 27 22  1  7  1  1 11 24 23 14 18 17 15]


In [167]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['AVGH',
 'AVGD',
 'l5_ravg_HTCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_ATxpts',
 'l5_htdiff',
 'avgHTP',
 'avgATP']

In [168]:
X = pd.concat([s1920[featured_columns],s1920['round']],axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [169]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [170]:
confusion_matrix(y_test,rfe_pred)

array([[16,  1, 35],
       [ 8,  1, 41],
       [ 9,  1, 78]], dtype=int64)

In [171]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.48      0.31      0.38        52
           D       0.33      0.02      0.04        50
           H       0.51      0.89      0.64        88

    accuracy                           0.50       190
   macro avg       0.44      0.40      0.35       190
weighted avg       0.46      0.50      0.41       190



In [172]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [175]:
grid_model.best_params_

{'C': 6, 'gamma': 0.17, 'kernel': 'sigmoid'}

In [173]:
confusion_matrix(y_test,grid_pred)

array([[15,  0, 37],
       [ 4,  0, 46],
       [ 5,  0, 83]], dtype=int64)

In [174]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.62      0.29      0.39        52
           D       0.00      0.00      0.00        50
           H       0.50      0.94      0.65        88

    accuracy                           0.52       190
   macro avg       0.38      0.41      0.35       190
weighted avg       0.40      0.52      0.41       190



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,52 after Grid Search.

Hyperparameters:
- 'C': 6
- 'gamma': 0.17
- 'kernel': 'sigmoid'

Features:
- 'AVGH'
- 'AVGD'
- 'l5_ravg_HTCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_ATxpts'
- 'l5_htdiff'
- 'avgHTP'
- 'avgATP'


**Bundesliga**

In [13]:
df = pd.read_csv(r"C:\Users\George\Desktop\football-match-prediction-using-ml-techniques\fe\bundesliga.csv")

In [14]:
team_id = {'Bayern Munich':1, 'Dortmund':2, 'Ein Frankfurt':3, 'FC Koln':4,
           'Hannover':5, 'Hertha':6, 'Hoffenheim':7, "M'gladbach":8, 'Paderborn':9,
           'Augsburg':10, 'Hamburg':11, 'Leverkusen':12, 'Schalke 04':13, 'Stuttgart':14,
           'Werder Bremen':15, 'Wolfsburg':16, 'Freiburg':17, 'Mainz':18, 'Darmstadt':19,
           'Ingolstadt':20, 'RB Leipzig':21, 'Fortuna Dusseldorf':22, 'Nurnberg':23,
           'Union Berlin':24}

for key,value in df.iteritems():
    df['HomeTeam'] = df['HomeTeam'].apply(lambda x: team_id.get(x,x))
    
for key,value in df.iteritems():
    df['AwayTeam'] = df['AwayTeam'].apply(lambda x: team_id.get(x,x))

In [15]:
#We are dropping the starts that are not available to us before the game 
df = df.drop(['Unnamed: 0', 'Date','FTHG', 'FTAG', 'HTHG','HTAG','league', 'Hppda_coef', 'Appda_coef','HTW', 'HTD',
              'HTCR', 'ATCR','HTGS','HTxpts', 'ATxpts','ATGS', 'HTGC', 'ATGC','HTOVA_S', 'HTatt_S', 'HomeTeamPoints','AwayTeamPoints',
              'HTmid_S','HTdef_S', 'ATOVA_S', 'ATatt_S','ATmid_S','ATdef_S','l5_ravg_HTgs', 'l5_ravg_ATgs','l5_ravg_HTgc', 'l5_ravg_ATgc',
              'HTL', 'ATL', 'ATD', 'ATW'],axis=1)

df = df.dropna()

In [16]:
s1415 = df[df['season']=='2014/2015'].drop('season',axis=1)
s1516 = df[df['season']=='2015/2016'].drop('season',axis=1)
s1617 = df[df['season']=='2016/2017'].drop('season',axis=1)
s1718 = df[df['season']=='2017/2018'].drop('season',axis=1)
s1819 = df[df['season']=='2018/2019'].drop('season',axis=1)
s1920 = df[df['season']=='2019/2020'].drop('season',axis=1)

**Season 14/15**

In [205]:
X = s1415.drop('FTR',axis=1)
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [206]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [207]:
confusion_matrix(y_test,base_pred)

array([[ 4,  2, 34],
       [ 0,  2, 37],
       [ 1,  3, 70]], dtype=int64)

In [195]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.80      0.10      0.18        40
           D       0.29      0.05      0.09        39
           H       0.50      0.95      0.65        74

    accuracy                           0.50       153
   macro avg       0.53      0.37      0.31       153
weighted avg       0.52      0.50      0.38       153



In [196]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=3)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True False False False False False False False False
 False  True  True  True False False False  True  True False False  True
 False False  True False  True  True False False False False False False
 False]
Feature ranking [27 26  1  1  8  9 10 20 18 23 17 11  7  1  1  1 16 13  5  1  1 14  6  1
 22 25  1 15  1  1 19 24 12 21  4  2  3]


In [197]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'l5_ravg_ATST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATdeep',
 'l5_htdiff',
 'avgHTP',
 'avgATP']

In [198]:
X = s1415[featured_columns]
y = s1415[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [199]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [200]:
confusion_matrix(y_test,rfe_pred)

array([[ 4,  0, 36],
       [ 2,  0, 37],
       [ 9,  0, 65]], dtype=int64)

In [201]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.27      0.10      0.15        40
           D       0.00      0.00      0.00        39
           H       0.47      0.88      0.61        74

    accuracy                           0.45       153
   macro avg       0.25      0.33      0.25       153
weighted avg       0.30      0.45      0.33       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [208]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [211]:
grid_model.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [209]:
confusion_matrix(y_test,grid_pred)

array([[11,  0, 29],
       [ 4,  0, 35],
       [ 6,  0, 68]], dtype=int64)

In [210]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.52      0.28      0.36        40
           D       0.00      0.00      0.00        39
           H       0.52      0.92      0.66        74

    accuracy                           0.52       153
   macro avg       0.35      0.40      0.34       153
weighted avg       0.39      0.52      0.41       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,52 after implementing Grid Search.

Hyperparameters:
- 'C': 1
- 'gamma': 0.1
- 'kernel': 'rbf'

**Season 15/16**

In [212]:
X = s1516.drop('FTR',axis=1)
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [214]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [215]:
confusion_matrix(y_test,base_pred)

array([[ 5,  0, 41],
       [ 0,  0, 38],
       [ 2,  0, 67]], dtype=int64)

In [216]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.53      0.17      0.26        46
           D       0.00      0.00      0.00        38
           H       0.48      0.96      0.64        69

    accuracy                           0.48       153
   macro avg       0.34      0.38      0.30       153
weighted avg       0.38      0.48      0.37       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [219]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True False False False False False False False False
 False False  True  True False False  True  True False  True  True False
 False False  True  True  True  True False False False False False False
 False]
Feature ranking [26 23  1  1  2 12 14  7 15  4 13 25 11  5  1  1 24 16  1  1  3  1  1 17
 21 19  1  1  1  1 20  8 22  6  9 18 10]


In [233]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_ATxpts',
 'l5_ravg_HTdeep',
 'l5_htdiff',
 'l5_atdiff',
 'avgHTP',
 'avgATP']

In [221]:
X = s1516[featured_columns]
y = s1516[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [222]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [223]:
confusion_matrix(y_test,rfe_pred)

array([[29,  0, 17],
       [18,  1, 19],
       [23,  1, 45]], dtype=int64)

In [224]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.41      0.63      0.50        46
           D       0.50      0.03      0.05        38
           H       0.56      0.65      0.60        69

    accuracy                           0.49       153
   macro avg       0.49      0.44      0.38       153
weighted avg       0.50      0.49      0.43       153



In [225]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [228]:
grid_model.best_params_

{'C': 6, 'gamma': 0.1, 'kernel': 'linear'}

In [226]:
confusion_matrix(y_test,grid_pred)

array([[26,  0, 20],
       [13,  0, 25],
       [15,  0, 54]], dtype=int64)

In [227]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.48      0.57      0.52        46
           D       0.00      0.00      0.00        38
           H       0.55      0.78      0.64        69

    accuracy                           0.52       153
   macro avg       0.34      0.45      0.39       153
weighted avg       0.39      0.52      0.45       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,52 after implementing Grid Search.

Hyperparameters:
- 'C': 6
- 'gamma': 0.1
- 'kernel': 'linear'

Features:
- 'AVGH'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_ATxG'
- 'l5_ravg_ATxpts'
- 'l5_ravg_HTdeep'
- 'l5_htdiff'
- 'l5_atdiff'
- 'avgHTP'
- 'avgATP

**Season 16/17**

In [17]:
X = s1617.drop('FTR',axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [18]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [19]:
confusion_matrix(y_test,base_pred)

array([[ 5,  0, 36],
       [ 3,  1, 31],
       [ 6,  0, 71]], dtype=int64)

In [20]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.36      0.12      0.18        41
           D       1.00      0.03      0.06        35
           H       0.51      0.92      0.66        77

    accuracy                           0.50       153
   macro avg       0.62      0.36      0.30       153
weighted avg       0.58      0.50      0.39       153



In [21]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False False  True  True False False False False False False False
  True False  True  True False False  True  True  True False  True False
 False False  True False False False False False False False False False
 False]
Feature ranking [18 26  4  1  1  5  7 13 10 23 20 22  1 16  1  1 21 28  1  1  1  2  1  9
 24 19  1  6  3 17  8 11 12 15 25 27 14]


In [22]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['AVGH',
 'AVGD',
 'l5_ravg_HTST',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_ATxG',
 'l5_ravg_HTxpts',
 'l5_ravg_HTdeep',
 'l5_htdiff']

In [23]:
X = pd.concat([s1617[featured_columns],s1617['round']],axis=1)
y = s1617[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [24]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [25]:
confusion_matrix(y_test,rfe_pred)

array([[ 1,  2, 38],
       [ 1,  0, 34],
       [ 2,  1, 74]], dtype=int64)

In [26]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.25      0.02      0.04        41
           D       0.00      0.00      0.00        35
           H       0.51      0.96      0.66        77

    accuracy                           0.49       153
   macro avg       0.25      0.33      0.24       153
weighted avg       0.32      0.49      0.35       153



In [27]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [28]:
grid_model.best_params_

{'C': 1, 'gamma': 0.3, 'kernel': 'rbf'}

In [29]:
confusion_matrix(y_test,grid_pred)

array([[ 1,  0, 40],
       [ 0,  0, 35],
       [ 2,  0, 75]], dtype=int64)

In [30]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.33      0.02      0.05        41
           D       0.00      0.00      0.00        35
           H       0.50      0.97      0.66        77

    accuracy                           0.50       153
   macro avg       0.28      0.33      0.24       153
weighted avg       0.34      0.50      0.34       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Season 17/18**

In [229]:
X = s1718.drop('FTR',axis=1)
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [230]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [231]:
confusion_matrix(y_test,base_pred)

array([[ 1,  1, 42],
       [ 1,  0, 37],
       [ 0,  1, 70]], dtype=int64)

In [232]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.50      0.02      0.04        44
           D       0.00      0.00      0.00        38
           H       0.47      0.99      0.64        71

    accuracy                           0.46       153
   macro avg       0.32      0.34      0.23       153
weighted avg       0.36      0.46      0.31       153



In [234]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True False False False False False False False False
 False False  True  True False False False  True False  True False False
 False False False False False  True False False False False False False
 False]
Feature ranking [31 25  1  1  7 13 10 20 28 23 24 21  6  9  1  1 22 26  2  1  3  1 29 30
 15 27  5 12  4  1 11 18 19 14 17 16  8]


In [235]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_ATxG',
 'l5_ravg_ATxpts',
 'avgATP']

In [237]:
X = s1718[featured_columns]
y = s1718[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [239]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [240]:
confusion_matrix(y_test,rfe_pred)

array([[ 3,  3, 38],
       [ 0,  3, 35],
       [ 0,  7, 64]], dtype=int64)

In [241]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       1.00      0.07      0.13        44
           D       0.23      0.08      0.12        38
           H       0.47      0.90      0.62        71

    accuracy                           0.46       153
   macro avg       0.57      0.35      0.29       153
weighted avg       0.56      0.46      0.35       153



In [242]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [245]:
grid_model.best_params_

{'C': 4, 'gamma': 0.3, 'kernel': 'sigmoid'}

In [243]:
confusion_matrix(y_test,grid_pred)

array([[ 6,  0, 38],
       [ 0,  0, 38],
       [ 1,  0, 70]], dtype=int64)

In [244]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.86      0.14      0.24        44
           D       0.00      0.00      0.00        38
           H       0.48      0.99      0.65        71

    accuracy                           0.50       153
   macro avg       0.45      0.37      0.29       153
weighted avg       0.47      0.50      0.37       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,50 after implementing Grid Search

Hyperparameters:
- 'C': 4
- 'gamma': 0.3
- 'kernel': 'sigmoid'

Features:
- 'AVGH'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_ATxG'
- 'l5_ravg_ATxpts'
- 'avgATP'

**Season 18/19**

In [246]:
X = s1819.drop('FTR',axis=1)
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [247]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [248]:
confusion_matrix(y_test,base_pred)

array([[14,  0, 35],
       [ 6,  0, 29],
       [14,  0, 55]], dtype=int64)

In [249]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       0.41      0.29      0.34        49
           D       0.00      0.00      0.00        35
           H       0.46      0.80      0.59        69

    accuracy                           0.45       153
   macro avg       0.29      0.36      0.31       153
weighted avg       0.34      0.45      0.37       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [250]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True  True  True  True  True False False False False False
 False False  True  True False False False  True False  True False False
 False False  True False False  True  True False False False  True  True
 False]
Feature ranking [20 23  1  1  1  1  1 14 11 15 16  8 13  7  1  1 21  9 10  1 12  1 18 17
 22 24  1  3  6  1  1  4  5 19  1  1  2]


In [251]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGH',
 'AVGD',
 'AVGA',
 'HT_wins',
 'l5_ravg_HTCR',
 'l5_ravg_ATCR',
 'l5_ravg_ATxG',
 'l5_ravg_ATxpts',
 'l5_htdiff',
 'avgATP',
 'l5_ravg_HTp',
 'diff_MID',
 'diff_DEF']

In [252]:
X = s1819[featured_columns]
y = s1819[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [253]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [254]:
confusion_matrix(y_test,rfe_pred)

array([[20,  0, 29],
       [ 7,  0, 28],
       [ 9,  0, 60]], dtype=int64)

In [255]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.56      0.41      0.47        49
           D       0.00      0.00      0.00        35
           H       0.51      0.87      0.65        69

    accuracy                           0.52       153
   macro avg       0.36      0.43      0.37       153
weighted avg       0.41      0.52      0.44       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [256]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [259]:
grid_model.best_params_

{'C': 5, 'gamma': 0.1, 'kernel': 'sigmoid'}

In [257]:
confusion_matrix(y_test,grid_pred)

array([[22,  0, 27],
       [ 7,  0, 28],
       [ 6,  0, 63]], dtype=int64)

In [258]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.63      0.45      0.52        49
           D       0.00      0.00      0.00        35
           H       0.53      0.91      0.67        69

    accuracy                           0.56       153
   macro avg       0.39      0.45      0.40       153
weighted avg       0.44      0.56      0.47       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The best accuracy we could get is 0,56.

Hyperparameters:
- 'C': 5
- 'gamma': 0.1
- 'kernel': 'sigmoid'

Features:
- 'AVGH'
- 'AVGD'
- 'AVGA'
- 'HT_wins'
- 'l5_ravg_HTCR'
- 'l5_ravg_ATCR'
- 'l5_ravg_ATxG'
- 'l5_ravg_ATxpts'
- 'l5_htdiff'
- 'avgATP'
- 'l5_ravg_HTp'
- 'diff_MID'
- 'diff_DEF

**Season 19/20**

In [260]:
X = s1920.drop('FTR',axis=1)
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [261]:
model.fit(scaled_X_train,y_train.values.ravel())
base_pred = model.predict(scaled_X_test)

In [262]:
confusion_matrix(y_test,base_pred)

array([[ 2,  0, 62],
       [ 0,  0, 35],
       [ 0,  0, 54]], dtype=int64)

In [263]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

           A       1.00      0.03      0.06        64
           D       0.00      0.00      0.00        35
           H       0.36      1.00      0.53        54

    accuracy                           0.37       153
   macro avg       0.45      0.34      0.20       153
weighted avg       0.54      0.37      0.21       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [264]:
estimator = SVC(kernel="linear")
selector = RFECV(estimator, step=1, scoring='accuracy', cv=5)
selector = selector.fit(X, y.drop('round',axis=1).values.ravel())
print("Features selected", selector.support_)
print("Feature ranking", selector.ranking_)

Features selected [False False  True False False  True False False False False False False
  True False False  True False False  True False  True  True False False
 False False  True  True  True  True False False False False False False
 False]
Feature ranking [24 27  1 20 13  1 17 18  5  6  8  7  1  3 15  1 23 26  1  2  1  1 14 16
 25 21  1  1  1  1 22 19  9  4 11 12 10]


In [265]:
featured_columns = pd.DataFrame(selector.support_,
                               index = X.columns,
                               columns=['is_in'])

featured_columns = featured_columns[featured_columns.is_in == True].index.tolist()
featured_columns

['round',
 'AVGA',
 'l5_ravg_HTST',
 'l5_ravg_ATCR',
 'l5_ravg_HTxG',
 'l5_ravg_HTxpts',
 'l5_ravg_ATxpts',
 'l5_htdiff',
 'l5_atdiff',
 'avgHTP',
 'avgATP']

In [266]:
X = s1920[featured_columns]
y = s1920[['FTR','round']]

X_train = X[X['round']==1].drop('round',axis=1)
X_test = X[X['round']==2].drop('round',axis=1)

y_train = y[y['round']==1].drop('round',axis=1)
y_test = y[y['round']==2].drop('round',axis=1)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [267]:
model.fit(scaled_X_train,y_train.values.ravel())
rfe_pred = model.predict(scaled_X_test)

In [268]:
confusion_matrix(y_test,rfe_pred)

array([[13,  0, 51],
       [ 5,  0, 30],
       [ 2,  0, 52]], dtype=int64)

In [269]:
print(classification_report(y_test,rfe_pred))

              precision    recall  f1-score   support

           A       0.65      0.20      0.31        64
           D       0.00      0.00      0.00        35
           H       0.39      0.96      0.56        54

    accuracy                           0.42       153
   macro avg       0.35      0.39      0.29       153
weighted avg       0.41      0.42      0.33       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [270]:
#Define the parameters
kernel = ['linear','rbf', 'sigmoid']
C = [1,2,3,4,5,6,7,8,9,10]
gamma = [0.1,0.15,0.16,0.17,0.18,0.2,0.3,'scale']

param_grid = {'kernel': kernel,
              'C': C,
              'gamma': gamma}

#Grid Search
grid_model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')

grid_model.fit(scaled_X_train,y_train.values.ravel())
grid_pred = grid_model.predict(scaled_X_test)

In [271]:
grid_model.best_params_

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

In [272]:
confusion_matrix(y_test,grid_pred)

array([[13,  0, 51],
       [ 5,  0, 30],
       [ 2,  0, 52]], dtype=int64)

In [273]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

           A       0.65      0.20      0.31        64
           D       0.00      0.00      0.00        35
           H       0.39      0.96      0.56        54

    accuracy                           0.42       153
   macro avg       0.35      0.39      0.29       153
weighted avg       0.41      0.42      0.33       153



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best accuracy we could get is 0,42.

Features:
- 'AVGA'
- 'l5_ravg_HTST'
- 'l5_ravg_ATCR'
- 'l5_ravg_HTxG'
- 'l5_ravg_HTxpts'
- 'l5_ravg_ATxpts'
- 'l5_htdiff'
- 'l5_atdiff'
- 'avgHTP'
- 'avgATP'