In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from urllib.request import urlopen
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Premier League (E0) results from football-data.co.uk, one CSV per season;
# the season is encoded in the URL path (1920, 2021, 2122).
epl20 = pd.read_csv("https://www.football-data.co.uk/mmz4281/1920/E0.csv")
epl21 = pd.read_csv("https://www.football-data.co.uk/mmz4281/2021/E0.csv")
epl22 = pd.read_csv("https://www.football-data.co.uk/mmz4281/2122/E0.csv")

# Opens the site's notes.txt (presumably the column glossary -- verify).
# NOTE(review): the response object is never read or closed in the visible cells.
columns = urlopen("https://www.football-data.co.uk/notes.txt") 

In [None]:
# Raw columns kept from each season file before feature engineering.
usable = [
    'HomeTeam', 'AwayTeam',
    'FTR',
    'FTHG', 'FTAG',
    'HS', 'AS',
    'HST', 'AST',
    'HF', 'AF',
    'HC', 'AC',
]

In [None]:
def form_guide(x, team):
  """Return the league points `team` takes from match row `x`.

  3 when `team` is the home side and FTR is 'H', or the away side and FTR is
  'A'; 1 when FTR is 'D'; 0 in every other case.
  """
  if (x.FTR == 'H' and x.HomeTeam == team) or (x.AwayTeam == team and x.FTR == 'A'):
    return 3
  return 1 if x.FTR == 'D' else 0

def shots(x, team):
  """Shots taken by `team` in match row `x`: HS at home, AS away, None if absent."""
  if x.HomeTeam == team:
    return x.HS
  return x.AS if x.AwayTeam == team else None

def shots_ot(x, team):
  """Shots on target by `team` in match row `x`: HST at home, AST away, None if absent."""
  if x.HomeTeam == team:
    return x.HST
  return x.AST if x.AwayTeam == team else None

def corners(x, team):
  """Corners won by `team` in match row `x`: HC at home, AC away, None if absent."""
  if x.HomeTeam == team:
    return x.HC
  return x.AC if x.AwayTeam == team else None

def goals_sc(x, team):
  """Goals scored by `team` in match row `x`: FTHG at home, FTAG away, None if absent."""
  if x.HomeTeam == team:
    return x.FTHG
  return x.FTAG if x.AwayTeam == team else None

def goals_con(x, team):
  """Goals conceded by `team` in match row `x`: FTAG at home, FTHG away, None if absent."""
  if x.HomeTeam == team:
    return x.FTAG
  return x.FTHG if x.AwayTeam == team else None

In [None]:
def clean_data(epl):
  """Turn one season of raw results into pre-match features for every fixture.

  For each team, the matches it played are walked in index order (assumed to
  be chronological) and running statistics are built from *earlier* matches
  only: every per-match series is shifted by one match before accumulating,
  so a fixture's features never include its own result.

  Args:
    epl: DataFrame holding at least the columns HomeTeam, AwayTeam, FTR, FTHG,
      FTAG, HS, AS, HST, AST, HC and AC (the columns read by the per-match
      helper functions).

  Returns:
    A new DataFrame on `epl`'s index with HomeTeam, AwayTeam, FTR and, for the
    home (H*) / away (A*) side: HF/AF points from the previous four matches,
    HP/AP points so far, HGS/AGS goals scored, HS/AS shots, HST/AST shots on
    target, HGC/AGC goals conceded and HC/AC corners. A team's first match of
    the season has all-zero features.
  """
  # (home column, away column, per-team statistic). The order here fixes the
  # column order of the returned frame.
  feature_map = [
      ('HF', 'AF', 'Form'),
      ('HP', 'AP', 'Total_Points'),
      ('HGS', 'AGS', 'Goals_Scored'),
      ('HS', 'AS', 'Shots'),
      ('HST', 'AST', 'Shots_On_Target'),
      ('HGC', 'AGC', 'Goals_Conceded'),
      ('HC', 'AC', 'Corners'),
  ]

  # Explicit copy: assigning into a bare column slice of `epl` raises
  # SettingWithCopyWarning and may write to a temporary object.
  df = epl[['HomeTeam', 'AwayTeam', 'FTR']].copy()
  for home_col, away_col, _ in feature_map:
    df[home_col] = np.nan
    df[away_col] = np.nan

  # Include teams that have so far only appeared as the away side.
  teams = pd.concat([epl.HomeTeam, epl.AwayTeam]).dropna().unique()
  for team in teams:
    team_data = epl[(epl.HomeTeam == team) | (epl.AwayTeam == team)].sort_index()

    def per_match(stat_fn):
      # One value per match played by `team`, in match order.
      return team_data.apply(lambda x: stat_fn(x, team), axis=1)

    points = per_match(form_guide)
    stats = pd.DataFrame({
        # Points from up to four previous matches. The earlier positional
        # slice [i-4:i] had a negative start for i < 4 and so returned an
        # empty window (Form 0) for a team's 2nd-4th matches.
        'Form': points.shift().rolling(4, min_periods=1).sum(),
        'Total_Points': points.shift().cumsum(),
        'Goals_Scored': per_match(goals_sc).shift().cumsum(),
        'Shots': per_match(shots).shift().cumsum(),
        'Shots_On_Target': per_match(shots_ot).shift().cumsum(),
        'Goals_Conceded': per_match(goals_con).shift().cumsum(),
        'Corners': per_match(corners).shift().cumsum(),
    }).fillna(0)  # the first match has no history -> 0

    # Boolean masks instead of groupby().get_group(), which raises KeyError
    # when a team has no home (or no away) match yet.
    home_idx = team_data.index[(team_data.HomeTeam == team).to_numpy()]
    away_idx = team_data.index[(team_data.AwayTeam == team).to_numpy()]
    for home_col, away_col, stat in feature_map:
      df.loc[home_idx, home_col] = stats.loc[home_idx, stat]
      df.loc[away_idx, away_col] = stats.loc[away_idx, stat]

  return df

In [None]:
# Pre-match feature frame for the season loaded into epl20.
df1 = clean_data(epl20[usable])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Numeric feature columns, selected by dtype kind. Comparing dtypes against
# string names such as 'Int64' (not a numpy dtype name) depends on numpy's
# string-to-dtype coercion and emits a warning.
num_col = df1.select_dtypes(include='number').columns.tolist()
cat_col = ['HomeTeam', 'AwayTeam']
obj_col = ['FTR']

# Team names become ordinal codes; numeric features are standardised.
pipeline = ColumnTransformer([
    ('encoder', OrdinalEncoder(), cat_col),
    ('scaler', StandardScaler(), num_col),
])

  """Entry point for launching an IPython kernel.


In [None]:
# Learn the team categories and the scaling statistics from df1.
pipeline.fit(df1)

ColumnTransformer(transformers=[('encoder', OrdinalEncoder(),
                                 ['HomeTeam', 'AwayTeam']),
                                ('scaler', StandardScaler(),
                                 ['HF', 'AF', 'HP', 'AP', 'HGS', 'AGS', 'HS',
                                  'AS', 'HST', 'AST', 'HGC', 'AGC', 'HC',
                                  'AC'])])

In [None]:
# Feature matrix: encoded team columns followed by the scaled numeric columns.
X = pipeline.transform(df1)

# Full-time result encoded as an ordinal code; kept as a one-column array.
enc_ftr = OrdinalEncoder().fit(df1[['FTR']])
y = enc_ftr.transform(df1[['FTR']])

In [None]:
# Hold out 10% of the fixtures for validation. The seed is fixed so the split
# (and everything fitted on it) is the same after a kernel restart.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Fully connected classifier: three ReLU hidden layers and a 3-way softmax
# over the encoded full-time result.
model = keras.Sequential([
    keras.layers.Input(shape=X_train.shape[1:]),
    keras.layers.Dense(300, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(50, activation='relu'),
    keras.layers.Dense(3, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Sanity-print tensor/layer shapes. Plain loops are used because the values
# are printed, not collected.
for tensor in model.inputs:
  print(tensor.shape, tensor.dtype)
for tensor in model.outputs:
  print(tensor.shape, tensor.dtype)
for layer in model.layers:
  print(layer.name, layer.input_shape, layer.dtype)

hist = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))


(None, 16) <dtype: 'float32'>
(None, 3) <dtype: 'float32'>
dense (None, 16) float32
dense_1 (None, 300) float32
dense_2 (None, 100) float32
dense_3 (None, 50) float32
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


USING GRID SEARCH TO FIND THE BEST HYPERPARAMETERS

In [None]:
def create_model(activation='relu', optimizer='adam'):
  """Build and compile the dense classifier used by the grid search.

  Args:
    activation: activation for the first two hidden layers.
    optimizer: optimizer name or instance passed to `compile`.

  Returns:
    A compiled Sequential model whose input width is taken from `X_train`.
  """
  # NOTE(review): the third hidden layer always uses 'relu' regardless of
  # `activation` -- confirm this is intentional.
  net = keras.models.Sequential([
      keras.layers.Input(shape=X_train.shape[1:]),
      keras.layers.Dense(300, activation=activation),
      keras.layers.Dense(100, activation=activation),
      keras.layers.Dense(50, activation='relu'),
      keras.layers.Dense(3, activation='softmax'),
  ])
  net.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=optimizer,
      metrics=['accuracy'],
  )
  return net
# scikit-learn style wrapper around create_model; each fit trains for 20 epochs.
search_model = KerasClassifier(build_fn=create_model, epochs=20)

  del sys.path[0]


In [None]:
# Candidate optimizers and hidden-layer activations to cross-validate.
optimizer = [
    'SGD', 'RMSprop', 'Adagrad', 'Adadelta',
    'Adam', 'Adamax', 'Nadam',
]
activation = [
    'softmax', 'softplus', 'softsign', 'relu',
    'tanh', 'sigmoid', 'hard_sigmoid', 'linear',
]

param_grid = {'optimizer': optimizer, 'activation': activation}

# Exhaustive 3-fold search over every optimizer/activation pair, using all cores.
grid = GridSearchCV(
    estimator=search_model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=3,
)
grid_result = grid.fit(X, y)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Best configuration first, then mean (std) CV accuracy for every candidate.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")


Best: 0.484439 using {'activation': 'sigmoid', 'optimizer': 'RMSprop'}
0.452756 (0.033561) with: {'activation': 'softmax', 'optimizer': 'SGD'}
0.452756 (0.033561) with: {'activation': 'softmax', 'optimizer': 'RMSprop'}
0.452756 (0.033561) with: {'activation': 'softmax', 'optimizer': 'Adagrad'}
0.326230 (0.071328) with: {'activation': 'softmax', 'optimizer': 'Adadelta'}
0.452756 (0.033561) with: {'activation': 'softmax', 'optimizer': 'Adam'}
0.452756 (0.033561) with: {'activation': 'softmax', 'optimizer': 'Adamax'}
0.452756 (0.033561) with: {'activation': 'softmax', 'optimizer': 'Nadam'}
0.450173 (0.046457) with: {'activation': 'softplus', 'optimizer': 'SGD'}
0.436800 (0.014955) with: {'activation': 'softplus', 'optimizer': 'RMSprop'}
0.452756 (0.033561) with: {'activation': 'softplus', 'optimizer': 'Adagrad'}
0.315544 (0.096725) with: {'activation': 'softplus', 'optimizer': 'Adadelta'}
0.436883 (0.025029) with: {'activation': 'softplus', 'optimizer': 'Adam'}
0.431800 (0.076545) with: {

In [None]:
# Refit the best grid-search configuration on the training split while
# tracking metrics on the validation split.
hist_search = grid.best_estimator_.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Classical baselines scored with 5-fold cross-validation.
rfc = RandomForestClassifier(n_estimators=50)
xgbc = XGBClassifier(max_depth=5)
svc = SVC(C=0.1)

# These estimators expect a 1-D label vector. Passing the (n, 1) column from
# the encoder triggers a DataConversionWarning on every fold.
y_1d = np.ravel(y)
acc = cross_val_score(rfc, X, y_1d, cv=5)
acc2 = cross_val_score(xgbc, X, y_1d, cv=5)
acc3 = cross_val_score(svc, X, y_1d, cv=5)
print('Accuracy score for Random Forest Classifier, XGB Classifier, Support Vector Machines, respectively:\n', acc.mean(), acc2.mean(), acc3.mean())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy score for Random Forest Classifier, XGB Classifier, Support Vector Machines, respectively:
 0.44210526315789467 0.45 0.4526315789473684


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


TRAINING USING 2021 DATA

In [None]:
# Pre-match feature frame for the season loaded into epl21.
df2 = clean_data(epl21[usable])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [None]:
# The fitted team encoder only knows the teams it was fitted on, so teams that
# first appear in df2 are appended to the end of each category list. Existing
# codes therefore stay unchanged.
cat = pipeline.transformers_[0][1].categories_
teams_seen = set(pd.concat([df2.HomeTeam, df2.AwayTeam]).dropna())
for i in range(len(cat)):
  # sorted(): set iteration order for strings differs between interpreter
  # runs, which would give new teams different codes after a kernel restart.
  cat[i] = np.append(cat[i], sorted(teams_seen - set(cat[i])))

pipeline.transformers_[0][1].categories_ = cat

In [None]:
# Features and encoded results for df2, using the already-fitted transformers.
X1 = pipeline.transform(df2)

y1 = enc_ftr.transform(df2[['FTR']])

In [None]:
# Hold out 20% of the df2 fixtures for validation. The seed is fixed so the
# split is the same after a kernel restart.
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [None]:
# Continue training the same `model` object on the df2 training split.
hist1 = model.fit(X_train1, y_train1, epochs=20, validation_data=(X_valid1, y_valid1))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


TESTING ON THIS SEASON'S MATCHES

In [None]:
# Pre-match feature frame for the season loaded into epl22.
df3 = clean_data(epl22[usable])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [None]:
# Display the engineered features for the epl22 fixtures.
df3

Unnamed: 0,HomeTeam,AwayTeam,FTR,HF,AF,HP,AP,HGS,AGS,HS,AS,HST,AST,HGC,AGC,HC,AC
0,Brentford,Arsenal,H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Man United,Leeds,H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Burnley,Brighton,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Chelsea,Crystal Palace,H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Everton,Southampton,H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Watford,West Ham,A,0.0,4.0,13.0,28.0,21.0,30.0,179.0,228.0,64.0,85.0,31.0,24.0,70.0,98.0
179,Leicester,Liverpool,H,4.0,10.0,22.0,41.0,30.0,50.0,202.0,348.0,86.0,123.0,33.0,15.0,86.0,141.0
180,Chelsea,Brighton,D,8.0,5.0,41.0,23.0,42.0,16.0,294.0,206.0,106.0,66.0,13.0,17.0,123.0,89.0
181,Brentford,Man City,A,4.0,12.0,20.0,47.0,21.0,50.0,196.0,366.0,70.0,139.0,24.0,12.0,70.0,169.0


In [None]:
# Append teams that first appear in df3 to the end of each category list of
# the fitted team encoder. Existing codes stay unchanged.
cat = pipeline.transformers_[0][1].categories_
teams_seen = set(pd.concat([df3.HomeTeam, df3.AwayTeam]).dropna())
for i in range(len(cat)):
  # sorted(): set iteration order for strings differs between interpreter
  # runs, which would give new teams different codes after a kernel restart.
  cat[i] = np.append(cat[i], sorted(teams_seen - set(cat[i])))

pipeline.transformers_[0][1].categories_ = cat

In [None]:
# Features and encoded results for df3, using the already-fitted transformers.
X2 = pipeline.transform(df3)

y2 = enc_ftr.transform(df3[['FTR']])

In [None]:
# Continue training `model` on every df3 fixture (no validation split).
# NOTE(review): this call fits rather than evaluates, so no held-out score is
# produced for this season -- confirm that is the intent.
hist2 = model.fit(X2, y2, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
def _season_totals(epl, team):
  """Return `team`'s aggregate statistics over all of its matches in `epl`."""
  team_data = epl[(epl.HomeTeam == team) | (epl.AwayTeam == team)].sort_index()
  if team_data.empty:
    # Fail with a clear message instead of an obscure pandas error later on.
    raise ValueError(f"No matches found for team {team!r}")

  def total(stat_fn):
    return team_data.apply(lambda x: stat_fn(x, team), axis=1).sum()

  points = team_data.apply(lambda x: form_guide(x, team), axis=1)
  return {
      'Form': points.tail(4).sum(),  # points from the last four matches
      'Total_Points': points.sum(),
      'Shots': total(shots),
      'Shots_On_Target': total(shots_ot),
      'Corners': total(corners),
      'Goals_Scored': total(goals_sc),
      'Goals_Conceded': total(goals_con),
  }


def get_stats(ht, at, df_in=epl22[usable]):
  """Build the one-row feature frame for an upcoming `ht` v `at` fixture.

  Args:
    ht: home team name.
    at: away team name.
    df_in: raw season results to aggregate; defaults to `epl22[usable]` as
      evaluated when this function is defined.

  Returns:
    A single-row DataFrame with HomeTeam, AwayTeam and the H*/A* feature
    columns (HF, AF, HP, AP, HGS, AGS, HS, AS, HST, AST, HGC, AGC, HC, AC),
    each computed over all of the team's matches in `df_in`.

  Raises:
    ValueError: if either team has no matches in `df_in`.
  """
  # Direct lookups per side: the earlier scan over a dict keyed by team left
  # `away` unbound when ht == at.
  home = _season_totals(df_in, ht)
  away = _season_totals(df_in, at)

  row = {'HomeTeam': ht, 'AwayTeam': at}
  for home_col, away_col, stat in [
      ('HF', 'AF', 'Form'),
      ('HP', 'AP', 'Total_Points'),
      ('HGS', 'AGS', 'Goals_Scored'),
      ('HS', 'AS', 'Shots'),
      ('HST', 'AST', 'Shots_On_Target'),
      ('HGC', 'AGC', 'Goals_Conceded'),
      ('HC', 'AC', 'Corners'),
  ]:
    row[home_col] = home[stat]
    row[away_col] = away[stat]

  return pd.DataFrame([row])

In [None]:
def predict_match(ht, at):
  """Predict the result of `ht` (home) v `at` (away) with the neural network.

  Returns:
    A tuple (label, probabilities): `label` is the decoded FTR value of the
    most probable class, and `probabilities` is the raw `model.predict`
    output, whose columns follow `enc_ftr.categories_`.
  """
  features = pipeline.transform(get_stats(ht, at))
  y_proba = model.predict(features)
  # inverse_transform expects a 2-D (n_samples, 1) array of codes.
  result = enc_ftr.inverse_transform([y_proba.argmax(axis=1)])
  return (result[0][0], y_proba)

In [None]:
# Stack the engineered features of all three seasons into one training set.
pl1, pl2, pl3 = map(clean_data, [epl20[usable], epl21[usable], epl22[usable]])
pl = pd.concat([pl1, pl2, pl3])
X_pl = pipeline.transform(pl)

y_pl = enc_ftr.transform(pl[['FTR']])
svc_pl = SVC(C=0.1)
# SVC expects 1-D labels; passing the (n, 1) column triggers a
# DataConversionWarning.
svc_pl.fit(X_pl, y_pl.ravel())

def predict_match_ml(ht, at):
  """Predict the result of `ht` (home) v `at` (away) with the fitted SVC.

  Returns the decoded FTR label as a 2-D object array, e.g. [['H']].
  """
  features = pipeline.transform(get_stats(ht, at))
  return enc_ftr.inverse_transform([svc_pl.predict(features)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
  y = column_or_1d(y, warn=True)


In [None]:
# Neural-network prediction with Chelsea as the home side and Liverpool away.
predict_match('Chelsea', 'Liverpool')

('H', array([[0.2936663 , 0.09959143, 0.60674226]], dtype=float32))

In [None]:
# SVC prediction for the same fixture.
predict_match_ml('Chelsea', 'Liverpool')

array([['H']], dtype=object)

In [None]:
# Inspect the HomeTeam category list of the fitted encoder; teams appended by
# the later-season cells sit at the end.
pipeline.transformers_[0][1].categories_[0]

array(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brighton', 'Burnley',
       'Chelsea', 'Crystal Palace', 'Everton', 'Leicester', 'Liverpool',
       'Man City', 'Man United', 'Newcastle', 'Norwich',
       'Sheffield United', 'Southampton', 'Tottenham', 'Watford',
       'West Ham', 'Wolves', 'Fulham', 'West Brom', 'Leeds', 'Brentford'],
      dtype=object)