In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Needed imports

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import neighbors
import datetime
import math

# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

def add_index(row):
  return int(row.name)
wd = "/content/drive/MyDrive/DamsoPronos-TNF/"

# Setup + Training

In [638]:
pays = "E0"
xg = False

## elo set **up** (1st year)

In [639]:
#set up elo on year 2017
data17 = pd.read_csv(wd+'datasets/'+pays+'_17.csv')[['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A']]
elo = {}
for team in data17['HomeTeam'].unique():
  elo[team]=1000
data17["eloHome"]=0
data17["eloAway"]=0
eloK = 10
for index, row in data17.iterrows():
  rh = elo[row["HomeTeam"]]
  ra = elo[row['AwayTeam']]
  Rh = 10**(rh/400)
  Ra = 10**(ra/400)
  Eh = Rh/(Rh+Ra)
  Ea = 1-Eh
  if row["FTHG"]>row["FTAG"]:
    Sh=1
    Sa=0
  elif row["FTAG"]>row["FTHG"]:
    Sh=0
    Sa=1
  else:
    Sh=0.5
    Sa=0.5
  nrh = rh + eloK*(Sh-Eh)
  nra = ra + eloK*(Sa-Ea)
  elo[row["HomeTeam"]]= round(nrh)
  elo[row["AwayTeam"]]= round(nra)

print(sorted( ((v,k) for k,v in elo.items()), reverse=True))

[(1112, 'Man City'), (1072, 'Man United'), (1063, 'Tottenham'), (1062, 'Liverpool'), (1037, 'Chelsea'), (1020, 'Arsenal'), (1004, 'Burnley'), (995, 'Everton'), (988, 'Leicester'), (987, 'Crystal Palace'), (983, 'Bournemouth'), (980, 'West Ham'), (979, 'Newcastle'), (974, 'Brighton'), (967, 'Southampton'), (965, 'Watford'), (959, 'Huddersfield'), (953, 'West Brom'), (952, 'Stoke'), (948, 'Swansea')]


In [640]:
#set up elo on year 2017
data18 = pd.read_csv(wd+'datasets/'+pays+'_18.csv')[['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A']]
for team in data18['HomeTeam'].unique():
    if not(team in elo.keys()):
      elo[team]=1000
for index, row in data17.iterrows():
  rh = elo[row["HomeTeam"]]
  ra = elo[row['AwayTeam']]
  Rh = 10**(rh/400)
  Ra = 10**(ra/400)
  Eh = Rh/(Rh+Ra)
  Ea = 1-Eh
  if row["FTHG"]>row["FTAG"]:
    Sh=1
    Sa=0
  elif row["FTAG"]>row["FTHG"]:
    Sh=0
    Sa=1
  else:
    Sh=0.5
    Sa=0.5
  nrh = rh + eloK*(Sh-Eh)
  nra = ra + eloK*(Sa-Ea)
  elo[row["HomeTeam"]]= round(nrh)
  elo[row["AwayTeam"]]= round(nra)

print(sorted( ((v,k) for k,v in elo.items()), reverse=True))

[(1179, 'Man City'), (1112, 'Man United'), (1097, 'Tottenham'), (1096, 'Liverpool'), (1062, 'Chelsea'), (1034, 'Arsenal'), (1004, 'Burnley'), (1000, 'Wolves'), (1000, 'Fulham'), (1000, 'Cardiff'), (989, 'Everton'), (981, 'Crystal Palace'), (979, 'Leicester'), (977, 'Bournemouth'), (971, 'West Ham'), (967, 'Newcastle'), (960, 'Brighton'), (947, 'Watford'), (947, 'Southampton'), (933, 'Huddersfield'), (924, 'Stoke'), (923, 'West Brom'), (918, 'Swansea')]


## prepare data

In [641]:
data18 = pd.read_csv(wd+'datasets/'+pays+'_18.csv')[['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A']]
xg18 = pd.read_csv(wd+'/datasets/'+pays+'18xG.csv')[['Squad', 'xG']]
def add_index(row):
  return int(row.name)

data18["index"]=data18.apply(add_index, axis=1)

def custom_hs(row):
  value = data18.loc[(data18['HomeTeam'] == row['HomeTeam'])].loc[data18['index']<int(row.name)]['HS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_as(row):
  value = data18.loc[(data18['AwayTeam'] == row['AwayTeam'])].loc[data18['index']<int(row.name)]['AS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hst(row):
  value = data18.loc[(data18['HomeTeam'] == row['HomeTeam'])].loc[data18['index']<int(row.name)]['HST'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ast(row):
  value = data18.loc[(data18['AwayTeam'] == row['AwayTeam'])].loc[data18['index']<int(row.name)]['AST'].mean()
  if math.isnan(value):
    return 0
  return value


def custom_hc(row):
  value = data18.loc[(data18['HomeTeam'] == row['HomeTeam'])].loc[data18['index']<int(row.name)]['HC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ac(row):
  value = data18.loc[(data18['AwayTeam'] == row['AwayTeam'])].loc[data18['index']<int(row.name)]['AC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_psdg(row):
  pshg = data18.loc[(data18['HomeTeam'] == row['HomeTeam'])].loc[data18['index']<int(row.name)]['FTHG'].sum()
  psag = data18.loc[(data18['AwayTeam'] == row['AwayTeam'])].loc[data18['index']<int(row.name)]['FTAG'].sum()
  if math.isnan(pshg-psag):
    return 0
  return pshg-psag

def xhg18(row):
  return xg18.loc[xg18['Squad']==row["HomeTeam"]]["xG"].mean()

def xag18(row):
  return xg18.loc[xg18['Squad']==row["AwayTeam"]]["xG"].mean()
if xg:
  data18['xHG'] = data18.apply(xhg18, axis=1)
  data18['xAG'] = data18.apply(xag18, axis=1)
data18['CustomHS']=data18.apply(custom_hs, axis=1)
data18['CustomAS']=data18.apply(custom_as, axis=1)
data18['CustomHST']=data18.apply(custom_hst, axis=1)
data18['CustomAST']=data18.apply(custom_ast, axis=1)
data18['CustomPSDG']=data18.apply(custom_psdg, axis=1)




In [642]:
data18.tail(10)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AST,B365H,B365D,B365A,index,CustomHS,CustomAS,CustomHST,CustomAST,CustomPSDG
370,E0,12/05/2019,Brighton,Man City,1,4,A,1,2,A,...,9,19.0,8.5,1.16,370,10.777778,15.388889,2.833333,5.722222,-16
371,E0,12/05/2019,Burnley,Arsenal,1,3,A,0,0,D,...,6,3.25,3.8,2.2,371,10.888889,10.722222,3.388889,3.777778,-5
372,E0,12/05/2019,Crystal Palace,Bournemouth,5,3,H,3,1,H,...,8,1.9,4.2,3.8,372,15.388889,11.222222,3.777778,4.055556,-9
373,E0,12/05/2019,Fulham,Newcastle,0,4,A,0,2,A,...,6,2.5,3.6,2.9,373,14.333333,9.166667,4.666667,3.111111,8
374,E0,12/05/2019,Leicester,Chelsea,0,0,D,0,0,D,...,4,2.4,3.75,2.9,374,16.055556,14.833333,5.333333,4.222222,0
375,E0,12/05/2019,Liverpool,Wolves,2,0,H,1,0,H,...,2,1.3,6.0,11.0,375,17.888889,11.111111,6.722222,3.555556,34
376,E0,12/05/2019,Man United,Cardiff,0,2,A,0,1,A,...,4,1.28,6.5,11.0,376,14.333333,9.666667,6.444444,3.0,22
377,E0,12/05/2019,Southampton,Huddersfield,1,1,D,1,0,H,...,3,1.44,4.75,8.5,377,13.777778,10.388889,4.777778,3.222222,15
378,E0,12/05/2019,Tottenham,Everton,2,2,D,1,0,H,...,9,2.2,3.5,3.5,378,16.5,11.055556,5.666667,4.166667,10
379,E0,12/05/2019,Watford,West Ham,1,4,A,0,2,A,...,9,2.25,3.75,3.2,379,11.5,10.0,4.166667,3.333333,9


In [643]:
data19 = pd.read_csv(wd+'/datasets/'+pays+'_19.csv')
xg19 = pd.read_csv(wd+'/datasets/'+pays+'19xG.csv')[['Squad', 'xG']]

common_cols = list(set.intersection(*(set(df.columns) for df in [data18, data19])))
data19=data19[common_cols]
data19["index"]=data19.apply(add_index, axis=1)

def custom_hs19(row):
  value = data19.loc[(data19['HomeTeam'] == row['HomeTeam'])].loc[data19['index']<int(row.name)]['HS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_as19(row):
  value = data19.loc[(data19['AwayTeam'] == row['AwayTeam'])].loc[data19['index']<int(row.name)]['AS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hst19(row):
  value = data19.loc[(data19['HomeTeam'] == row['HomeTeam'])].loc[data19['index']<int(row.name)]['HST'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ast19(row):
  value = data19.loc[(data19['AwayTeam'] == row['AwayTeam'])].loc[data19['index']<int(row.name)]['AST'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hc19(row):
  value = data19.loc[(data19['HomeTeam'] == row['HomeTeam'])].loc[data19['index']<int(row.name)]['HC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ac19(row):
  value = data19.loc[(data19['AwayTeam'] == row['AwayTeam'])].loc[data19['index']<int(row.name)]['AC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_psdg19(row):
  pshg = data19.loc[(data19['HomeTeam'] == row['HomeTeam'])].loc[data19['index']<int(row.name)]['FTHG'].sum()
  psag = data19.loc[(data19['AwayTeam'] == row['AwayTeam'])].loc[data19['index']<int(row.name)]['FTAG'].sum()
  if math.isnan(pshg-psag):
    return 0
  return pshg-psag

def custom_hf19(row):
  value = data19.loc[(data19['AwayTeam'] == row['AwayTeam'])].loc[data19['index']<int(row.name)]['HF'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_af19(row):
  value = data19.loc[(data19['AwayTeam'] == row['AwayTeam'])].loc[data19['index']<int(row.name)]['AF'].mean()
  if math.isnan(value):
    return 0
  return value

def xhg19(row):
  return xg19.loc[xg19['Squad']==row["HomeTeam"]]["xG"].mean()

def xag19(row):
  return xg19.loc[xg19['Squad']==row["AwayTeam"]]["xG"].mean()

if xg:
  data19['xHG'] = data19.apply(xhg19, axis=1)
  data19['xAG'] = data19.apply(xag19, axis=1)
data19['CustomHS']=data19.apply(custom_hs19, axis=1)
data19['CustomAS']=data19.apply(custom_as19, axis=1)
data19['CustomHST']=data19.apply(custom_hst19, axis=1)
data19['CustomAST']=data19.apply(custom_ast19, axis=1)
data19['CustomPSDG']=data19.apply(custom_psdg19, axis=1)

In [644]:
data19.tail(10)

Unnamed: 0,Date,AST,FTHG,HomeTeam,HTHG,FTR,FTAG,B365D,HST,HTAG,...,AwayTeam,Div,B365A,HTR,index,CustomHS,CustomAS,CustomHST,CustomAST,CustomPSDG
370,26/07/2020,6,3,Arsenal,3,H,2,3.8,5,1,...,Watford,E0,3.8,H,370,12.111111,8.944444,4.777778,2.888889,21
371,26/07/2020,4,1,Burnley,1,A,2,3.4,2,1,...,Brighton,E0,3.0,D,371,11.5,11.722222,4.388889,3.888889,6
372,26/07/2020,1,2,Chelsea,2,H,0,3.8,3,0,...,Wolves,E0,4.0,H,372,17.833333,12.222222,6.555556,4.055556,4
373,26/07/2020,2,1,Crystal Palace,0,D,1,4.2,2,1,...,Tottenham,E0,1.55,A,373,12.111111,10.666667,3.666667,4.388889,-10
374,26/07/2020,7,1,Everton,1,A,3,3.7,5,2,...,Bournemouth,E0,3.0,A,374,13.277778,9.833333,4.888889,2.944444,8
375,26/07/2020,3,0,Leicester,0,A,2,3.5,3,0,...,Man United,E0,2.2,D,375,13.777778,13.166667,4.722222,5.277778,11
376,26/07/2020,4,5,Man City,2,H,0,11.0,10,0,...,Norwich,E0,26.0,H,376,20.166667,10.333333,6.944444,3.0,45
377,26/07/2020,6,1,Newcastle,1,A,3,5.25,2,1,...,Liverpool,E0,1.36,D,377,12.777778,14.555556,4.166667,5.333333,-11
378,26/07/2020,3,3,Southampton,0,H,1,3.5,4,1,...,Sheffield United,E0,3.3,A,378,12.611111,7.777778,4.055556,2.333333,4
379,26/07/2020,4,1,West Ham,0,D,1,3.6,1,0,...,Aston Villa,E0,2.3,D,379,11.722222,10.944444,4.888889,3.666667,11


In [645]:
data20= pd.read_csv(wd+'/datasets/'+pays+'_20.csv')[common_cols]
xg20 = pd.read_csv(wd+'/datasets/'+pays+'20xG.csv')[['Squad', 'xG']]
data20["index"]=data20.apply(add_index, axis=1)
def custom_hs20(row):
  value = data20.loc[(data20['HomeTeam'] == row['HomeTeam'])].loc[data20['index']<int(row.name)]['HS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_as20(row):
  value = data20.loc[(data20['AwayTeam'] == row['AwayTeam'])].loc[data20['index']<int(row.name)]['AS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hst20(row):
  value = data20.loc[(data20['HomeTeam'] == row['HomeTeam'])].loc[data20['index']<int(row.name)]['HST'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ast20(row):
  value = data20.loc[(data20['AwayTeam'] == row['AwayTeam'])].loc[data20['index']<int(row.name)]['AST'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hc20(row):
  value = data20.loc[(data20['HomeTeam'] == row['HomeTeam'])].loc[data20['index']<int(row.name)]['HC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ac20(row):
  value = data20.loc[(data20['AwayTeam'] == row['AwayTeam'])].loc[data20['index']<int(row.name)]['AC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_psdg20(row):
  pshg = data20.loc[(data20['HomeTeam'] == row['HomeTeam'])].loc[data20['index']<int(row.name)]['FTHG'].sum()
  psag = data20.loc[(data20['AwayTeam'] == row['AwayTeam'])].loc[data20['index']<int(row.name)]['FTAG'].sum()
  if math.isnan(pshg-psag):
    return 0
  return pshg-psag

def custom_hf20(row):
  value = data20.loc[(data20['AwayTeam'] == row['AwayTeam'])].loc[data20['index']<int(row.name)]['HF'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_af20(row):
  value = data20.loc[(data20['AwayTeam'] == row['AwayTeam'])].loc[data20['index']<int(row.name)]['AF'].mean()
  if math.isnan(value):
    return 0
  return value

def xhg20(row):
  return xg20.loc[xg20['Squad']==row["HomeTeam"]]["xG"].mean()

def xag20(row):
  return xg20.loc[xg20['Squad']==row["AwayTeam"]]["xG"].mean()

if xg:
  data20['xHG'] = data20.apply(xhg20, axis=1)
  data20['xAG'] = data20.apply(xag20, axis=1)
data20['CustomHS']=data20.apply(custom_hs20, axis=1)
data20['CustomAS']=data20.apply(custom_as20, axis=1)
data20['CustomHST']=data20.apply(custom_hst20, axis=1)
data20['CustomAST']=data20.apply(custom_ast20, axis=1)
data20['CustomPSDG']=data20.apply(custom_psdg20, axis=1)

In [646]:
data20.tail(10)

Unnamed: 0,Date,AST,FTHG,HomeTeam,HTHG,FTR,FTAG,B365D,HST,HTAG,...,AwayTeam,Div,B365A,HTR,index,CustomHS,CustomAS,CustomHST,CustomAST,CustomPSDG
370,23/05/2021,1,2,Arsenal,0,H,0,4.33,5,0,...,Brighton,E0,5.25,D,370,12.444444,10.388889,4.0,3.444444,4
371,23/05/2021,7,2,Aston Villa,1,H,1,4.6,3,0,...,Chelsea,E0,1.45,H,371,14.611111,14.0,5.111111,5.0,1
372,23/05/2021,4,0,Fulham,0,A,2,3.8,0,1,...,Newcastle,E0,3.3,A,372,12.222222,8.611111,3.5,3.277778,-9
373,23/05/2021,5,3,Leeds,2,H,1,5.25,9,0,...,West Brom,E0,6.5,H,373,13.777778,8.444444,5.444444,2.666667,6
374,23/05/2021,4,2,Leicester,1,A,4,4.0,6,1,...,Tottenham,E0,3.3,D,374,13.0,11.055556,4.888889,3.833333,3
375,23/05/2021,4,2,Liverpool,1,H,0,8.5,5,0,...,Crystal Palace,E0,15.0,H,375,16.555556,10.111111,5.777778,3.722222,6
376,23/05/2021,3,5,Man City,2,H,0,5.0,11,0,...,Everton,E0,6.5,H,376,17.055556,9.666667,6.055556,3.833333,15
377,23/05/2021,3,1,Sheffield United,1,H,0,3.6,3,0,...,Burnley,E0,2.4,H,377,9.611111,9.833333,3.111111,3.555556,-8
378,23/05/2021,5,3,West Ham,2,H,0,4.0,7,0,...,Southampton,E0,5.0,H,378,11.444444,9.888889,4.111111,3.722222,10
379,23/05/2021,4,1,Wolves,1,A,2,3.5,4,2,...,Man United,E0,2.7,A,379,14.222222,13.166667,4.722222,5.611111,-13


In [647]:
data21 = pd.read_csv(wd+'datasets/'+pays+'_21.csv')[['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'B365H', 'B365D', 'B365A']]
xg21 = pd.read_csv(wd+'/datasets/'+pays+'21xG.csv')[['Squad', 'xG']]
def add_index(row):
  return int(row.name)

data21["index"]=data21.apply(add_index, axis=1)

def custom_hs(row):
  value = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['HS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_as(row):
  value = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['AS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hst(row):
  value = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['HST'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ast(row):
  value = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['AST'].mean()
  if math.isnan(value):
    return 0
  return value


def custom_hc(row):
  value = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['HC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ac(row):
  value = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['AC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_psdg(row):
  pshg = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['FTHG'].sum()
  psag = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['FTAG'].sum()
  if math.isnan(pshg-psag):
    return 0
  return pshg-psag

def xhg21(row):
  return xg21.loc[xg21['Squad']==row["HomeTeam"]]["xG"].mean()

def xag21(row):
  return xg21.loc[xg21['Squad']==row["AwayTeam"]]["xG"].mean()

if xg:
  data21['xHG'] = data21.apply(xhg21, axis=1)
  data21['xAG'] = data21.apply(xag21, axis=1)
data21['CustomHS']=data21.apply(custom_hs, axis=1)
data21['CustomAS']=data21.apply(custom_as, axis=1)
data21['CustomHST']=data21.apply(custom_hst, axis=1)
data21['CustomAST']=data21.apply(custom_ast, axis=1)
data21['CustomPSDG']=data21.apply(custom_psdg, axis=1)


In [648]:
data = data19.append(data20).append(data21)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,Date,AST,FTHG,HomeTeam,HTHG,FTR,FTAG,B365D,HST,HTAG,...,AwayTeam,Div,B365A,HTR,index,CustomHS,CustomAS,CustomHST,CustomAST,CustomPSDG
0,09/08/2019,5,4,Liverpool,4,H,1,10.00,7,0,...,Norwich,E0,19.00,H,0,0.000000,0.000000,0.000000,0.000000,0
1,10/08/2019,9,0,West Ham,0,A,5,6.50,3,1,...,Man City,E0,1.22,A,1,0.000000,0.000000,0.000000,0.000000,0
2,10/08/2019,3,1,Bournemouth,0,D,1,3.60,3,0,...,Sheffield United,E0,3.60,D,2,0.000000,0.000000,0.000000,0.000000,0
3,10/08/2019,3,3,Burnley,0,H,0,3.20,4,0,...,Southampton,E0,2.75,D,3,0.000000,0.000000,0.000000,0.000000,0
4,10/08/2019,3,0,Crystal Palace,0,D,0,3.25,2,0,...,Everton,E0,2.37,D,4,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,22/05/2022,4,1,Crystal Palace,1,H,0,3.60,3,0,...,Man United,E0,2.25,H,375,12.500000,12.166667,4.333333,4.611111,1
1136,22/05/2022,2,4,Leicester,0,H,1,4.00,6,0,...,Southampton,E0,3.80,D,376,12.500000,13.277778,4.888889,4.500000,11
1137,22/05/2022,5,3,Liverpool,1,H,1,8.50,8,1,...,Wolves,E0,15.00,D,377,20.833333,9.555556,7.111111,3.333333,29
1138,22/05/2022,2,3,Man City,0,H,2,8.00,5,1,...,Aston Villa,E0,17.00,A,378,18.722222,11.944444,7.111111,4.111111,34


## apply elo

In [649]:
elo

{'Arsenal': 1034,
 'Brighton': 960,
 'Chelsea': 1062,
 'Crystal Palace': 981,
 'Everton': 989,
 'Southampton': 947,
 'Watford': 947,
 'West Brom': 923,
 'Man United': 1112,
 'Newcastle': 967,
 'Bournemouth': 977,
 'Burnley': 1004,
 'Leicester': 979,
 'Liverpool': 1096,
 'Stoke': 924,
 'Swansea': 918,
 'Huddersfield': 933,
 'Tottenham': 1097,
 'Man City': 1179,
 'West Ham': 971,
 'Fulham': 1000,
 'Wolves': 1000,
 'Cardiff': 1000}

In [650]:
for team in data['HomeTeam'].unique():
  if not(team in elo.keys()):
    elo[team]=800
for index, row in data.iterrows():
  rh = elo[row["HomeTeam"]]
  ra = elo[row['AwayTeam']]
  Rh = 10**(rh/400)
  Ra = 10**(ra/400)
  Eh = Rh/(Rh+Ra)
  Ea = 1-Eh
  if row["FTHG"]>row["FTAG"]:
    Sh=1
    Sa=0
  elif row["FTAG"]>row["FTHG"]:
    Sh=0
    Sa=1
  else:
    Sh=0.5
    Sa=0.5
  nrh = int(round(rh + eloK*(Sh-Eh)))
  nra = int(round(ra + eloK*(Sa-Ea)))
  elo[row["HomeTeam"]]= nrh
  elo[row["AwayTeam"]]= nra
  data.loc[index,['eloHome']] = rh
  data.loc[index, ['eloAway']] = ra

print(sorted( ((v,k) for k,v in elo.items()), reverse=True))

[(1209, 'Man City'), (1179, 'Liverpool'), (1089, 'Chelsea'), (1073, 'Man United'), (1071, 'Tottenham'), (1050, 'Arsenal'), (1005, 'Leicester'), (1000, 'Cardiff'), (992, 'West Ham'), (973, 'Wolves'), (967, 'Brighton'), (966, 'Newcastle'), (957, 'Crystal Palace'), (950, 'Everton'), (937, 'Fulham'), (933, 'Huddersfield'), (931, 'Burnley'), (929, 'Bournemouth'), (927, 'Southampton'), (924, 'Stoke'), (918, 'Swansea'), (911, 'Aston Villa'), (898, 'Leeds'), (885, 'West Brom'), (855, 'Watford'), (854, 'Brentford'), (843, 'Sheffield United'), (774, 'Norwich')]


In [651]:
def home_or_dc(row):
    if row['FTR'] == 'H':
        val = 0
    else:
        val = 1
    return val

def exact_pred(row):
    if row['FTR'] == 'H':
        val = 0
    elif row['FTR'] == 'D':
        val = 1
    else:
        val = 2
    return val

data['FTR2']=data.apply(home_or_dc, axis=1)
data['FTR3']=data.apply(exact_pred, axis=1)
data = data.dropna()
print(data.shape)
normalized_df=(data-data.mean())/data.std()

(1140, 27)




In [652]:
provided_columns= ['B365H', 'B365D', 'B365A', 'HS', 'AS', 'HST', 'AST', 'CustomPSDG']

real_columns=['B365H', 'B365D', 'B365A', 'CustomHS', 'CustomAS', 'CustomHST', 'CustomAST', 'CustomPSDG', 'eloHome', 'eloAway']

# x = data[real_columns]
# y = data['FTR2'].values
x = normalized_df[real_columns]
y = data['FTR2'].values
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size=0.9)

print(x.shape, len(y))

(1140, 10) 1140


In [653]:
x

Unnamed: 0,B365H,B365D,B365A,CustomHS,CustomAS,CustomHST,CustomAST,CustomPSDG,eloHome,eloAway
0,-0.830835,4.222908,3.642199,-2.948071,-2.828365,-2.667372,-2.435888,-0.174436,1.147309,-1.924025
1,3.992598,1.666676,-0.831710,-2.948071,-2.828365,-2.667372,-2.435888,-0.174436,-0.148731,2.001062
2,-0.471076,-0.451345,-0.232840,-2.948071,-2.828365,-2.667372,-2.435888,-0.174436,-0.086521,-1.924025
3,-0.173498,-0.743485,-0.446722,-2.948071,-2.828365,-2.667372,-2.435888,-0.174436,0.193424,-0.401630
4,-0.004722,-0.706968,-0.542340,-2.948071,-2.828365,-2.667372,-2.435888,-0.174436,-0.045048,0.033340
...,...,...,...,...,...,...,...,...,...,...
1135,-0.004722,-0.451345,-0.572535,-0.053395,0.335214,0.004731,0.438291,-0.049526,-0.366465,0.975776
1136,-0.515491,-0.159204,-0.182515,-0.053395,0.624126,0.347308,0.369033,1.199577,0.162319,-0.567332
1137,-0.830835,3.127380,2.635695,1.876388,-0.343727,1.717618,-0.358168,3.447961,1.987143,-0.111650
1138,-0.821952,2.762204,3.138947,1.387510,0.277432,1.717618,0.126633,4.072513,2.298192,-0.753748


## keras classification

In [654]:
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [655]:
keras.backend.clear_session()
model = Sequential()

i = Input(shape=(10,))

model.add(i)
model.add(Dense(500, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='relu'))
# Compile the model
model.compile(optimizer=Adam(lr=0.001), 
              loss="binary_crossentropy", 
              metrics=['accuracy'])
es_callback = keras.callbacks.EarlyStopping(monitor='loss', patience=50)
path = f'/models/2k22elo/checkpoints/'+pays+'10.h5'
mc_callback = tf.keras.callbacks.ModelCheckpoint(
          wd+path,
          verbose=0,
          save_weights_only=True,
          save_best_only=True)
ytraink = to_categorical(ytrain)
ytestk= to_categorical(ytest)
print(xtrain.shape, xtest.shape, ytraink.shape)
model.fit(xtrain, ytraink, epochs=600, validation_split=0.2, callbacks=[es_callback, mc_callback])

  super(Adam, self).__init__(name, **kwargs)


(1026, 10) (114, 10) (1026, 2)
Train on 820 samples, validate on 206 samples
Epoch 1/600

  updates = self.state_updates


Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78/600
Epoch 7

<keras.callbacks.History at 0x7f77029a4f90>

In [656]:
model.load_weights(wd+"models/2k22elo/checkpoints/"+pays+"10.h5")
pred_train= model.predict(xtrain)
scores = model.evaluate(xtrain, ytraink, verbose=0)
print('Accuracy on training data: {}% '.format(scores[1]*100))
scores2 = model.evaluate(xtest, ytestk, verbose=0)
print('Accuracy on test data: {}% '.format(scores2[1]*100))

  updates=self.state_updates,


Accuracy on training data: 65.6920075416565% 
Accuracy on test data: 61.84210777282715% 


In [657]:
model.save(wd+'/models/2k22elo/'+pays+'10.h5')

# Results on unseen data ?

## Preparing data

In [553]:
def argmax(iterable):
    return max(enumerate(iterable), key=lambda x: x[1])[0]

def double_chance(c1, c2):
    m1=c2/(c1+c2)

    return round(m1*c1, 2)

In [None]:
def home_or_dc(row):
    if row['FTR'] == 'H':
        val = 0
    else:
        val = 1
    return val

def exact_pred(row):
    if row['FTR'] == 'H':
        val = 0
    elif row['FTR'] == 'D':
        val = 1
    else:
        val = 2
    return val

In [None]:
data21 = pd.read_csv(wd+'/datasets/'+pays+'_21.csv')[common_cols]
xg21 = pd.read_csv(wd+'/datasets/'+pays+'21xG.csv')[['Squad', 'xG']]
data21["index"]=data21.apply(add_index, axis=1)
def custom_hs21(row):
  value = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['HS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_as21(row):
  value = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['AS'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hst21(row):
  value = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['HST'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ast21(row):
  value = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['AST'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hc21(row):
  value = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['HC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_ac21(row):
  value = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['AC'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_hf21(row):
  value = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['HF'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_af21(row):
  value = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['AF'].mean()
  if math.isnan(value):
    return 0
  return value

def custom_psdg21(row):
  pshg = data21.loc[(data21['HomeTeam'] == row['HomeTeam'])].loc[data21['index']<int(row.name)]['FTHG'].sum()
  psag = data21.loc[(data21['AwayTeam'] == row['AwayTeam'])].loc[data21['index']<int(row.name)]['FTAG'].sum()
  if math.isnan(pshg-psag):
    return 0
  return pshg-psag

def xhg21(row):
  return xg21.loc[xg21['Squad']==row["HomeTeam"]]["xG"].mean()

def xag21(row):
  return xg21.loc[xg21['Squad']==row["AwayTeam"]]["xG"].mean()

data21['xHG'] = data21.apply(xhg21, axis=1)
data21['xAG'] = data21.apply(xag21, axis=1)
data21['CustomHS']=data21.apply(custom_hs21, axis=1)
data21['CustomAS']=data21.apply(custom_as21, axis=1)
data21['CustomHST']=data21.apply(custom_hst21, axis=1)
data21['CustomAST']=data21.apply(custom_ast21, axis=1)
data21['CustomPSDG']=data21.apply(custom_psdg21, axis=1)
data21['FTR2']=data21.apply(home_or_dc, axis=1)
data21.tail(10)

Unnamed: 0,HomeTeam,B365A,HS,HTAG,Div,FTHG,AST,B365D,B365H,HST,AS,FTR,HTR,HTHG,Date,FTAG,AwayTeam,index,xHG,xAG,CustomHS,CustomAS,CustomHST,CustomAST,CustomPSDG,FTR2
220,Marseille,7.0,18,2,F1,5,2,4.0,1.5,8,4,H,D,2,04/02/2022,2,Angers,220,33.4,26.9,15.636364,9.363636,4.272727,3.363636,3,0
221,St Etienne,2.62,27,1,F1,3,2,3.4,2.7,8,3,H,A,0,05/02/2022,1,Montpellier,221,24.8,26.7,13.818182,8.9,4.363636,3.7,-4,0
222,Monaco,3.75,10,0,F1,2,5,3.6,1.95,6,19,H,H,2,05/02/2022,0,Lyon,222,34.2,41.9,12.272727,12.8,4.363636,4.8,13,0
223,Lorient,2.15,8,0,F1,2,0,3.5,3.3,3,8,H,H,1,06/02/2022,0,Lens,223,26.6,33.7,10.7,11.545455,2.9,4.545455,-9,0
224,Nice,6.0,7,0,F1,0,5,4.2,1.55,3,14,A,D,0,06/02/2022,1,Clermont,224,38.7,26.1,14.454545,10.181818,4.545455,3.090909,6,1
225,Reims,3.5,16,0,F1,5,0,3.6,2.05,7,6,H,H,1,06/02/2022,0,Bordeaux,225,22.5,24.9,12.6,10.9,4.2,3.4,-4,0
226,Strasbourg,4.5,13,0,F1,1,2,3.75,1.75,3,10,H,D,0,06/02/2022,0,Nantes,226,39.0,23.6,13.545455,8.727273,4.636364,2.909091,14,0
227,Troyes,4.0,9,0,F1,0,0,3.3,2.0,2,6,D,D,0,06/02/2022,0,Metz,227,21.9,18.8,11.818182,9.0,3.545455,3.272727,1,1
228,Rennes,7.0,19,0,F1,2,1,4.2,1.5,8,5,H,H,1,06/02/2022,0,Brest,228,38.8,27.3,17.0,10.090909,5.909091,3.727273,12,0
229,Lille,1.85,10,3,F1,1,7,3.6,4.2,5,18,A,A,1,06/02/2022,5,Paris SG,229,34.8,45.2,11.9,13.363636,3.8,4.454545,-5,1


In [None]:
x21=data21[real_columns]
y21= data21['FTR2'].values
x21.tail(3)

Unnamed: 0,B365H,B365D,B365A,xHG,xAG,CustomHS,CustomAS,CustomHST,CustomAST,CustomPSDG
227,2.0,3.3,4.0,21.9,18.8,11.818182,9.0,3.545455,3.272727,1
228,1.5,4.2,7.0,38.8,27.3,17.0,10.090909,5.909091,3.727273,12
229,4.2,3.6,1.85,34.8,45.2,11.9,13.363636,3.8,4.454545,-5


## Results

In [None]:
pred21 = model.predict(x21)
odds = [(round(1+0.1*k, 2), round(2+0.1*i, 2)) for k in range(3, 8) for i in range(10)]

def calc_gain(low_odd, high_odd):
  gain21 = 0
  cpt_paris=0
  cpt_reus= 0
  for prediction, res, donnee in zip(pred21, y21, x21.itertuples()):
    if (argmax(prediction)==0): #home team to win
      if low_odd<donnee[1]<high_odd:
        #on joue
        cpt_paris+=1
        if res==0:
          gain21+=donnee[1]-1
          cpt_reus+=1
        else:
          gain21-=1
    else:#double chance
      cotedc=double_chance(donnee[2], donnee[3])
      if (low_odd<cotedc<high_odd):
        cpt_paris+=1
        if res==1:
          gain21+=cotedc-1
          cpt_reus+=1
        else:
          gain21-=1
  return gain21, cpt_paris

for low_odd, high_odd in odds:
  print(low_odd, high_odd, calc_gain(low_odd, high_odd))

1.3 2.0 (-5.359999999999998, 157)
1.3 2.1 (-5.179999999999999, 173)
1.3 2.2 (-3.8299999999999983, 178)
1.3 2.3 (-2.599999999999997, 179)
1.3 2.4 (-1.9999999999999973, 183)
1.3 2.5 (-1.9999999999999973, 183)
1.3 2.6 (-2.9999999999999973, 184)
1.3 2.7 (-2.9999999999999973, 184)
1.3 2.8 (-1.2499999999999973, 185)
1.3 2.9 (-1.2499999999999973, 185)
1.4 2.0 (-0.0699999999999994, 126)
1.4 2.1 (0.11000000000000032, 142)
1.4 2.2 (1.459999999999999, 147)
1.4 2.3 (2.6900000000000013, 148)
1.4 2.4 (3.290000000000001, 152)
1.4 2.5 (3.290000000000001, 152)
1.4 2.6 (2.290000000000001, 153)
1.4 2.7 (2.290000000000001, 153)
1.4 2.8 (4.040000000000001, 154)
1.4 2.9 (4.040000000000001, 154)
1.5 2.0 (-1.179999999999998, 92)
1.5 2.1 (-1.0, 108)
1.5 2.2 (0.3500000000000023, 113)
1.5 2.3 (1.580000000000001, 114)
1.5 2.4 (2.1800000000000024, 118)
1.5 2.5 (2.1800000000000024, 118)
1.5 2.6 (1.1800000000000024, 119)
1.5 2.7 (1.1800000000000024, 119)
1.5 2.8 (2.9300000000000024, 120)
1.5 2.9 (2.9300000000000024,

In [None]:
model.save(wd+'/models/2k21xG4ans/italy.h5')

## Better for damsopronos ? since 29/11 ? 

In [None]:
dpfr = data21.tail(69)
x2911 = dpfr[real_columns]
y2911 = dpfr['FTR2'].values

In [None]:
#model = keras.models.load_model("/content/drive/MyDrive/Colab Notebooks/models/france.h5")
pred2911 = model.predict(x2911)


odds = [(round(1+0.1*k, 2), round(2+0.1*i, 2)) for k in range(3, 8) for i in range(10)]

def calc_gain(low_odd, high_odd):
  gain21 = 0
  cpt_paris=0
  cpt_reus= 0
  for prediction, res, donnee in zip(pred2911, y2911, x2911.itertuples()):
    if (argmax(prediction)==0): #home team to win
      if low_odd<donnee[1]<high_odd:
        #on joue
        cpt_paris+=1
        if res==0:
          gain21+=donnee[1]-1
          cpt_reus+=1
        else:
          gain21-=1
    else:#double chance
      cotedc=double_chance(donnee[2], donnee[3])
      if (low_odd<cotedc<high_odd):
        cpt_paris+=1
        if res==1:
          gain21+=cotedc-1
          cpt_reus+=1
        else:
          gain21-=1
  return gain21, cpt_paris

for low_odd, high_odd in odds:
  print(low_odd, high_odd, calc_gain(low_odd, high_odd))

1.3 2.0 (-0.8199999999999994, 41)
1.3 2.1 (0.2400000000000011, 42)
1.3 2.2 (-0.6399999999999992, 45)
1.3 2.3 (0.5700000000000012, 46)
1.3 2.4 (1.9400000000000013, 47)
1.3 2.5 (0.9400000000000013, 48)
1.3 2.6 (0.9400000000000013, 48)
1.3 2.7 (-0.05999999999999872, 49)
1.3 2.8 (-0.05999999999999872, 49)
1.3 2.9 (1.6000000000000014, 53)
1.4 2.0 (-1.3499999999999996, 32)
1.4 2.1 (-0.2899999999999996, 33)
1.4 2.2 (-1.1699999999999993, 36)
1.4 2.3 (0.04000000000000048, 37)
1.4 2.4 (1.4100000000000006, 38)
1.4 2.5 (0.4100000000000006, 39)
1.4 2.6 (0.4100000000000006, 39)
1.4 2.7 (-0.5899999999999994, 40)
1.4 2.8 (-0.5899999999999994, 40)
1.4 2.9 (1.0700000000000007, 44)
1.5 2.0 (-1.6399999999999992, 25)
1.5 2.1 (-0.5799999999999992, 26)
1.5 2.2 (-1.4599999999999993, 29)
1.5 2.3 (-0.2499999999999989, 30)
1.5 2.4 (1.120000000000001, 31)
1.5 2.5 (0.120000000000001, 32)
1.5 2.6 (0.120000000000001, 32)
1.5 2.7 (-0.879999999999999, 33)
1.5 2.8 (-0.879999999999999, 33)
1.5 2.9 (0.7800000000000016, 3