# Football prediction model

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn
from scipy.stats import poisson,skellam
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [18]:
torneos=['FIFA World Cup qualification', 'UEFA Euro qualification',
       'African Cup of Nations qualification', 'AFC Asian Cup qualification',
       'African Cup of Nations', 'CFU Caribbean Cup qualification',
       'FIFA World Cup',  'UEFA Nations League', 'Gold Cup',
       'Copa América',  'AFF Championship',
       'UEFA Euro', 'African Nations Championship', 'AFC Asian Cup',
       'CONCACAF Nations League']

# 1. Calcular ELO

## 1.1 Funciones ELO

In [19]:
confederation_tournaments=['AFC Asian Cup','African Cup of Nations','UEFA Euro','Copa América','CONCACAF Championship','Oceania Nations Cup']

def k_value(tournament):
    k=5
    if tournament == 'Friendly':
        k=10
    elif tournament == 'FIFA World Cup qualification':
        k=25
    elif tournament in confederation_tournaments:
        k=40
    elif tournament == 'FIFA World Cup':
        k=55
    return k
    
def expected_result(loc,aw):
    dr=loc-aw
    we=(1/(10**(-dr/400)+1))
    return [np.round(we,3),1-np.round(we,3)]

def actual_result(loc,aw):
    if loc<aw:
        wa=1
        wl=0
    elif loc>aw:
        wa=0
        wl=1
    elif loc==aw:
        wa=0.5
        wl=0.5
    return [wl,wa]

def calculate_elo(elo_l,elo_v,local_goals,away_goals,tournament):
    
    k=k_value(tournament)
    wl,wv=actual_result(local_goals,away_goals)
    wel,wev=expected_result(elo_l,elo_v)

    elo_ln=elo_l+k*(wl-wel)
    elo_vn=elo_v+k*(wv-wev)

    return elo_ln,elo_vn

## 1.2 Calculate ELO

In [20]:
matches = pd.read_csv("results.csv").sort_values('date')

matches["Elo_h_before"]=np.nan
matches["Elo_a_before"]=np.nan

matches["Elo_h_after"]=np.nan
matches["Elo_a_after"]=np.nan

current_elo={}
for idx,row in matches.iterrows():
    
    local=row['home_team']
    away=row['away_team']
    local_goals=row['home_score']
    away_goals=row['away_score']
    tournament = row['tournament']
    
    # Si el equipo no se le ha calculado el ELO, se le inicializa en 1300
    if local not in current_elo.keys():
        current_elo[local]=1300
    
    if away not in current_elo.keys():
        current_elo[away]=1300
    
    elo_l=current_elo[local]
    elo_v=current_elo[away]
    elo_ln,elo_vn=calculate_elo(elo_l,elo_v,local_goals,away_goals,tournament)

    current_elo[local]=elo_ln
    current_elo[away]=elo_vn
    
    matches.loc[idx,'Elo_h_after']=elo_ln
    matches.loc[idx,'Elo_a_after']=elo_vn 
    matches.loc[idx,'Elo_h_before']=elo_l
    matches.loc[idx,'Elo_a_before']=elo_v

# 2. Build model

## 2.1 Filter dataframe 

In [21]:

matches=matches[(pd.to_datetime(matches['date'])>dt.datetime(1989,12,31))&(matches['tournament'].isin(torneos))]
matches =matches[['date','home_team','away_team','home_score','away_score','neutral','Elo_a_before','Elo_h_before']]
matches = matches[(matches['home_score']<9)&(matches['away_score']<9)].reset_index(drop=True)
matches["Match Name"]= matches["home_team"].astype(str) + ' - ' + matches["away_team"].astype(str)

home=matches[["date","home_team","home_score","away_score","neutral","Match Name","Elo_a_before","Elo_h_before"]].rename(columns={'home_team':"Team","home_score":"Goals for","away_score":"Goals against","Elo_a_before":"Elo rival","Elo_h_before":"Elo"}).assign(local=1)
away=matches[["date","away_team","away_score","home_score","Match Name","Elo_a_before","Elo_h_before"]].rename(columns={'away_team':"Team","away_score":"Goals for","home_score":"Goals against","Elo_a_before":"Elo","Elo_h_before":"Elo rival"}).assign(neutral=0).assign(local=0)

df = pd.concat([home,away],ignore_index=True).sort_values("date").reset_index(drop=True)

df["Moving goals for"]=df.groupby('Team')['Goals for'].transform(lambda x: x.rolling(3).mean()).shift()
df["Moving goals against"]=df.groupby('Team')['Goals against'].transform(lambda x: x.rolling(3).mean()).shift()

df["Moving goals for"]=df.groupby("Team")["Moving goals for"].shift()
df["Moving goals against"]=df.groupby("Team")["Moving goals against"].shift()
df=df.dropna()

## 2.2 Get data from teams going to world cup

In [22]:
# Data from world cup teams
Teams=df.drop_duplicates(keep='last',subset='Team')
wc_teams=["Qatar","Ecuador","Senegal","Netherlands","England","Iran","United States","Wales","Argentina","Saudi Arabia","Mexico","Poland","France","Australia","Denmark","Tunisia",
"Spain","Costa Rica","Germany","Japan","Belgium","Canada","Morocco","Croatia","Brazil","Serbia","Switzerland","Cameroon","Portugal","Ghana","Uruguay","South Korea"
]
Teams=Teams.loc[Teams["Team"].isin(wc_teams),["Team","Moving goals for","Moving goals against","Elo"]].reset_index(drop=True)


In [23]:
#Teams.to_excel("WorldCup2022_Teams.xlsx")

## 2.3 Shaping data in necessary format

In [24]:
home=df.loc[df["local"]==1,["date","Team","Goals for","neutral","Match Name","Moving goals for","Moving goals against","Elo"]].rename(columns={"Team":"Home team","Goals for":"home_score","Moving goals for":"Mov_score_for_home","Moving goals against":"Mov_score_against_home","Elo":"Elo_home"})
away=df.loc[df["local"]==0,["date","Team","Goals for","Match Name","Moving goals for","Moving goals against","Elo"]].rename(columns={"Team":"Away team","Goals for":"away_score","Moving goals for":"Mov_score_for_away","Moving goals against":"Mov_score_against_away","Elo":"Elo_away"})
home_away=home.merge(away,how='inner',on=["date","Match Name"])

home_tr=home_away[["Home team","home_score","neutral","Mov_score_for_home","Mov_score_against_home","Mov_score_for_away","Mov_score_against_away","Elo_home","Elo_away"]].rename(columns={
    "Home team":"Team","home_score":"Goals","Mov_score_for_home":"mov_score_for","Mov_score_against_home":"mov_score_against","Mov_score_for_away":"mov_score_for_rival","Mov_score_against_away":"mov_score_against_rival",
    "Elo_home":"Elo","Elo_away":"Elo_rival"
})
away_tr=home_away[["Away team","away_score","neutral","Mov_score_for_home","Mov_score_against_home","Mov_score_for_away","Mov_score_against_away","Elo_home","Elo_away"]].rename(columns={
    "Away team":"Team","away_score":"Goals","Mov_score_for_home":"mov_score_for_rival","Mov_score_against_home":"mov_score_against_rival","Mov_score_for_away":"mov_score_for","Mov_score_against_away":"mov_score_against",
    "Elo_home":"Elo_rival","Elo_away":"Elo"
})

home_tr["home"]=home_tr["neutral"].apply(lambda x: 1 if x==0 else 0)
away_tr["home"]=0

home_tr=home_tr.drop(columns="neutral")
away_tr=away_tr.drop(columns="neutral")

## 2.4 Creating training dataframe and training model

In [25]:
training=home_tr.append(away_tr,ignore_index=True)
training["Elo_diff"]=training["Elo"]-training["Elo_rival"]

poisson_model = smf.glm(formula="Goals ~ home + mov_score_for + mov_score_against + mov_score_against_rival + Elo_diff", data=training, 
                        family=sm.families.Poisson()).fit()
poisson_model.summary()

0,1,2,3
Dep. Variable:,Goals,No. Observations:,26280.0
Model:,GLM,Df Residuals:,26274.0
Model Family:,Poisson,Df Model:,5.0
Link Function:,Log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-38272.0
Date:,"Tue, 08 Nov 2022",Deviance:,32877.0
Time:,19:07:43,Pearson chi2:,31100.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.2777
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.0273,0.017,-1.632,0.103,-0.060,0.006
home,0.3338,0.011,31.435,0.000,0.313,0.355
mov_score_for,0.0222,0.006,3.610,0.000,0.010,0.034
mov_score_against,0.0297,0.006,5.224,0.000,0.019,0.041
mov_score_against_rival,0.0114,0.005,2.134,0.033,0.001,0.022
Elo_diff,0.0025,2.85e-05,86.918,0.000,0.002,0.003


In [26]:
#poisson_model.save("international_model.pickle")

In [27]:
def simulate_match(model,G_local,G_perm_local, G_visitante,G_perm_vis,home, elo_h,elo_a,max_goals=5):
    elo_diff=elo_h-elo_a
    if home==1:
        home_goals_avg = model.predict(pd.DataFrame(data={'mov_score_for': G_local,  'mov_score_against': G_perm_local,'mov_score_against_rival':G_perm_vis,'home':1,"Elo_diff":elo_diff},index=[1])).values[0]
    else:
        home_goals_avg = model.predict(pd.DataFrame(data={'mov_score_for': G_local,  'mov_score_against': G_perm_local,'mov_score_against_rival':G_perm_vis,'home':0,"Elo_diff":elo_diff},index=[1])).values[0]
    
    away_goals_avg = model.predict(pd.DataFrame(data={'mov_score_for': G_visitante,  'mov_score_against': G_perm_vis,'mov_score_against_rival':G_perm_local,'home':0,"Elo_diff":-elo_diff},index=[1])).values[0]

    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [home_goals_avg, away_goals_avg]]
    return(np.outer(np.array(team_pred[0]), np.array(team_pred[1])))

In [28]:
def probabilidad(model,G_local,G_perm_local, G_visitante,G_perm_vis,home,elo_h,elo_a):
    mill_tol = simulate_match(model,G_local,G_perm_local, G_visitante,G_perm_vis,home,elo_h,elo_a)
    series=pd.Series([1-np.sum(np.triu(mill_tol, 1))-np.sum(np.diag(mill_tol)),np.sum(np.diag(mill_tol)),np.sum(np.triu(mill_tol, 1))],index=['Local','Empate','Visitante'])
    return series

In [29]:
def winner(model,G_local,G_perm_local, G_visitante,G_perm_vis,home,elo_h,elo_a):
    prob = probabilidad(model,G_local,G_perm_local, G_visitante,G_perm_vis,home,elo_h,elo_a)
    dif = abs(prob["Local"]-prob["Visitante"])
    if (dif < 0.08):
        return 1
    else:
        return prob.argmax()
    

In [30]:
home_away["home"]=home_away["neutral"].apply(lambda x: 1 if x==0 else 0)
home_away=home_away.drop(columns="neutral")
home_away["result"]=home_away.apply(lambda x: 0 if x["home_score"]>x["away_score"] else (1 if x["home_score"]==x["away_score"] else 2),axis=1)
home_away["predicted"]=home_away[["Mov_score_for_home","Mov_score_against_home","Mov_score_for_away","Mov_score_against_away","home","Elo_home","Elo_away"]].apply(lambda x: winner(poisson_model,x[0],x[1],x[2],x[3],x[4],x[5],x[6]),axis=1)

In [32]:
print(classification_report(home_away["result"], home_away["predicted"], target_names=["Local","Empate","Visitante"]))

              precision    recall  f1-score   support

       Local       0.67      0.78      0.72      6492
      Empate       0.27      0.16      0.20      2878
   Visitante       0.57      0.59      0.58      3770

    accuracy                           0.59     13140
   macro avg       0.51      0.51      0.50     13140
weighted avg       0.56      0.59      0.57     13140



In [36]:
acc=(home_away["predicted"]==home_away['result']).mean()
print(f"Accuracy: {np.round(acc*100,2)} %")

Accuracy: 59.13 %
