In [2]:
import pandas as pd
import os
import numpy as np
from itertools import permutations,combinations
from scipy.stats import poisson
import random
from itertools import product
import numpy as np
from functools import cmp_to_key
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import Independence
from statsmodels.genmod.families import Poisson

# 1. Data loading and cleansing

### 1.1 Loading the training datasets

In [3]:
lst = []
for file in os.listdir("raw_data"):
    if file.endswith(".csv"):
        lst.append(os.path.join("raw_data", file))

In [4]:
pd_lst = []

In [5]:
for file in lst:
    df = pd.read_csv(file, index_col=None, header = 0)
    df['Competition'] = file.split("\\")[1].split('.')[0]
    pd_lst.append(df)

In [6]:
complete_df = pd.concat(pd_lst, axis=0,ignore_index=True)

In [7]:
complete_df.columns

Index(['timestamp', 'date_GMT', 'status', 'attendance', 'home_team_name',
       'away_team_name', 'referee', 'Game Week', 'Pre-Match PPG (Home)',
       'Pre-Match PPG (Away)', 'home_ppg', 'away_ppg', 'home_team_goal_count',
       'away_team_goal_count', 'total_goal_count', 'total_goals_at_half_time',
       'home_team_goal_count_half_time', 'away_team_goal_count_half_time',
       'home_team_goal_timings', 'away_team_goal_timings',
       'home_team_corner_count', 'away_team_corner_count',
       'home_team_yellow_cards', 'home_team_red_cards',
       'away_team_yellow_cards', 'away_team_red_cards',
       'home_team_first_half_cards', 'home_team_second_half_cards',
       'away_team_first_half_cards', 'away_team_second_half_cards',
       'home_team_shots', 'away_team_shots', 'home_team_shots_on_target',
       'away_team_shots_on_target', 'home_team_shots_off_target',
       'away_team_shots_off_target', 'home_team_fouls', 'away_team_fouls',
       'home_team_possession', 'away_te

In [8]:
len(complete_df)

3246

In [9]:
complete_df.groupby('home_team_name')['home_team_name'].count()

home_team_name
Afghanistan         8
Alajuelense         5
Albania            23
Algeria            25
Alianza             3
                   ..
Yemen               9
Zambia             10
Zimbabwe           10
Águila              1
Étoile du Congo     7
Name: home_team_name, Length: 228, dtype: int64

### 1.2 Loading the Elo ratings

In [10]:
elo_rankings_df = pd.read_csv('elo_rankings.csv')

In [11]:
required_headers = ['timestamp', 'date_GMT','home_team_name','away_team_name'
                    ,'home_team_goal_count','away_team_goal_count','team_a_xg'
                    ,'team_b_xg','elo_ranking_x','elo_ranking_y']

In [12]:
home_elo = complete_df.merge(elo_rankings_df, left_on='home_team_name', right_on='Team')
total_elo = home_elo.merge(elo_rankings_df, left_on='away_team_name', right_on='Team')

In [13]:
elo_df = total_elo[required_headers].rename(columns={"elo_ranking_x":"elo_ranking_home","elo_ranking_y":"elo_ranking_away"})

In [15]:
elo_df.sort_values(by=['timestamp'])

Unnamed: 0,timestamp,date_GMT,home_team_name,away_team_name,home_team_goal_count,away_team_goal_count,team_a_xg,team_b_xg,elo_ranking_home,elo_ranking_away
769,1444334400,Oct 08 2015 - 8:00pm,Bolivia,Uruguay,0,2,0.0,0.0,1621,1936
183,1444336200,Oct 08 2015 - 8:30pm,Colombia,Peru,2,0,0.0,0.0,1911,1843
643,1444338000,Oct 08 2015 - 9:00pm,Venezuela,Paraguay,0,1,0.0,0.0,1682,1739
581,1444347000,Oct 08 2015 - 11:30pm,Chile,Brazil,2,0,0.0,0.0,1703,2169
847,1444348800,Oct 09 2015 - 12:00am,Argentina,Ecuador,0,2,0.0,0.0,2141,1840
...,...,...,...,...,...,...,...,...,...,...
2491,1668973500,Nov 20 2022 - 7:45pm,Austria,Italy,0,0,0.0,0.0,1729,1993
1239,1710975600,Mar 20 2024 - 11:00pm,Lithuania,Belarus,0,0,0.0,0.0,1248,1415
2216,1710975600,Mar 20 2024 - 11:00pm,Gibraltar,Cyprus,0,0,0.0,0.0,1053,1350
2631,1711234800,Mar 23 2024 - 11:00pm,Cyprus,Gibraltar,0,0,0.0,0.0,1350,1053


In [18]:
elo_df.query('team_a_xg < home_team_goal_count | team_b_xg < away_team_goal_count')['home_team_goal_count'].count()

1681

In [19]:
elo_df.head()

Unnamed: 0,timestamp,date_GMT,home_team_name,away_team_name,home_team_goal_count,away_team_goal_count,team_a_xg,team_b_xg,elo_ranking_home,elo_ranking_away
0,1546772400,Jan 06 2019 - 11:00am,Australia,Jordan,0,1,2.33,1.32,1719,1593
1,1654106400,Jun 01 2022 - 6:00pm,Australia,Jordan,2,1,1.63,1.11,1719,1593
2,1623772800,Jun 15 2021 - 4:00pm,Australia,Jordan,1,0,1.19,0.86,1719,1593
3,1553612400,Mar 26 2019 - 3:00pm,Iraq,Jordan,3,2,0.0,0.0,1574,1593
4,1613394000,Feb 15 2021 - 1:00pm,Uzbekistan,Jordan,2,0,0.0,0.0,1567,1593


In [22]:
elo_filtered_df = elo_df[elo_df['team_b_xg'] != 0].sort_values(by=['timestamp'])

In [23]:
elo_filtered_df.head()

Unnamed: 0,timestamp,date_GMT,home_team_name,away_team_name,home_team_goal_count,away_team_goal_count,team_a_xg,team_b_xg,elo_ranking_home,elo_ranking_away
246,1504602900,Sep 05 2017 - 9:15am,Nepal,Tajikistan,1,2,0.29,0.01,927,1318
157,1504611000,Sep 05 2017 - 11:30am,Cambodia,Vietnam,1,2,0.85,1.18,847,1391
276,1504611000,Sep 05 2017 - 11:30am,Singapore,Turkmenistan,1,1,1.47,2.3,1122,1250
727,1504611000,Sep 05 2017 - 11:30am,Philippines,Yemen,2,2,2.02,1.42,1095,1099
789,1504615500,Sep 05 2017 - 12:45pm,Malaysia,Hong Kong,1,1,1.33,1.95,1206,1139


### 1.3 Building the training dataset 

In [24]:
xg_reversed = elo_filtered_df[['away_team_name','home_team_name','elo_ranking_away','elo_ranking_home','team_b_xg']]
xg_reversed.columns = ['team1','team2','elo1', 'elo2', 'xg1']

xg_forward = elo_filtered_df[['home_team_name','away_team_name','elo_ranking_home','elo_ranking_away','team_a_xg']].rename(
    columns={"home_team_name":"team1","away_team_name":"team2"
             ,"elo_ranking_home":"elo1","elo_ranking_away":"elo2","team_a_xg":"xg1"})

xg_full = xg_forward.append(xg_reversed)
#elo_goals.columns = ['elo_team', 'elo_opponent', 'goals_team']

xg_full.head()

Unnamed: 0,team1,team2,elo1,elo2,xg1
246,Nepal,Tajikistan,927,1318,0.29
157,Cambodia,Vietnam,847,1391,0.85
276,Singapore,Turkmenistan,1122,1250,1.47
727,Philippines,Yemen,1095,1099,2.02
789,Malaysia,Hong Kong,1206,1139,1.33


In [27]:
xg_full.query("team1=='Brazil'|team2=='Brazil'")

Unnamed: 0,team1,team2,elo1,elo2,xg1
1137,Brazil,Panama,2169,1656,2.39
463,Brazil,Qatar,2169,1659,2.32
1000,Brazil,Honduras,2169,1472,3.17
505,Brazil,Colombia,2169,1911,1.27
177,Brazil,Peru,2169,1843,1.39
...,...,...,...,...,...
602,Chile,Brazil,1703,2169,1.20
572,Brazil,South Korea,2169,1783,2.46
573,Brazil,Japan,2169,1798,2.27
1796,Ghana,Brazil,1540,2169,0.88


# 2. Running the GEE model

In [28]:
poisson_regression = GEE.from_formula("xg1 ~ elo1 + elo2", data=xg_full, 
                                      groups=list(range(0, elo_filtered_df.shape[0]))*2, # Two examples from a match form 
                                                                                          # a group in Poisson regression
                                                                                          # (not used in this project)
                                      cov_struct=Independence(), family=Poisson())
goals_predictor = poisson_regression.fit()
print(goals_predictor.summary())

                               GEE Regression Results                              
Dep. Variable:                         xg1   No. Observations:                 4302
Model:                                 GEE   No. clusters:                     2151
Method:                        Generalized   Min. cluster size:                   2
                      Estimating Equations   Max. cluster size:                   2
Family:                            Poisson   Mean cluster size:                 2.0
Dependence structure:         Independence   Num. iterations:                     7
Date:                     Mon, 21 Nov 2022   Scale:                           1.000
Covariance type:                    robust   Time:                         22:16:23
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2153      0.035      6.132      0.000       0.146       0.284
elo1   

In [30]:
goals_predictor.predict(pd.DataFrame([{'elo1':1000,'elo2':1150}]))

0    1.123886
dtype: float64

In [31]:
goals_predictor.predict(pd.DataFrame([{'elo1':1150,'elo2':1000}]))

0    1.533059
dtype: float64

In [32]:
xg_A = goals_predictor.predict(pd.DataFrame([{'elo1':1840,'elo2':1659}]))[0]
xg_B = goals_predictor.predict(pd.DataFrame([{'elo1':1659,'elo2':1840}]))[0]

print(xg_A, xg_B)

1.6403771557201432 1.1278225867139373


In [33]:
xg_C = goals_predictor.predict(pd.DataFrame([{'elo1':1840,'elo2':3000}]))[0]
xg_D = goals_predictor.predict(pd.DataFrame([{'elo1':3000,'elo2':1840}]))[0]

print(xg_C, xg_D)

0.4242128513326225 4.680807673119139


# 3.Simulating group stage

## 3.1 Building the functions to simulate the games

- simulate_xg: runs the goal predictor to predict an xG value for a team
- poisson_matrix_outcome: Builds the goals matrix to determine the wins, draw and losing probabilities
- simulate_match: runs simulate_xg on both teams and the probability mass functions to determine the expected goals and return the probabilities

In [39]:
## Function to simulate the games

def simulate_xg(elo1, elo2):
    xg_lambda = goals_predictor.predict(pd.DataFrame([{'elo1': elo1, 'elo2': elo2}]))
    return xg_lambda

def simulate_match(elo_home, elo_away,max_goals):
    goals_range = range(0,max_goals)
    home_poisson_pmf = poisson.pmf(k=goals_range, mu=simulate_xg(elo_home,elo_away))
    away_poisson_pmf = poisson.pmf(k=goals_range, mu=simulate_xg(elo_away,elo_home))
    prod_table = np.array([(i*j) for i, j in product(home_poisson_pmf, away_poisson_pmf)])
    prod_table.shape = (max_goals, max_goals)
    home_win_prob, draw_prob, home_lose_prob, home_expected_goals, away_expected_goals = poisson_matrix_outcome(prod_table)
    return home_win_prob, draw_prob, home_lose_prob, home_expected_goals, away_expected_goals

def poisson_matrix_outcome(prod_table):
    win_prob = 0
    draw_prob = 0
    lose_prob = 0
    for i in range(0,len(prod_table)):
        for j in range(0,len(prod_table[i])):
            if i > j:
                win_prob += prod_table[i][j]
            if i == j:
                draw_prob += prod_table[i][j]
            if i < j:
                lose_prob += prod_table[i][j]
    
    indices = np.where(prod_table == prod_table.max())
    home_goals = int(indices[0])
    away_goals = int(indices[1])
    
    return win_prob, draw_prob, lose_prob, home_goals, away_goals

In [40]:
simulate_match(1659,1840,8)

(0.2618815990071854, 0.24504485353046457, 0.4927426511879934, 1, 1)

In [37]:
### Groups
group_country = [
    ['Qatar','Ecuador','Senegal','Netherlands'],
    ['England','Iran','United States','Wales'],
    ['Argentina','Saudi Arabia','Mexico','Poland'],
    ['France','Denmark','Australia','Tunisia'],
    ['Spain','Costa Rica','Germany','Japan'],
    ['Belgium','Canada','Morocco','Croatia'],
    ['Brazil','Serbia','Switzerland','Cameroon'],
    ['Portugal','Uruguay','Ghana','South Korea']
]

## 3.2 Each team's Elo Rating

In [38]:
for groups in group_country:
    for country in groups:
        elo_rank = elo_rankings_df[elo_rankings_df['Team']==country]['elo_ranking'].to_string(index=False)
        print(f"{country}'s elo_rank is{elo_rank}")

Qatar's elo_rank is 1659
Ecuador's elo_rank is 1840
Senegal's elo_rank is 1687
Netherlands's elo_rank is 2040
England's elo_rank is 1920
Iran's elo_rank is 1818
United States's elo_rank is 1798
Wales's elo_rank is 1790
Argentina's elo_rank is 2141
Saudi Arabia's elo_rank is 1640
Mexico's elo_rank is 1813
Poland's elo_rank is 1809
France's elo_rank is 2005
Denmark's elo_rank is 1971
Australia's elo_rank is 1719
Tunisia's elo_rank is 1687
Spain's elo_rank is 2045
Costa Rica's elo_rank is 1737
Germany's elo_rank is 1960
Japan's elo_rank is 1798
Belgium's elo_rank is 2025
Canada's elo_rank is 1770
Morocco's elo_rank is 1753
Croatia's elo_rank is 1922
Brazil's elo_rank is 2169
Serbia's elo_rank is 1893
Switzerland's elo_rank is 1929
Cameroon's elo_rank is 1612
Portugal's elo_rank is 2003
Uruguay's elo_rank is 1936
Ghana's elo_rank is 1540
South Korea's elo_rank is 1783


## 3.3 Group stage single simulation

In [41]:
group_matches = []
for i in group_country:
    group_matches.append(list(combinations(i,2)))

In [42]:
group_matches[0]

[('Qatar', 'Ecuador'),
 ('Qatar', 'Senegal'),
 ('Qatar', 'Netherlands'),
 ('Ecuador', 'Senegal'),
 ('Ecuador', 'Netherlands'),
 ('Senegal', 'Netherlands')]

In [43]:
for i in range(0, len(group_matches)):
    for j in range(0, len(group_matches[i])):
        home_country = group_matches[i][j][0]
        away_country = group_matches[i][j][1]
        elo_home = int(elo_rankings_df[elo_rankings_df['Team']==home_country]['elo_ranking'].to_string(index=False))
        elo_away = int(elo_rankings_df[elo_rankings_df['Team']==away_country]['elo_ranking'].to_string(index=False))
        win_prob, draw_prob, lose_prob, home_goals, away_goals = simulate_match(elo_home,elo_away,10)
        print(f"{home_country} vs {away_country} match probability with outcome: home win {win_prob * 100}, draw {draw_prob * 100}, away win {lose_prob * 100} and expected score {home_goals} - {away_goals}")

Qatar vs Ecuador match probability with outcome: home win 26.190524464929144, draw 24.50448589508337, away win 49.30407278226701 and expected score 1 - 1
Qatar vs Senegal match probability with outcome: home win 35.32554781472718, draw 25.736839178352938, away win 38.93726513602226 and expected score 1 - 1
Qatar vs Netherlands match probability with outcome: home win 16.29865430254142, draw 20.662288683449283, away win 63.03383467016451 and expected score 0 - 2
Ecuador vs Senegal match probability with outcome: home win 47.39223563543997, draw 24.831061631466717, away win 27.77596534907896 and expected score 1 - 1
Ecuador vs Netherlands match probability with outcome: home win 25.15034183912247, draw 24.09087496449673, away win 50.75761489954368 and expected score 1 - 1
Senegal vs Netherlands match probability with outcome: home win 17.52105207165702, draw 21.297179744152743, away win 61.17762412668337 and expected score 0 - 1
England vs Iran match probability with outcome: home win 43

In [None]:
### Points calculator
class GroupStage:
    def __init__(self,group):
        self.group = group
        
    def calc_points(self_goals,opp_goals):
        if self_goals > opp_goals:
            return 3
        elif self_goals == opp_goals:
            return 1
        else:
            return 0
        
    def simulate_match:
        
        for i in range(0, len(group_matches)):
        for j in range(0, len(group_matches[i])):
            home_country = group_matches[i][j][0]
            away_country = group_matches[i][j][1]
            elo_home = int(elo_rankings_df[elo_rankings_df['Team']==home_country]['elo_ranking'].to_string(index=False))
            elo_away = int(elo_rankings_df[elo_rankings_df['Team']==away_country]['elo_ranking'].to_string(index=False))
            win_prob, draw_prob, lose_prob, home_goals, away_goals = simulate_match(elo_home,elo_away,10)
            print(f"{home_country} vs {away_country} match probability with outcome: home win {win_prob * 100}, draw {draw_prob * 100}, away win {lose_prob * 100} and expected score {home_goals} - {away_goals}")
        
    def calculate_points_difference_goals(teams, match_results):
        points = {}
        difference = {}
        goals = {}

        for t in teams:
            points[t] = 0
            difference[t] = 0
            goals[t] = 0

        for r in match_results:
            t1,t2,g1,g2 = r
            points[t1] = points[t1] + points_team(g1, g2)
            points[t2] = points[t2] + points_team(g2, g1)
            difference[t1] = difference[t1] + (g1 - g2)
            difference[t2] = difference[t2] + (g2 - g1)
            goals[t1] = goals[t1] + g1
            goals[t2] = goals[t2] + g2

        return points, difference, goals

## 3.4 Simulate entire group

### Building the functions

- simulate_one_group: function to return the results of a simulated match within each group
- calc_points: function that returns number of points depending on victory, draw, loss
- calculate_points_difference_goals: function to calculate the total points, goal difference and goals scored to determine rank
- resolve_points: Calculates/determines who will qualify to next round
- simulate_group_stage: Overall orchestrating function to run the simulation for entire group stage

In [46]:
def simulate_group(group_matches, elo_rankings_df):
    results = []
    for i in range(0, len(group_matches)):
        home_country = group_matches[i][0]
        away_country = group_matches[i][1]
        elo_home = int(elo_rankings_df[elo_rankings_df['Team']==home_country]['elo_ranking'].to_string(index=False))
        elo_away = int(elo_rankings_df[elo_rankings_df['Team']==away_country]['elo_ranking'].to_string(index=False))
        win_prob, draw_prob, lose_prob, home_goals, away_goals = simulate_match(elo_home,elo_away,10)
        results.append([home_country,away_country,home_goals,away_goals])
    return results

In [47]:
def calc_points(self_goals,opp_goals):
        if self_goals > opp_goals:
            return 3
        elif self_goals == opp_goals:
            return 1
        else:
            return 0

In [48]:
def calculate_points_difference_goals(teams, match_results):
    points = {}
    difference = {}
    goals = {}
    
    for t in teams:
        points[t] = 0
        difference[t] = 0
        goals[t] = 0
        
    for r in match_results:
        t1,t2,g1,g2 = r
        points[t1] = points[t1] + calc_points(g1, g2)
        points[t2] = points[t2] + calc_points(g2, g1)
        difference[t1] = difference[t1] + (g1 - g2)
        difference[t2] = difference[t2] + (g2 - g1)
        goals[t1] = goals[t1] + g1
        goals[t2] = goals[t2] + g2
    return points, difference, goals

In [49]:
def resolve_points(points, difference, goals):
    points_df = pd.DataFrame([points,difference,goals]).T
    points_df.columns = ["points","difference","goals"]
    points_df['Team'] = points_df.index
    points_df.sort_values(by=['points','difference','goals'], inplace=True,
               ascending = [False, False,False])
    k = 0
    j = 1
    
    if all(points_df[['points','difference',"goals"]].iloc[0] == points_df[['points','difference',"goals"]].iloc[1]):
        k = random.randint(0, 1)
    if all(points_df[['points','difference',"goals"]].iloc[1] == points_df[['points','difference',"goals"]].iloc[2]):
        if k == 1:
            j = 2
        else:
            j = random.randint(1, 2)
            
    return points_df, points_df.iloc[k].Team , points_df.iloc[j].Team


In [51]:
def simulate_one_group(group_matches_ind,group_teams_ind,elo_rankings_df):
    all_results = simulate_group(group_matches_ind, elo_rankings_df)
    points, difference, goals = calculate_points_difference_goals(group_teams_ind,all_results)
    points_df, group_winner, group_runnerup = resolve_points(points, difference, goals)
    return points_df, group_winner, group_runnerup

In [57]:
group_letters = ['A','B','C','D','E','F','G','H']

In [58]:
def simulate_group_stage(group_matches,group_country, elo_rankings_df):
    group_rankings = []
    group_stage_results = {}
    print ('Running group stage simulation...')
    for i in range(0,len(group_matches)):
        group_rankings.append(simulate_one_group(group_matches[i],group_country[i],elo_rankings_df))
        group_stage_results[group_letters[i] + '1'] = group_rankings[i][1]
        group_stage_results[group_letters[i] + '2'] = group_rankings[i][2]
        print(f'Complete Group {group_letters[i]} simulation')
    return group_stage_results, group_rankings

In [57]:
points_df , _ , _ = simulate_one_group(group_matches[5],group_country[5],elo_rankings_df)

In [58]:
points_df

Unnamed: 0,points,difference,goals,Team
Belgium,3,0,3,Belgium
Canada,3,0,3,Canada
Morocco,3,0,3,Morocco
Croatia,3,0,3,Croatia


In [59]:
simulate_group_stage(group_matches,group_country, elo_rankings_df)

Running group stage simulation...
Complete Group A simulation
Complete Group B simulation
Complete Group C simulation
Complete Group D simulation
Complete Group E simulation
Complete Group F simulation
Complete Group G simulation
Complete Group H simulation


({'A1': 'Netherlands',
  'A2': 'Ecuador',
  'B1': 'England',
  'B2': 'Iran',
  'C1': 'Argentina',
  'C2': 'Poland',
  'D1': 'France',
  'D2': 'Australia',
  'E1': 'Spain',
  'E2': 'Germany',
  'F1': 'Canada',
  'F2': 'Morocco',
  'G1': 'Brazil',
  'G2': 'Switzerland',
  'H1': 'Portugal',
  'H2': 'Uruguay'},
 [(             points  difference  goals         Team
   Netherlands       7           3      4  Netherlands
   Ecuador           3           0      3      Ecuador
   Senegal           2          -1      2      Senegal
   Qatar             2          -2      2        Qatar,
   'Netherlands',
   'Ecuador'),
  (               points  difference  goals           Team
   England             3           0      3        England
   Iran                3           0      3           Iran
   United States       3           0      3  United States
   Wales               3           0      3          Wales, 'England', 'Iran'),
  (              points  difference  goals          Team
   Argent

In [63]:
ro16 = [['A1','B2'],['C1','D2'],['D1','C2'],['B1','A2'],['E1','F2'],['G1','H2'],['F1','E2'],['H1','G2']]
qf = [[ro16[4],ro16[5]],
      [ro16[0],ro16[1]],
       [ro16[6],ro16[7]],
       [ro16[2],ro16[3]]]
sf = [qf[0],qf[1],
     [qf[2],qf[3]]]


In [64]:
group_stage_results

{'A1': 'Netherlands', 'A2': 'Ecuador'}

# 5. Simulating the knockout rounds

### 5.1 Building the functions

- simulate_knockout_each_stage: a function that determines the winner at each stage of the knockout stages
- simulate_knockout_stages : orchestration function that runs the simulation for entire knockout stage

In [59]:
def simulate_knockout_each_stage(stage,elo_rankings_df,group_stage_results):
    winner_list = []
    for i in range(0,len(stage)):
        if stage == ro16:
            home_country = group_stage_results[stage[i][0]]
            away_country = group_stage_results[stage[i][1]]
        else:
            home_country = stage[i][0]
            away_country = stage[i][1]
        print(f"Home country {home_country}, away country {away_country}")
        elo_home = int(elo_rankings_df[elo_rankings_df['Team']==home_country]['elo_ranking'].to_string(index=False))
        elo_away = int(elo_rankings_df[elo_rankings_df['Team']==away_country]['elo_ranking'].to_string(index=False))
        win_prob, draw_prob, lose_prob, home_goals, away_goals = simulate_match(elo_home,elo_away,10)
        if max([win_prob, draw_prob, lose_prob]) == win_prob:
            winner = home_country
        elif max([win_prob, draw_prob, lose_prob]) == lose_prob:
            winner = away_country
        elif max([win_prob, draw_prob, lose_prob]) == draw_prob:
            print('Draw!')
            winner = random.choice([home_country,away_country])
        print (f"{home_country} vs {away_country} match probability with outcome: home win {win_prob * 100:.2f}%, draw {draw_prob * 100:.2f}%, away win {lose_prob * 100:.2f}% and expected score {home_goals} - {away_goals}")
        winner_list.append(winner)
    return winner_list

In [70]:
def simulate_knockout_stages(ro16,elo_rankings_df,group_stage_results):
    print('Simulating the ro16 games...')
    ro16_winners = simulate_knockout_each_stage(ro16,elo_rankings_df,group_stage_results)
    qf = [[ro16_winners[4],ro16_winners[5]],
          [ro16_winners[0],ro16_winners[1]],
           [ro16_winners[6],ro16_winners[7]],
           [ro16_winners[2],ro16_winners[3]]]
    print('Simulating the quarter finals now...')
    quarters_winner = simulate_knockout_each_stage(qf,elo_rankings_df,group_stage_results)
    sf = [[quarters_winner[0],quarters_winner[1]],
         [quarters_winner[2],quarters_winner[3]]]
    print('Simulating the semi finals now...')
    semi_winner = simulate_knockout_each_stage(sf,elo_rankings_df,group_stage_results)
    final = [[semi_winner[0],semi_winner[1]]]
    print('Simulating the finals now...')
    tournament_winner = simulate_knockout_each_stage(final,elo_rankings_df,group_stage_results)
    return tournament_winner

## 6. Simulating the entire tournament

In [73]:
def simulate_tournament():
    group_rankings, detailed_group_results = simulate_group_stage(group_matches,group_country, elo_rankings_df)
    ro16 = [['A1','B2'],['C1','D2'],['D1','C2'],['B1','A2'],['E1','F2'],['G1','H2'],['F1','E2'],['H1','G2']]
    winner = simulate_knockout_stages(ro16,elo_rankings_df,group_rankings)
    return winner , group_rankings, detailed_group_results

# Results

In [74]:
winner , group_rankings, detailed_group_results = simulate_tournament()

Running group stage simulation...
Complete Group A simulation
Complete Group B simulation
Complete Group C simulation
Complete Group D simulation
Complete Group E simulation
Complete Group F simulation
Complete Group G simulation
Complete Group H simulation
Simulating the ro16 games...
Home country Netherlands, away country Iran
Netherlands vs Iran match probability with outcome: home win 52.27%, draw 23.77%, away win 23.96% and expected score 1 - 1
Home country Argentina, away country Denmark
Argentina vs Denmark match probability with outcome: home win 48.77%, draw 24.40%, away win 26.83% and expected score 1 - 1
Home country France, away country Mexico
France vs Mexico match probability with outcome: home win 50.18%, draw 24.23%, away win 25.59% and expected score 1 - 1
Home country England, away country Ecuador
England vs Ecuador match probability with outcome: home win 42.50%, draw 25.36%, away win 32.14% and expected score 1 - 1
Home country Spain, away country Morocco
Spain vs M

In [75]:
import dataframe_image as dfi

In [76]:
for i in range(0,len(detailed_group_results)):
    df = detailed_group_results[i][0][['points','difference','goals']]
    dfi.export(df,f"df_xg_{i}.png")

In [88]:
def simulate_match_df(elo_home, elo_away,max_goals):
    goals_range = range(0,max_goals)
    home_poisson_pmf = poisson.pmf(k=goals_range, mu=simulate_xg(elo_home,elo_away))
    away_poisson_pmf = poisson.pmf(k=goals_range, mu=simulate_xg(elo_away,elo_home))
    prod_table = np.array([(i*j) for i, j in product(home_poisson_pmf, away_poisson_pmf)])
    prod_table.shape = (max_goals, max_goals)
    home_win_prob, draw_prob, home_lose_prob, home_expected_goals, away_expected_goals = poisson_matrix_outcome(prod_table)
    return home_win_prob, draw_prob, home_lose_prob, home_expected_goals, away_expected_goals,prod_table

In [97]:
_,_,_,_,_, prod_table_df = simulate_match_df(1920, 1818,6)