In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import plotly.offline as offline
import plotly.graph_objects as go

# FIFA World Cup 2022 dataset

In [2]:
# Load the match list and normalise the header names to lower case.
data = pd.read_csv('data/fifa_world_cup_matches.csv')
data.columns = [x.lower() for x in data.columns]

# Keep only the columns used below. The explicit .copy() makes `data` an
# independent frame, so the column assignments that follow do not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
data = data[['team1', 'team2', 'number of goals team1', 'number of goals team2',
             'date', 'hour', 'category']].copy()

# Title-case the team names (e.g. 'UNITED STATES' -> 'United States').
data['team1'] = data['team1'].str.title()
data['team2'] = data['team2'].str.title()

# Dates are parsed with a day / abbreviated-month / year format.
data['date'] = pd.to_datetime(data['date'], format='%d %b %Y')

data.head()

Unnamed: 0,team1,team2,number of goals team1,number of goals team2,date,hour,category
0,Qatar,Ecuador,0,2,2022-11-20,17 : 00,Group A
1,England,Iran,6,2,2022-11-21,14 : 00,Group B
2,Senegal,Netherlands,0,2,2022-11-21,17 : 00,Group A
3,United States,Wales,1,1,2022-11-21,20 : 00,Group B
4,Argentina,Saudi Arabia,1,2,2022-11-22,11 : 00,Group C


In [3]:
# Distinct values of the `team1` column, in order of first appearance.
countries = pd.unique(data['team1'])
countries

array(['Qatar', 'England', 'Senegal', 'United States', 'Argentina',
       'Denmark', 'Mexico', 'France', 'Morocco', 'Germany', 'Spain',
       'Belgium', 'Switzerland', 'Uruguay', 'Portugal', 'Brazil', 'Wales',
       'Netherlands', 'Tunisia', 'Poland', 'Japan', 'Croatia', 'Cameroon',
       'Korea Republic', 'Ecuador', 'Iran', 'Australia', 'Saudi Arabia',
       'Canada', 'Costa Rica', 'Ghana', 'Serbia'], dtype=object)

In [4]:
# Distinct values of the `category` column, in order of first appearance.
categories = pd.unique(data['category'])
categories

array(['Group A', 'Group B', 'Group C', 'Group D', 'Group F', 'Group E',
       'Group G', 'Group H', 'Round of 16', 'Quarter-final', 'Semi-final',
       'Play-off for third place', 'Final'], dtype=object)

In [5]:
# Categories that are expanded into several numbered nodes ('<name>-1',
# '<name>-2', ...); every other category contributes a single node.
matches_per_round = {'Round of 16': 8, 'Quarter-final': 4, 'Semi-final': 2}

stage_nodes = []
for stage in categories:
    n_matches = matches_per_round.get(stage)
    if n_matches is None:
        stage_nodes.append(stage)
    else:
        stage_nodes.extend(f'{stage}-{k}' for k in range(1, n_matches + 1))

# Node order: the team names, then the stage nodes, then a second
# 'Argentina' entry as the very last node.
labels = np.append(countries, stage_nodes + ['Argentina'])

In [6]:
# 'Qatar', 'England', 'Senegal', 'United States', 'Argentina',         00-04
# 'Denmark', 'Mexico', 'France', 'Morocco', 'Germany', 'Spain',        05-10
# 'Belgium', 'Switzerland', 'Uruguay', 'Portugal', 'Brazil', 'Wales',  11-16
# 'Netherlands', 'Tunisia', 'Poland', 'Japan', 'Croatia', 'Cameroon',  17-22
# 'Korea Republic', 'Ecuador', 'Iran', 'Australia', 'Saudi Arabia',    23-27
# 'Canada', 'Costa Rica', 'Ghana', 'Serbia', 'Group A', 'Group B',     28-33
# 'Group C', 'Group D', 'Group F', 'Group E', 'Group G', 'Group H',    34-39
# 'Round of 16-1', 'Round of 16-2', 'Round of 16-3', 'Round of 16-4',  40-43
# 'Round of 16-5', 'Round of 16-6', 'Round of 16-7', 'Round of 16-8',  44-47
# 'Quarter-final-1', 'Quarter-final-2', 'Quarter-final-3',             48-50
# 'Quarter-final-4', 'Semi-final-1', 'Semi-final-2',                   51-53
# 'Play-off for third place','Final'                                   54-55
# 'Argentina' (second entry, appended after the loop)                  56

In [7]:
from matplotlib.colors import to_hex

# Hex strings for every colour of the two qualitative matplotlib palettes.
colorsb = [to_hex(rgb) for rgb in plt.get_cmap('tab20b').colors]
colorsc = [to_hex(rgb) for rgb in plt.get_cmap('tab20c').colors]

# Single lookup table: the tab20b colours come first, followed by tab20c.
cm = np.array(colorsb + colorsc)

In [8]:
offline.init_notebook_mode(connected=True)

# All integers in the link lists below are positions in `labels`.
#
# `labels` lists the groups in order of first appearance in the data, which
# puts 'Group F' (index 36) before 'Group E' (index 37). The Group E teams
# (Japan, Spain, Germany, Costa Rica -> label indices 20, 10, 9, 29) therefore
# have to point at the 'Group E' node rather than blindly at index 36, and
# likewise for Group F (Morocco, Croatia, Belgium, Canada -> 8, 21, 11, 28).
# Looking both nodes up by name keeps every team attached to the correctly
# labelled group regardless of the order in which the groups were appended.
node_names = list(labels)
grp_e = node_names.index('Group E')
grp_f = node_names.index('Group F')

link_source = [
    # team -> group (Groups A-D)
    0, 24, 2, 17, 1, 3, 25, 16, 4, 19, 6, 27, 7, 26, 18, 5,
    # team -> group (Groups E-H)
    20, 10, 9, 29, 8, 21, 11, 28, 15, 12, 22, 31, 14, 23, 13, 30,
    # group -> round of 16 (two links per group)
    32, 32, 33, 33, 34, 34, 35, 35, grp_e, grp_e, grp_f, grp_f, 38, 38, 39, 39,
    # round of 16 -> quarter-final
    40, 41, 42, 43, 44, 45, 46, 47,
    # quarter-final -> semi-final
    48, 49, 50, 51,
    # semi-final -> third-place play-off, semi-final -> final, final -> last node
    52, 53, 52, 53, 55,
]

link_target = [
    32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
    grp_e, grp_e, grp_e, grp_e, grp_f, grp_f, grp_f, grp_f, 38, 38, 38, 38, 39, 39, 39, 39,
    40, 41, 41, 40, 42, 43, 43, 42, 44, 45, 45, 44, 46, 47, 47, 46,
    48, 48, 49, 49, 50, 50, 51, 51,
    52, 52, 53, 53,
    54, 54, 55, 55, 56,
]

# Link widths, in the same order as link_source / link_target.
link_value = [
    1, 4, 6, 7, 7, 5, 3, 1, 6, 4, 4, 3, 6, 6, 4, 1,
    6, 4, 4, 3, 7, 5, 4, 1, 6, 6, 4, 1, 6, 4, 4, 3,
    7, 6, 7, 5, 6, 4, 6, 6, 6, 4, 7, 5, 6, 6, 6, 4,
    3, 2, 4, 4, 3, 3, 3, 6,
    6, 5, 2, 1,
    1, 1, 3, 2, 7,
]

# Link colours, picked from the combined palette `cm`.
link_color = [
    cm[3], cm[2], cm[1], cm[0], cm[4], cm[5], cm[6], cm[7],
    cm[8], cm[10], cm[9], cm[11], cm[12], cm[13], cm[14], cm[15],
    cm[16], cm[18], cm[17], cm[19], cm[20], cm[21], cm[22], cm[23],
    cm[25], cm[24], cm[26], cm[27], cm[28], cm[30], cm[29], cm[31],
    cm[0], cm[1], cm[4], cm[5], cm[8], cm[9], cm[12], cm[13],
    cm[16], cm[17], cm[20], cm[21], cm[24], cm[25], cm[28], cm[29],
    cm[0], cm[4], cm[9], cm[12], cm[16], cm[20], cm[24], cm[28],
    cm[4], cm[9], cm[20], cm[24],
    cm[9], cm[24], cm[4], cm[20], cm[4],
]

# The four lists describe the same links position by position; fail early if
# an edit to one of them leaves the lists out of step.
assert len(link_source) == len(link_target) == len(link_value) == len(link_color)

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color="silver",
    ),
    link=dict(
        source=link_source,
        target=link_target,
        value=link_value,
        color=link_color,
    ),
)])

fig.update_layout(title={'text': "Sankey diagram for the 2022 FIFA World Cup",
                         'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'}, font_size=12)

fig.show()