# Importing modules and setting parameters 

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

%matplotlib inline
# Plot defaults: ggplot style sheet, 12x8-inch figures, seaborn "talk" context.
plt.style.use('ggplot')
plt.rc('figure', figsize=(12, 8))
sns.set(context='talk', rc={'figure.figsize': (12, 8)})

# Database loading

In [None]:
# Read the World Cup matches CSV; the path is relative to the current working directory.
df = pd.read_csv('data/WorldCupMatches.csv') 

## First analysis 

In [None]:
# (number of rows, number of columns) of the raw frame
df.shape

In [None]:
# preview of the first rows
df.head()

In [None]:
# Four random rows; the fixed seed keeps the displayed sample identical
# across "Restart & Run All" executions.
df.sample(4, random_state=42)

# Data cleaning 

In [None]:
# column dtypes and non-null counts, to spot missing values before cleaning
df.info()

In [None]:
# Remove the columns considered superfluous for this analysis.
superfluous_columns = [
    'Home Team Initials',
    'Away Team Initials',
    'MatchID',
    'RoundID',
    'Assistant 2',
    'Assistant 1',
    'Win conditions',
    'Stage',
]
df.drop(columns=superfluous_columns, inplace=True)

In [None]:
# Parse the kick-off timestamp, keep only its month and hour as features,
# then discard the raw 'Datetime' column.
kickoff = pd.to_datetime(df['Datetime'])
df['Month'] = kickoff.dt.month
df['game time'] = kickoff.dt.hour
df.drop(columns=['Datetime'], inplace=True)

In [None]:
# let's keep only the nationality of the referee
def arb_func(val):
    """Extract the nationality code from a raw ``Referee`` value.

    The code is taken as the (up to) three characters that follow the first
    opening parenthesis, e.g. ``'NAME (URU)'`` gives ``'URU'``.

    Parameters
    ----------
    val : str or missing value
        A single cell of the ``Referee`` column.

    Returns
    -------
    str or float
        The characters following the first ``'('`` (at most three), or
        ``np.nan`` when ``val`` is missing or contains no ``'('``.
    """
    # pd.isna also catches None and NaN objects that are not the np.nan
    # singleton, which an identity check (`val is np.nan`) would miss.
    if pd.isna(val):
        return np.nan
    _, paren, after_paren = val.partition('(')
    if not paren:
        # No parenthesised part: report the nationality as missing instead of
        # failing with an IndexError.
        return np.nan
    return after_paren[:3]
        
# Replace the raw 'Referee' column with the nationality extracted by arb_func.
referee_nationalities = df['Referee'].map(arb_func)
df['Referee nationality'] = referee_nationalities
df.drop(columns=['Referee'], inplace=True)

In [None]:
# Delete the matches with no information: rows where either team name is missing.
# (The frame is `df`; the previous name `df_gflight` was never defined.)
df.dropna(subset=['Home Team Name', 'Away Team Name'], inplace=True)

In [None]:
# Missing values per column. display() is required because this is not the
# last expression of the cell; a bare expression here would be discarded.
display(df.isna().sum())

# Replace the missing attendances with the mean attendance.
df['Attendance'] = df['Attendance'].fillna(df['Attendance'].mean())

In [None]:
# Every column whose dtype is neither int64 nor float64 is treated as categorical.
categorical_variable = {
    column: dtype
    for column, dtype in df.dtypes.items()
    if dtype != 'int64' and dtype != 'float64'
}

# Convert each of those columns to the pandas 'category' dtype.
for column in categorical_variable:
    df[column] = df[column].astype('category')

# We can now start the EDA

In [None]:
# summary statistics of the continuous (numeric) variables
df.describe()

- The majority of the games take place at 5pm, between the months of May and July 
- Attendance is heterogeneous, with an average of 45164.8 supporters  
- On average, the Home Team scores more goals than the Away Team
- The majority of the goals are scored in the second half

In [None]:
# categorical variables: summary of every column that is not float64/int64
df.describe(exclude=['float64', 'int64'])

- Estadio Azteca, in Mexico City, is the stadium that has hosted the most World Cup games
- Brazil is the country that has played the most games as Home Team
- Mexico is the country that has played the most games as Away Team
- Italian is the most common referee nationality

In [None]:
# Correlation matrix (in %) of the numeric columns.
# The non-numeric (category) columns are excluded explicitly: since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises instead.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(
    100 * numeric_corr,
    square=True,
    annot=True,
    fmt='.1f',
    cmap=sns.cm.vlag)

- Home Team Goals tends to decrease as the years go by
- Attendance tends to increase as the years go by

In [None]:
# Attendance against year, with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.regplot(
    x=df['Year'], y=df['Attendance'],
    scatter_kws={'color': 'green'}, line_kws={'color': 'red'})

In [None]:
# Home team goals against year, with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.regplot(
    x=df['Year'], y=df['Home Team Goals'],
    scatter_kws={'color': 'green'}, line_kws={'color': 'red'})

In [None]:
# Focus on the French team: keep the matches where France is the home team.
# df_France is the boolean row mask; df_filtered holds the selected rows.
df_France = (df['Home Team Name'] == 'France')
df_filtered = df.loc[df_France]
df_filtered

In [1]:
# summary of the non-numeric columns for the French team's matches
df_filtered.describe(exclude=['float64', 'int64'])

NameError: name 'df_filtered' is not defined


Spain is where the French team played the most games, and Spanish is the most common referee nationality in those games. Mexico is the opponent France has faced most often.

In [None]:
# summary statistics of the numeric columns for the French team's matches
df_filtered.describe()

On average, the French team scores more goals than it concedes, with an average attendance of 41870.


In [None]:
# Distribution of referee nationalities over the French team's matches.
# NOTE(review): bins=16 presumably matches the number of nationalities shown — confirm.
df_filtered['Referee nationality'].hist(bins=16)

In [None]:
# Median goals per referee nationality over the matches in df_filtered
# (France is the home team there, so 'Home Team Goals' are France's goals
# and 'Away Team Goals' are the goals it conceded).
df_regroupement = df_filtered.pivot_table(
    values=['Home Team Goals', 'Away Team Goals'],
    index=['Referee nationality'], 
    aggfunc='median')
# Display the table sorted by conceded goals (ascending); the sorted copy is
# only shown, df_regroupement itself keeps its original order.
df_regroupement.sort_values(by = ['Away Team Goals'])

We notice that the French team tends to fare poorly in World Cup games officiated by Colombian and English referees.