In [28]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt

# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (20, 8)

# Load the FIFA 2018 statistics file and preview its first rows.
df = pd.read_csv('data/FIFA_2018_Statistics.csv')
df.head()

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,14-06-2018,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,Yes,12.0,Group Stage,No,0,,
1,14-06-2018,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,No,,Group Stage,No,0,,
2,15-06-2018,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,No,,Group Stage,No,0,,
3,15-06-2018,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,Yes,89.0,Group Stage,No,0,,
4,15-06-2018,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,No,,Group Stage,No,0,1.0,90.0


## Removendo Valores Faltantes

* Observe que alguns valores estão com NaN (Not a Number)
* A olho nu, provavelmente não conseguiríamos visualizar todos os casos.
* Utilizaremos a função *isnull()*, que retorna um DataFrame com valores booleanos que apontam as ocorrências de valores faltantes.
* Obs.: a função *notnull()* faz o contrário: retorna True onde o valor **não** está faltando.

In [29]:
# Boolean mask of the first rows: True marks a missing (NaN) cell.
df.head().isnull()

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


* Uma maneira de quantificar os valores faltantes é usando o **isnull().sum()**

In [30]:
# Summing the boolean mask down each column (axis=0) counts its missing values.
df.isnull().sum(axis=0)

Date                        0
Team                        0
Opponent                    0
Goal Scored                 0
Ball Possession %           0
Attempts                    0
On-Target                   0
Off-Target                  0
Blocked                     0
Corners                     0
Offsides                    0
Free Kicks                  0
Saves                       0
Pass Accuracy %             0
Passes                      0
Distance Covered (Kms)      0
Fouls Committed             0
Yellow Card                 0
Yellow & Red                0
Red                         0
Man of the Match            0
1st Goal                   34
Round                       0
PSO                         0
Goals in PSO                0
Own goals                 116
Own goal Time             116
dtype: int64

### Melhorando a visualização

In [31]:
# Missing values per column, largest count first.
valores_faltantes = df.isnull().sum().sort_values(ascending=False)

# The same counts expressed as a percentage of the total number of rows.
percentual_valores_faltantes = valores_faltantes / len(df) * 100

# Put both Series side by side; `keys` supplies the column names of the summary table.
df_vf = pd.concat(
    [valores_faltantes, percentual_valores_faltantes],
    axis=1,
    keys=['Valores Faltantes', '% Valores Faltantes'],
)
df_vf

Unnamed: 0,Valores Faltantes,% Valores Faltantes
Own goal Time,116,90.625
Own goals,116,90.625
1st Goal,34,26.5625
Free Kicks,0,0.0
Team,0,0.0
Opponent,0,0.0
Goal Scored,0,0.0
Ball Possession %,0,0.0
Attempts,0,0.0
On-Target,0,0.0


### Eliminando valores faltantes - função dropna()

* Podemos eliminar as linhas que contêm NaN utilizando a função **dropna**.
* Podemos usar o parâmetro **subset**, no qual definimos em quais colunas os valores faltantes devem ser procurados; as linhas com NaN em alguma dessas colunas são eliminadas.

In [32]:
# Drop every row (axis=0) that has a NaN in any of the listed columns,
# then recount the missing values to confirm that none are left.
df_new = df.dropna(
    axis=0,
    how='any',
    subset=['1st Goal', 'Own goals', 'Own goal Time'],
)
df_new.isnull().sum()

Date                      0
Team                      0
Opponent                  0
Goal Scored               0
Ball Possession %         0
Attempts                  0
On-Target                 0
Off-Target                0
Blocked                   0
Corners                   0
Offsides                  0
Free Kicks                0
Saves                     0
Pass Accuracy %           0
Passes                    0
Distance Covered (Kms)    0
Fouls Committed           0
Yellow Card               0
Yellow & Red              0
Red                       0
Man of the Match          0
1st Goal                  0
Round                     0
PSO                       0
Goals in PSO              0
Own goals                 0
Own goal Time             0
dtype: int64

## Interpolação

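Podemos fazer interpolação no pandas usando o parâmetro **method** da função **fillna()**. Nas versões mais recentes do pandas esse parâmetro foi descontinuado; o mesmo resultado é obtido com os métodos **ffill()** e **bfill()**.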

### Exemplo 1 - method ffill
* Preenche com os valores das linhas acima (começa de cima para baixo).
* É utilizado principalmente em séries temporais

In [33]:
# Forward fill: each NaN takes the last valid value found above it in the same column.
# NaNs that have no valid value above them (e.g. at the top of a column) stay NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the `method`
# keyword of fillna is deprecated in recent pandas releases.
df_new = df.ffill()
df_new[['Team', 'Opponent', 'Goal Scored', '1st Goal', 'Own goals', 'Own goal Time']].head()

Unnamed: 0,Team,Opponent,Goal Scored,1st Goal,Own goals,Own goal Time
0,Russia,Saudi Arabia,5,12.0,,
1,Saudi Arabia,Russia,0,12.0,,
2,Egypt,Uruguay,0,12.0,,
3,Uruguay,Egypt,1,89.0,,
4,Morocco,Iran,0,89.0,1.0,90.0


### Exemplo 2 - method bfill
* Preenche com os valores das linhas abaixo (começa de baixo para cima).
* É utilizado principalmente em séries temporais

In [34]:
# Backward fill: each NaN takes the next valid value found below it in the same column.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the `method`
# keyword of fillna is deprecated in recent pandas releases.
df_new = df.bfill()
df_new[['Team', 'Opponent', 'Goal Scored', '1st Goal', 'Own goals', 'Own goal Time']].head()

Unnamed: 0,Team,Opponent,Goal Scored,1st Goal,Own goals,Own goal Time
0,Russia,Saudi Arabia,5,12.0,1.0,90.0
1,Saudi Arabia,Russia,0,89.0,1.0,90.0
2,Egypt,Uruguay,0,89.0,1.0,90.0
3,Uruguay,Egypt,1,89.0,1.0,90.0
4,Morocco,Iran,0,90.0,1.0,90.0


### Exemplo 3 - usando a média da coluna
* Preenche com a média dos registros da coluna

In [35]:
# Fill each NaN with the mean of its own column.
# numeric_only=True restricts the means to the numeric columns; without it, recent
# pandas versions raise a TypeError because the frame also holds text columns
# (Date, Team, Round, ...). fillna() only touches the columns present in the
# Series of means, so the text columns are left unchanged.
df_new = df.fillna(df.mean(numeric_only=True))

df_new[['Team', 'Opponent', 'Goal Scored', '1st Goal', 'Own goals', 'Own goal Time']].head()

Unnamed: 0,Team,Opponent,Goal Scored,1st Goal,Own goals,Own goal Time
0,Russia,Saudi Arabia,5,12.0,1.0,45.833333
1,Saudi Arabia,Russia,0,39.457447,1.0,45.833333
2,Egypt,Uruguay,0,39.457447,1.0,45.833333
3,Uruguay,Egypt,1,89.0,1.0,45.833333
4,Morocco,Iran,0,39.457447,1.0,90.0


## Preenchendo dados em valores faltantes

* Nesta base, não faz muito sentido remover os registros que possuem alguma coluna com valores faltantes;
* Também não faz muito sentido fazer interpolação;
* Uma alternativa é substituir os valores faltantes por valores que façam sentido.
* Por exemplo:
    * Podemos colocar $0$ no atributo **Own goals** e **Own goal Time**
    * Podemos colocar $-1$ na coluna **1st Goal** para representar que, como o time não fez gol, não existe o momento em que aconteceu o primeiro gol.

In [36]:
# Sentinel values for the missing entries: 0 for the own-goal columns and
# -1 for '1st Goal' (the team scored no goal, so there is no first-goal minute).
valores_padrao = {'Own goals': 0, 'Own goal Time': 0, '1st Goal': -1}
df = df.fillna(value=valores_padrao)

# Recount the missing values per column.
df.isnull().sum()

Date                      0
Team                      0
Opponent                  0
Goal Scored               0
Ball Possession %         0
Attempts                  0
On-Target                 0
Off-Target                0
Blocked                   0
Corners                   0
Offsides                  0
Free Kicks                0
Saves                     0
Pass Accuracy %           0
Passes                    0
Distance Covered (Kms)    0
Fouls Committed           0
Yellow Card               0
Yellow & Red              0
Red                       0
Man of the Match          0
1st Goal                  0
Round                     0
PSO                       0
Goals in PSO              0
Own goals                 0
Own goal Time             0
dtype: int64

In [37]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,14-06-2018,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,Yes,12.0,Group Stage,No,0,0.0,0.0
1,14-06-2018,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,No,-1.0,Group Stage,No,0,0.0,0.0
2,15-06-2018,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,No,-1.0,Group Stage,No,0,0.0,0.0
3,15-06-2018,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,Yes,89.0,Group Stage,No,0,0.0,0.0
4,15-06-2018,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,No,-1.0,Group Stage,No,0,1.0,90.0


## Codificando os atributos booleanos com valores numéricos

In [38]:
# Print each column's dtype and non-null count; columns listed as `object` hold non-numeric values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 27 columns):
Date                      128 non-null object
Team                      128 non-null object
Opponent                  128 non-null object
Goal Scored               128 non-null int64
Ball Possession %         128 non-null int64
Attempts                  128 non-null int64
On-Target                 128 non-null int64
Off-Target                128 non-null int64
Blocked                   128 non-null int64
Corners                   128 non-null int64
Offsides                  128 non-null int64
Free Kicks                128 non-null int64
Saves                     128 non-null int64
Pass Accuracy %           128 non-null int64
Passes                    128 non-null int64
Distance Covered (Kms)    128 non-null int64
Fouls Committed           128 non-null int64
Yellow Card               128 non-null int64
Yellow & Red              128 non-null int64
Red                       128 non-nul

In [39]:
# Encode the two Yes/No columns as 1/0.
# The mapping also sends 1 -> 1 and 0 -> 0 so that running this cell a second time is
# harmless: with only 'Yes'/'No' as keys, Series.map would turn the already-encoded
# values into NaN on a re-run.
yes_no_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['Man of the Match'] = df['Man of the Match'].map(yes_no_codes)
df['PSO'] = df['PSO'].map(yes_no_codes)

# Show a bounded preview instead of dumping the whole frame.
df.head(10)

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,14-06-2018,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,1,12.0,Group Stage,0,0,0.0,0.0
1,14-06-2018,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,0,-1.0,Group Stage,0,0,0.0,0.0
2,15-06-2018,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,0,-1.0,Group Stage,0,0,0.0,0.0
3,15-06-2018,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,1,89.0,Group Stage,0,0,0.0,0.0
4,15-06-2018,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,0,-1.0,Group Stage,0,0,1.0,90.0
5,15-06-2018,Iran,Morocco,1,36,8,2,5,1,2,...,3,0,0,1,90.0,Group Stage,0,0,0.0,0.0
6,15-06-2018,Portugal,Spain,3,39,8,3,2,3,4,...,1,0,0,0,4.0,Group Stage,0,0,0.0,0.0
7,15-06-2018,Spain,Portugal,3,61,12,5,5,2,5,...,1,0,0,1,24.0,Group Stage,0,0,0.0,0.0
8,16-06-2018,France,Australia,2,51,12,5,4,3,5,...,1,0,0,1,58.0,Group Stage,0,0,0.0,0.0
9,16-06-2018,Australia,France,1,49,4,1,2,1,1,...,3,0,0,0,62.0,Group Stage,0,0,1.0,81.0


### Exportando a base de dados

* Vamos salvar essas alterações, para podermos trabalhar com essa base de dados a partir de agora.

In [40]:
# Save the cleaned frame as CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='data/FIFA_2018_Statistics_sem_NaN.csv', index=False)