In [1]:
import pandas as pd
import plotly.express as px
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sqlalchemy import create_engine, inspect, text

## Questão 1
- Como primeiro passo do desafio, faça a leitura/importação desses arquivos via python e realize o upload/carregamento desses dados em um banco SQLite.
    - Crie um schema chamado: test_analytics_engineer
    - Crie as tabelas com os mesmos nomes dos arquivos .csv
    - Respeite a tipagem e os nomes das colunas dos arquivos .csv
---
## Questão 3
- Faça uma análise exploratória dos dados no sentido de validar a qualidade dos dados destes datasets.
- Use sua criatividade e imaginação para buscar “sujeiras” na base de dados.
    - Lembre-se que queremos gerar insights com dados, então realize relações com tabelas que nos forneçam alguma informação relevante para os dados tratados analisados. Crie análises exploratórias dos dados.

    - Exemplos:
        - Qual a proporção entre jogadores destros e canhotos? Quais os seus nomes?
        - Qual o nome do país com maior saldo de gols

Leitura dos arquivos

In [2]:
country = pd.read_csv('Data/Country.csv')
league = pd.read_csv('Data/League.csv')
match = pd.read_csv('Data/Match.csv')
player_attributes = pd.read_csv('Data/Player_Attributes.csv')
player = pd.read_csv('Data/Player.csv')
team_attributes = pd.read_csv('Data/Team_Attributes.csv')
team = pd.read_csv('Data/Team.csv')

Verificação dos dados de Country.csv

In [3]:
country.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      17 non-null     float64
 1   name    16 non-null     object 
dtypes: float64(1), object(1)
memory usage: 416.0+ bytes


In [4]:
print('Nulos: ', country.isnull().sum())
print('---')
print('Ids duplicados: ', country.id.duplicated().sum())
print('Países duplicados: ', country.name.duplicated().sum())

Nulos:  id      1
name    2
dtype: int64
---
Ids duplicados:  5
Países duplicados:  1


In [5]:
country

Unnamed: 0,id,name
0,1.0,Belgium
1,1729.0,England
2,4769.0,France
3,7809.0,Germany
4,10257.0,Italy
5,13274.0,Netherlands
6,15722.0,Poland
7,17642.0,Portugal
8,19694.0,Scotland
9,21518.0,Spain


Remoção dos dados nulos de Country

In [6]:
country.dropna(inplace=True)
country.isnull().sum()

id      0
name    0
dtype: int64

Remoção dos **.0**, conversão do tipo do Id de **float** para **int** e remoção do duplicados (optei por manter os com nomes em inglês).

In [7]:
country['id'] = country['id'].astype(str).str.rstrip('.0').astype(int)
country.drop_duplicates(subset=['id'], keep='first', inplace=True)
country

Unnamed: 0,id,name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


Análise dos tipos para criação das tabelas

In [8]:
country.dtypes

id       int64
name    object
dtype: object

Verificação dos dados de League.csv

In [9]:
league.head(20)

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,44560,44560,Brasileirão Série A
2,1729,1729,England Premier League
3,4769,4769,France Ligue 1
4,7809,7809,Germany 1. Bundesliga
5,10257,10257,Italy Serie A
6,51220,51220,Liga da Angola
7,13274,13274,Netherlands Eredivisie
8,15722,15722,Poland Ekstraklasa
9,17642,17642,Portugal Liga ZON Sagres


In [10]:
league.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          13 non-null     int64 
 1   country_id  13 non-null     int64 
 2   name        13 non-null     object
dtypes: int64(2), object(1)
memory usage: 440.0+ bytes


In [11]:
print('Nulos: ', league.isnull().sum())
print('---')
print('Id duplicados: ', league.id.duplicated().sum())
print('Duplicados: ', league.duplicated().sum())

Nulos:  id            0
country_id    0
name          0
dtype: int64
---
Id duplicados:  0
Duplicados:  0


In [12]:
league.dtypes

id             int64
country_id     int64
name          object
dtype: object

Sem dados duplicados, nulos e typos corretos. 

Verificação dos dados de Match.csv

In [13]:
match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25979 entries, 0 to 25978
Columns: 115 entries, id to BSA
dtypes: float64(96), int64(9), object(10)
memory usage: 22.8+ MB


In [14]:
match.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [15]:
match.dtypes

id              int64
country_id      int64
league_id       int64
season         object
stage           int64
               ...   
GBD           float64
GBA           float64
BSH           float64
BSD           float64
BSA           float64
Length: 115, dtype: object

Conversão do tipo **object** para **date** da coluna date

In [16]:
match.date.info()

<class 'pandas.core.series.Series'>
RangeIndex: 25979 entries, 0 to 25978
Series name: date
Non-Null Count  Dtype 
--------------  ----- 
25979 non-null  object
dtypes: object(1)
memory usage: 203.1+ KB


In [17]:
match['date'] = pd.to_datetime(match['date'])
match.date.info()

<class 'pandas.core.series.Series'>
RangeIndex: 25979 entries, 0 to 25978
Series name: date
Non-Null Count  Dtype         
--------------  -----         
25979 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 203.1 KB


In [18]:
match.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [19]:
print('Nulos: ', match.isnull().sum())
print('---')
print('id duplicados: ', match.id.duplicated().sum())
print('Duplicados: ', match.duplicated().sum())

Nulos:  id                0
country_id        0
league_id         0
season            0
stage             0
              ...  
GBD           11817
GBA           11817
BSH           11818
BSD           11818
BSA           11818
Length: 115, dtype: int64
---
id duplicados:  0
Duplicados:  0


Por não conter ids duplicado optei por manter os dados de apostas quem contém **null** por não saber sobre o contexto deles ainda

Removi os nulos para realizar análises sobre os dados.

In [20]:
match.dropna(inplace=True)
match.isnull().sum()

id            0
country_id    0
league_id     0
season        0
stage         0
             ..
GBD           0
GBA           0
BSH           0
BSD           0
BSA           0
Length: 115, dtype: int64

Verificação dos dados de player_attributes.csv

Necessário a conversão dos dados para normalizar pois os dados do **.csv** não estão no formato adequado para a questão 2.

In [21]:
player_attributes['Player_Attributes'] = player_attributes['Player_Attributes'].apply(json.loads)
player_attributes = pd.json_normalize(player_attributes['Player_Attributes'])# type:ignore
player_attributes.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [22]:
player_attributes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   183978 non-null  object 
 1   player_fifa_api_id   183978 non-null  object 
 2   player_api_id        183978 non-null  object 
 3   date                 183978 non-null  object 
 4   overall_rating       183142 non-null  float64
 5   potential            183142 non-null  float64
 6   preferred_foot       183142 non-null  object 
 7   attacking_work_rate  180748 non-null  object 
 8   defensive_work_rate  183142 non-null  object 
 9   crossing             183142 non-null  float64
 10  finishing            183142 non-null  float64
 11  heading_accuracy     183142 non-null  float64
 12  short_passing        183142 non-null  float64
 13  volleys              181265 non-null  float64
 14  dribbling            183142 non-null  float64
 15  curve            

Conversão dos tipos dos campos com id de **object** para **int** e do campo date para **date** 

In [23]:
player_attributes['id'] = player_attributes['id'].astype(int)
player_attributes['player_fifa_api_id'] = player_attributes['player_fifa_api_id'].astype(int)
player_attributes['player_api_id'] = player_attributes['player_api_id'].astype(int)
player_attributes['date'] = pd.to_datetime(player_attributes['date'])
player_attributes.dtypes

id                              int64
player_fifa_api_id              int64
player_api_id                   int64
date                   datetime64[ns]
overall_rating                float64
potential                     float64
preferred_foot                 object
attacking_work_rate            object
defensive_work_rate            object
crossing                      float64
finishing                     float64
heading_accuracy              float64
short_passing                 float64
volleys                       float64
dribbling                     float64
curve                         float64
free_kick_accuracy            float64
long_passing                  float64
ball_control                  float64
acceleration                  float64
sprint_speed                  float64
agility                       float64
reactions                     float64
balance                       float64
shot_power                    float64
jumping                       float64
stamina     

In [24]:
player_attributes.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [25]:
print('Nulos: ', player_attributes.isnull().sum())
print('---')
print('id duplicados: ', player_attributes.id.duplicated().sum())
print('Duplicados: ', player_attributes.duplicated().sum())

Nulos:  id                        0
player_fifa_api_id        0
player_api_id             0
date                      0
overall_rating          836
potential               836
preferred_foot          836
attacking_work_rate    3230
defensive_work_rate     836
crossing                836
finishing               836
heading_accuracy        836
short_passing           836
volleys                2713
dribbling               836
curve                  2713
free_kick_accuracy      836
long_passing            836
ball_control            836
acceleration            836
sprint_speed            836
agility                2713
reactions               836
balance                2713
shot_power              836
jumping                2713
stamina                 836
strength                836
long_shots              836
aggression              836
interceptions           836
positioning             836
vision                 2713
penalties               836
marking                 836
standing_tac

Sem dados duplicados e colunas com ids e date sem valores nulos, os outros dados com **null** optei por manter por não saber o contexto ainda.

Removi os nulos para realizar análises sobre os dados.

In [26]:
player_attributes.dropna(inplace=True)
player_attributes.isnull().sum()

id                     0
player_fifa_api_id     0
player_api_id          0
date                   0
overall_rating         0
potential              0
preferred_foot         0
attacking_work_rate    0
defensive_work_rate    0
crossing               0
finishing              0
heading_accuracy       0
short_passing          0
volleys                0
dribbling              0
curve                  0
free_kick_accuracy     0
long_passing           0
ball_control           0
acceleration           0
sprint_speed           0
agility                0
reactions              0
balance                0
shot_power             0
jumping                0
stamina                0
strength               0
long_shots             0
aggression             0
interceptions          0
positioning            0
vision                 0
penalties              0
marking                0
standing_tackle        0
sliding_tackle         0
gk_diving              0
gk_handling            0
gk_kicking             0


Verificação dos dados de player.csv

In [27]:
player.head()

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154


In [28]:
player.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11060 entries, 0 to 11059
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  11060 non-null  int64  
 1   player_api_id       11060 non-null  int64  
 2   player_name         11060 non-null  object 
 3   player_fifa_api_id  11060 non-null  int64  
 4   birthday            11060 non-null  object 
 5   height              11060 non-null  float64
 6   weight              11060 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 605.0+ KB


In [29]:
print('Nulos: ', player.isnull().sum())
print('---')
print('id duplicados: ', player.id.duplicated().sum())
print('Duplicados: ', player.duplicated().sum())

Nulos:  id                    0
player_api_id         0
player_name           0
player_fifa_api_id    0
birthday              0
height                0
weight                0
dtype: int64
---
id duplicados:  0
Duplicados:  0


In [30]:
player.dtypes

id                      int64
player_api_id           int64
player_name            object
player_fifa_api_id      int64
birthday               object
height                float64
weight                  int64
dtype: object

Conversão do tipo do campo birthday de **object** para **date**

In [31]:
player['birthday'] = pd.to_datetime(player['birthday'])
player.dtypes

id                             int64
player_api_id                  int64
player_name                   object
player_fifa_api_id             int64
birthday              datetime64[ns]
height                       float64
weight                         int64
dtype: object

Sem dados nulos ou duplicados

Verificação dos dados de team_attributes.csv

Necessário a conversão dos dados para normalizar pois os dados do **.csv** não no formato adequado. Para a questão 2.

In [32]:
team_attributes.head()

Unnamed: 0.1,Unnamed: 0,Team_Attributes
0,0,"{""id"": ""1"", ""team_fifa_api_id"": ""434"", ""team_a..."
1,1,"{""id"": ""2"", ""team_fifa_api_id"": ""434"", ""team_a..."
2,2,"{""id"": ""3"", ""team_fifa_api_id"": ""434"", ""team_a..."
3,3,"{""id"": ""4"", ""team_fifa_api_id"": ""77"", ""team_ap..."
4,4,"{""id"": ""5"", ""team_fifa_api_id"": ""77"", ""team_ap..."


In [33]:
team_attributes['Team_Attributes'] = team_attributes['Team_Attributes'].apply(json.loads)
team_attributes = pd.json_normalize(team_attributes['Team_Attributes'])# type:ignore
team_attributes.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover


In [34]:
team_attributes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              1460 non-null   object 
 1   team_fifa_api_id                1460 non-null   object 
 2   team_api_id                     1460 non-null   object 
 3   date                            1460 non-null   object 
 4   buildUpPlaySpeed                1460 non-null   object 
 5   buildUpPlaySpeedClass           1460 non-null   object 
 6   buildUpPlayDribbling            489 non-null    float64
 7   buildUpPlayDribblingClass       1460 non-null   object 
 8   buildUpPlayPassing              1460 non-null   object 
 9   buildUpPlayPassingClass         1460 non-null   object 
 10  buildUpPlayPositioningClass     1460 non-null   object 
 11  chanceCreationPassing           1460 non-null   object 
 12  chanceCreationPassingClass      14

In [35]:
team_attributes.dtypes

id                                 object
team_fifa_api_id                   object
team_api_id                        object
date                               object
buildUpPlaySpeed                   object
buildUpPlaySpeedClass              object
buildUpPlayDribbling              float64
buildUpPlayDribblingClass          object
buildUpPlayPassing                 object
buildUpPlayPassingClass            object
buildUpPlayPositioningClass        object
chanceCreationPassing              object
chanceCreationPassingClass         object
chanceCreationCrossing             object
chanceCreationCrossingClass        object
chanceCreationShooting             object
chanceCreationShootingClass        object
chanceCreationPositioningClass     object
defencePressure                    object
defencePressureClass               object
defenceAggression                  object
defenceAggressionClass             object
defenceTeamWidth                   object
defenceTeamWidthClass             

Conversão dos tipos **object** para **int** e **date** dos campos com id, data e números inteiros.

In [36]:
team_attributes['id'] = team_attributes['id'].astype(int)
team_attributes['team_fifa_api_id'] = team_attributes['team_fifa_api_id'].astype(int)
team_attributes['team_api_id'] = team_attributes['team_api_id'].astype(int)
team_attributes['date'] = pd.to_datetime(team_attributes['date'])
team_attributes['buildUpPlaySpeed'] = team_attributes['buildUpPlaySpeed'].astype(int)
team_attributes['buildUpPlayPassing'] = team_attributes['buildUpPlayPassing'].astype(int)
team_attributes['chanceCreationPassing'] = team_attributes['chanceCreationPassing'].astype(int)
team_attributes['chanceCreationCrossing'] = team_attributes['chanceCreationCrossing'].astype(int)
team_attributes['chanceCreationShooting'] = team_attributes['chanceCreationShooting'].astype(int)
team_attributes['defencePressure'] = team_attributes['defencePressure'].astype(int)
team_attributes['defenceAggression'] = team_attributes['defenceAggression'].astype(int)
team_attributes['defenceTeamWidth'] = team_attributes['defenceTeamWidth'].astype(int)
team_attributes.dtypes

id                                         int64
team_fifa_api_id                           int64
team_api_id                                int64
date                              datetime64[ns]
buildUpPlaySpeed                           int64
buildUpPlaySpeedClass                     object
buildUpPlayDribbling                     float64
buildUpPlayDribblingClass                 object
buildUpPlayPassing                         int64
buildUpPlayPassingClass                   object
buildUpPlayPositioningClass               object
chanceCreationPassing                      int64
chanceCreationPassingClass                object
chanceCreationCrossing                     int64
chanceCreationCrossingClass               object
chanceCreationShooting                     int64
chanceCreationShootingClass               object
chanceCreationPositioningClass            object
defencePressure                            int64
defencePressureClass                      object
defenceAggression   

In [37]:
print('Nulos: ', team_attributes.isnull().sum())
print('---')
print('id duplicados: ', team_attributes.id.duplicated().sum())
print('Duplicados: ', team_attributes.duplicated().sum())

Nulos:  id                                  0
team_fifa_api_id                    0
team_api_id                         0
date                                0
buildUpPlaySpeed                    0
buildUpPlaySpeedClass               0
buildUpPlayDribbling              971
buildUpPlayDribblingClass           0
buildUpPlayPassing                  0
buildUpPlayPassingClass             0
buildUpPlayPositioningClass         0
chanceCreationPassing               0
chanceCreationPassingClass          0
chanceCreationCrossing              0
chanceCreationCrossingClass         0
chanceCreationShooting              0
chanceCreationShootingClass         0
chanceCreationPositioningClass      0
defencePressure                     0
defencePressureClass                0
defenceAggression                   0
defenceAggressionClass              0
defenceTeamWidth                    0
defenceTeamWidthClass               0
defenceDefenderLineClass            0
dtype: int64
---
id duplicados:  2
Duplica

Removido registros com ids duplicados.

In [38]:
duplicated_ids = team_attributes[team_attributes.duplicated(subset="id", keep=False)]
duplicated_ids

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
1458,1,434,9930,2010-02-22,60,Balanced,,Little,500000,Mixed,...,55,Normal,Organised,50,Medium,55000,Press,45,Normal,Cover
1459,2,434,9930,2010-02-22,60,Balanced,,Little,500000,Mixed,...,55,Normal,Organised,50,Medium,55000,Press,45,Normal,Cover


In [39]:
team_attributes.drop_duplicates(subset=['id'], keep='first', inplace=True)
print('id duplicados: ', team_attributes.id.duplicated().sum())
print('Duplicados: ', team_attributes.duplicated().sum())

id duplicados:  0
Duplicados:  0


In [40]:
team_attributes.dropna(inplace=True)
team_attributes.isnull().sum()

id                                0
team_fifa_api_id                  0
team_api_id                       0
date                              0
buildUpPlaySpeed                  0
buildUpPlaySpeedClass             0
buildUpPlayDribbling              0
buildUpPlayDribblingClass         0
buildUpPlayPassing                0
buildUpPlayPassingClass           0
buildUpPlayPositioningClass       0
chanceCreationPassing             0
chanceCreationPassingClass        0
chanceCreationCrossing            0
chanceCreationCrossingClass       0
chanceCreationShooting            0
chanceCreationShootingClass       0
chanceCreationPositioningClass    0
defencePressure                   0
defencePressureClass              0
defenceAggression                 0
defenceAggressionClass            0
defenceTeamWidth                  0
defenceTeamWidthClass             0
defenceDefenderLineClass          0
dtype: int64

Verificação dos dados de team.csv

In [41]:
team.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


In [42]:
team.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                299 non-null    int64  
 1   team_api_id       299 non-null    int64  
 2   team_fifa_api_id  288 non-null    float64
 3   team_long_name    299 non-null    object 
 4   team_short_name   299 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 11.8+ KB


In [43]:
print('Nulos: ', team.isnull().sum())
print('---')
print('id duplicados: ', team.id.duplicated().sum())
print('Duplicados: ', team.duplicated().sum())

Nulos:  id                   0
team_api_id          0
team_fifa_api_id    11
team_long_name       0
team_short_name      0
dtype: int64
---
id duplicados:  0
Duplicados:  0


Removido valores nulos e com **.0** dos dados de team_fifa_api_id e convertido de **float** para **int**.

In [44]:
team.dropna(inplace=True)
team['team_fifa_api_id'] = team['team_fifa_api_id'].astype(str).str.rstrip('.0').astype(int)

In [45]:
print('Nulos: ', team.isnull().sum())

Nulos:  id                  0
team_api_id         0
team_fifa_api_id    0
team_long_name      0
team_short_name     0
dtype: int64


Exemplo sugerido no desafio. Distribuição de destros e canhotos

Validação das tabelas criadas

In [46]:
engine = create_engine(f'sqlite:///Data/test_analytics_engineer.db', echo=False)

In [47]:
inspector = inspect(engine)
print(inspector.get_table_names())

['Country', 'League', 'Match', 'Match_Modified', 'Player', 'Player_Attributes', 'Player_Attributes_Modified', 'Relations', 'Team', 'Team_Attributes', 'Team_Attributes_Modified']


Função para queries

In [48]:
def sql_df(query):
  with engine.connect() as conexao:
    consulta = conexao.execute(text(query))
    dados = consulta.fetchall()
  return pd.DataFrame(dados,columns=consulta.keys())# type:ignore

In [49]:
query = 'SELECT * FROM Country'
df_country = sql_df(query)
df_country.head()

Unnamed: 0,id,name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy


In [50]:
query = 'SELECT * FROM League'
df_league = sql_df(query)
df_league.head()

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A


In [51]:
query = 'SELECT * FROM Match'
df_match = sql_df(query)
df_match.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [52]:
query = 'SELECT * FROM Player_Attributes'
df_player_attributes = sql_df(query)
df_player_attributes.head()

Unnamed: 0.1,Unnamed: 0,Player_Attributes
0,0,"{""id"": ""1"", ""player_fifa_api_id"": ""218353"", ""p..."
1,1,"{""id"": ""2"", ""player_fifa_api_id"": ""218353"", ""p..."
2,2,"{""id"": ""3"", ""player_fifa_api_id"": ""218353"", ""p..."
3,3,"{""id"": ""4"", ""player_fifa_api_id"": ""218353"", ""p..."
4,4,"{""id"": ""5"", ""player_fifa_api_id"": ""218353"", ""p..."


In [53]:
query = 'SELECT * FROM Player'
df_player = sql_df(query)
df_player.head()

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08,182.88,154


In [54]:
query = 'SELECT * FROM Team_Attributes'
df_team_attributes = sql_df(query)
df_team_attributes.head()

Unnamed: 0.1,Unnamed: 0,Team_Attributes
0,0,"{""id"": ""1"", ""team_fifa_api_id"": ""434"", ""team_a..."
1,1,"{""id"": ""2"", ""team_fifa_api_id"": ""434"", ""team_a..."
2,2,"{""id"": ""3"", ""team_fifa_api_id"": ""434"", ""team_a..."
3,3,"{""id"": ""4"", ""team_fifa_api_id"": ""77"", ""team_ap..."
4,4,"{""id"": ""5"", ""team_fifa_api_id"": ""77"", ""team_ap..."


In [55]:
query = 'SELECT * FROM Team'
df_team = sql_df(query)
df_team.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673,KRC Genk,GEN
1,2,9993,675,Beerschot AC,BAC
2,3,10000,15005,SV Zulte-Waregem,ZUL
3,4,9994,2007,Sporting Lokeren,LOK
4,5,9984,175,KSV Cercle Brugge,CEB


Dados Inseridos

## Questão 2

- Estabeleça uma relação entre as tabelas Player e Player_Attributes em uma nova tabela, chamada ‘Player_Attributes_Modified’, onde cada key do json é uma nova coluna. 
- Faça o mesmo para a relação Team e Team_Attributes, como o nome para a tabela ‘Team_Attributes_Modified’
- Crie uma tabela chamada Match_Modified cuja coluna seja representada como JSON, onde as chaves precisam ser referentes às colunas da tabela Match, sendo elas:
  - id
  - match_api_id
  - home_team_api_id
  - away_team_api_id.
---
> Queries para criar as tabelas da questão 2 **Player_Attributes_Modified**, **Team_Attributes_Modified** e **Match_Modified**
---
> Leitura dos campos do **json** e join entres as tabelas solicitadas na questão.

In [56]:
query = '''WITH CTE_Player_Attributes AS (
    SELECT
        CAST(JSON_extract(player_attributes, '$.id') AS INT) as id,
        CAST(JSON_extract(player_attributes, '$.player_fifa_api_id') AS INT) as player_fifa_api_id,
        CAST(JSON_extract(player_attributes, '$.player_api_id') AS INT) as player_api_id,
        JSON_extract(player_attributes, '$.date') as date,
        CAST(JSON_extract(player_attributes, '$.overall_rating') AS FLOAT) as overall_rating,
        CAST(JSON_extract(player_attributes, '$.potential') AS FLOAT) as potential,
        JSON_extract(player_attributes, '$.preferred_foot') as preferred_foot,
        JSON_extract(player_attributes, '$.attacking_work_rate') as attacking_work_rate,
        JSON_extract(player_attributes, '$.defensive_work_rate') as defensive_work_rate,
        CAST(JSON_extract(player_attributes, '$.crossing') AS FLOAT) as crossing,
        CAST(JSON_extract(player_attributes, '$.finishing') AS FLOAT) as finishing,
        CAST(JSON_extract(player_attributes, '$.heading_accuracy') AS FLOAT) as heading_accuracy,
        CAST(JSON_extract(player_attributes, '$.short_passing') AS FLOAT) as short_passing,
        CAST(JSON_extract(player_attributes, '$.volleys') AS FLOAT) as volleys,
        CAST(JSON_extract(player_attributes, '$.dribbling') AS FLOAT) as dribbling,
        CAST(JSON_extract(player_attributes, '$.curve') AS FLOAT) as curve,
        CAST(JSON_extract(player_attributes, '$.free_kick_accuracy') AS FLOAT) as free_kick_accuracy,
        CAST(JSON_extract(player_attributes, '$.long_passing') AS FLOAT) as long_passing,
        CAST(JSON_extract(player_attributes, '$.ball_control') AS FLOAT) as ball_control,
        CAST(JSON_extract(player_attributes, '$.acceleration') AS FLOAT) as acceleration,
        CAST(JSON_extract(player_attributes, '$.sprint_speed') AS FLOAT) as sprint_speed,
        CAST(JSON_extract(player_attributes, '$.agility') AS FLOAT) as agility,
        CAST(JSON_extract(player_attributes, '$.reactions') AS FLOAT) as reactions,
        CAST(JSON_extract(player_attributes, '$.balance') AS FLOAT) as balance,
        CAST(JSON_extract(player_attributes, '$.shot_power') AS FLOAT) as shot_power,
        CAST(JSON_extract(player_attributes, '$.jumping') AS FLOAT) as jumping,
        CAST(JSON_extract(player_attributes, '$.stamina') AS FLOAT) as stamina,
        CAST(JSON_extract(player_attributes, '$.strength') AS FLOAT) as strength,
        CAST(JSON_extract(player_attributes, '$.long_shots') AS FLOAT) as long_shots,
        CAST(JSON_extract(player_attributes, '$.aggression') AS FLOAT) as aggression,
        CAST(JSON_extract(player_attributes, '$.interceptions') AS FLOAT) as interceptions,
        CAST(JSON_extract(player_attributes, '$.positioning') AS FLOAT) as positioning,
        CAST(JSON_extract(player_attributes, '$.vision') AS FLOAT) as vision,
        CAST(JSON_extract(player_attributes, '$.penalties') AS FLOAT) as penalties,
        CAST(JSON_extract(player_attributes, '$.marking') AS FLOAT) as marking,
        CAST(JSON_extract(player_attributes, '$.standing_tackle') AS FLOAT) as standing_tackle,
        CAST(JSON_extract(player_attributes, '$.sliding_tackle') AS FLOAT) as sliding_tackle,
        CAST(JSON_extract(player_attributes, '$.gk_diving') AS FLOAT) as gk_diving,
        CAST(JSON_extract(player_attributes, '$.gk_handling') AS FLOAT) as gk_handling,
        CAST(JSON_extract(player_attributes, '$.gk_kicking') AS FLOAT) as gk_kicking,
        CAST(JSON_extract(player_attributes, '$.gk_positioning') AS FLOAT) as gk_positioning,
        CAST(JSON_extract(player_attributes, '$.gk_reflexes') AS FLOAT) as gk_reflexes
    FROM Player_Attributes
)
SELECT cte_p_a.*,
    Player.player_name,
    Player.birthday,
    Player.height,
    Player.weight
FROM CTE_Player_Attributes as cte_p_a
JOIN Player ON cte_p_a.player_fifa_api_id = Player.player_fifa_api_id
    AND cte_p_a.player_fifa_api_id = Player.player_fifa_api_id
'''
player_attributes_modified = sql_df(query)
player_attributes_modified['date'] = pd.to_datetime(player_attributes_modified['date'])
player_attributes_modified['birthday'] = pd.to_datetime(player_attributes_modified['birthday'])
player_attributes_modified.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,player_name,birthday,height,weight
0,1,218353,505942,2016-02-18,67.0,71.0,right,medium,medium,49.0,...,69.0,6.0,11.0,10.0,8.0,8.0,Aaron Appindangoye,1992-02-29,182.88,187
1,2,218353,505942,2015-11-19,67.0,71.0,right,medium,medium,49.0,...,69.0,6.0,11.0,10.0,8.0,8.0,Aaron Appindangoye,1992-02-29,182.88,187
2,3,218353,505942,2015-09-21,62.0,66.0,right,medium,medium,49.0,...,69.0,6.0,11.0,10.0,8.0,8.0,Aaron Appindangoye,1992-02-29,182.88,187
3,4,218353,505942,2015-03-20,61.0,65.0,right,medium,medium,48.0,...,66.0,5.0,10.0,9.0,7.0,7.0,Aaron Appindangoye,1992-02-29,182.88,187
4,5,218353,505942,2007-02-22,61.0,65.0,right,medium,medium,48.0,...,66.0,5.0,10.0,9.0,7.0,7.0,Aaron Appindangoye,1992-02-29,182.88,187


In [57]:
query = '''WITH CTE_Team_Attributes AS (
    SELECT
        CAST(JSON_extract(Team_Attributes, '$.id') AS INT) as id,
        CAST(JSON_extract(Team_Attributes, '$.team_fifa_api_id') AS INT) as team_fifa_api_id,
        CAST(JSON_extract(Team_Attributes, '$.team_api_id') AS INT) as team_api_id,
        JSON_extract(Team_Attributes, '$.date') as date,
        CAST(JSON_extract(Team_Attributes, '$.buildUpPlaySpeed') AS INT) as buildUpPlaySpeed,
        JSON_extract(Team_Attributes, '$.buildUpPlaySpeedClass') as buildUpPlaySpeedClass,
        CAST(JSON_extract(Team_Attributes, '$.buildUpPlayDribbling') AS FLOAT) as buildUpPlayDribbling,
        JSON_extract(Team_Attributes, '$.buildUpPlayDribblingClass') as buildUpPlayDribblingClass,
        CAST(JSON_extract(Team_Attributes, '$.buildUpPlayPassing') AS INT) as buildUpPlayPassing,
        JSON_extract(Team_Attributes, '$.buildUpPlayPassingClass') as buildUpPlayPassingClass,
        JSON_extract(Team_Attributes, '$.buildUpPlayPositioningClass') as buildUpPlayPositioningClass,
        CAST(JSON_extract(Team_Attributes, '$.chanceCreationPassing') AS INT) as chanceCreationPassing,
        JSON_extract(Team_Attributes, '$.chanceCreationPassingClass') as chanceCreationPassingClass,
        CAST(JSON_extract(Team_Attributes, '$.chanceCreationCrossing') AS INT) as chanceCreationCrossing,
        JSON_extract(Team_Attributes, '$.chanceCreationCrossingClass') as chanceCreationCrossingClass,
        CAST(JSON_extract(Team_Attributes, '$.chanceCreationShooting') AS INT) as chanceCreationShooting,
        JSON_extract(Team_Attributes, '$.chanceCreationShootingClass') as chanceCreationShootingClass,
        JSON_extract(Team_Attributes, '$.chanceCreationPositioningClass') as chanceCreationPositioningClass,
        CAST(JSON_extract(Team_Attributes, '$.defencePressure') AS INT) as defencePressure,
        JSON_extract(Team_Attributes, '$.defencePressureClass') as defencePressureClass,
        CAST(JSON_extract(Team_Attributes, '$.defenceAggression') AS INT) as defenceAggression,
        JSON_extract(Team_Attributes, '$.defenceAggressionClass') as defenceAggressionClass,
        CAST(JSON_extract(Team_Attributes, '$.defenceTeamWidth') AS INT) as defenceTeamWidth,
        JSON_extract(Team_Attributes, '$.defenceTeamWidthClass') as defenceTeamWidthClass,
        JSON_extract(Team_Attributes, '$.defenceDefenderLineClass') as defenceDefenderLineClass
    FROM Team_Attributes
)
SELECT cte_t_a.*,
    Team.team_long_name,
    Team.team_short_name
FROM CTE_Team_Attributes as cte_t_a
JOIN Team ON cte_t_a.team_api_id = Team.team_api_id
    AND cte_t_a.team_fifa_api_id = Team.team_fifa_api_id
'''
team_attributes_modified = sql_df(query)
team_attributes_modified['date'] = pd.to_datetime(team_attributes_modified['date'])
team_attributes_modified.drop_duplicates(subset=['id'], keep='first', inplace=True)
team_attributes_modified.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass,team_long_name,team_short_name
0,1,434,9930,2010-02-22,60,Balanced,,Little,50,Mixed,...,Organised,50,Medium,55,Press,45,Normal,Cover,FC Aarau,AAR
1,2,434,9930,2014-09-19,52,Balanced,48.0,Normal,56,Mixed,...,Organised,47,Medium,44,Press,54,Normal,Cover,FC Aarau,AAR
2,3,434,9930,2015-09-10,47,Balanced,41.0,Normal,54,Mixed,...,Organised,47,Medium,44,Press,54,Normal,Cover,FC Aarau,AAR
3,4,77,8485,2010-02-22,70,Fast,,Little,70,Long,...,Organised,60,Medium,70,Double,70,Wide,Cover,Aberdeen,ABE
4,5,77,8485,2011-02-22,47,Balanced,,Little,52,Mixed,...,Organised,47,Medium,47,Press,52,Normal,Cover,Aberdeen,ABE


Consulta dos dados e conversão em **json**.

In [58]:
query = '''
    SELECT 
        CAST(id AS VARCHAR) AS id,
        CAST(match_api_id AS VARCHAR) AS match_api_id,
        CAST(home_team_api_id AS VARCHAR) AS home_team_api_id,
        CAST(away_team_api_id AS VARCHAR) AS away_team_api_id
    FROM Match
'''
df_match_modified = sql_df(query)
df_match_modified.dtypes

id                  object
match_api_id        object
home_team_api_id    object
away_team_api_id    object
dtype: object

In [59]:
df_match_modified['Match_Modified'] = df_match_modified.apply(lambda row: {
    'id': row['id'],
    'match_api_id': row['match_api_id'],
    'home_team_api_id': row['home_team_api_id'],
    'away_team_api_id': row['away_team_api_id']
}, axis=1)

match_modified = pd.DataFrame(df_match_modified['Match_Modified'])
match_modified['id'] = match_modified.index.astype(str)
match_modified = match_modified[['id', 'Match_Modified']]
match_modified.head()

Unnamed: 0,id,Match_Modified
0,0,"{'id': '1', 'match_api_id': '492473', 'home_te..."
1,1,"{'id': '2', 'match_api_id': '492474', 'home_te..."
2,2,"{'id': '3', 'match_api_id': '492475', 'home_te..."
3,3,"{'id': '4', 'match_api_id': '492476', 'home_te..."
4,4,"{'id': '5', 'match_api_id': '492477', 'home_te..."


Validação das tabelas modificadas.

In [60]:
query = 'SELECT * FROM Player_Attributes_Modified'
df_team = sql_df(query)
df_team.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,player_name,birthday,height,weight
0,1,218353,505942,2016-02-18,67.0,71.0,right,medium,medium,49.0,...,69.0,6.0,11.0,10.0,8.0,8.0,Aaron Appindangoye,1992-02-29,182.88,187
1,2,218353,505942,2015-11-19,67.0,71.0,right,medium,medium,49.0,...,69.0,6.0,11.0,10.0,8.0,8.0,Aaron Appindangoye,1992-02-29,182.88,187
2,3,218353,505942,2015-09-21,62.0,66.0,right,medium,medium,49.0,...,69.0,6.0,11.0,10.0,8.0,8.0,Aaron Appindangoye,1992-02-29,182.88,187
3,4,218353,505942,2015-03-20,61.0,65.0,right,medium,medium,48.0,...,66.0,5.0,10.0,9.0,7.0,7.0,Aaron Appindangoye,1992-02-29,182.88,187
4,5,218353,505942,2007-02-22,61.0,65.0,right,medium,medium,48.0,...,66.0,5.0,10.0,9.0,7.0,7.0,Aaron Appindangoye,1992-02-29,182.88,187


In [61]:
query = 'SELECT * FROM Team_Attributes_Modified'
df_team = sql_df(query)
df_team.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass,team_long_name,team_short_name
0,1,434,9930,2010-02-22 00:00:00.000000,60,Balanced,,Little,50,Mixed,...,Organised,50,Medium,55,Press,45,Normal,Cover,FC Aarau,AAR
1,2,434,9930,2014-09-19 00:00:00.000000,52,Balanced,48.0,Normal,56,Mixed,...,Organised,47,Medium,44,Press,54,Normal,Cover,FC Aarau,AAR
2,3,434,9930,2015-09-10 00:00:00.000000,47,Balanced,41.0,Normal,54,Mixed,...,Organised,47,Medium,44,Press,54,Normal,Cover,FC Aarau,AAR
3,4,77,8485,2010-02-22 00:00:00.000000,70,Fast,,Little,70,Long,...,Organised,60,Medium,70,Double,70,Wide,Cover,Aberdeen,ABE
4,5,77,8485,2011-02-22 00:00:00.000000,47,Balanced,,Little,52,Mixed,...,Organised,47,Medium,47,Press,52,Normal,Cover,Aberdeen,ABE


In [62]:
query = 'SELECT * FROM Match_Modified'
df_team = sql_df(query)
df_team.head()

Unnamed: 0,id,Match_Modified
0,0,"{""id"": ""1"", ""match_api_id"": ""492473"", ""home_te..."
1,1,"{""id"": ""2"", ""match_api_id"": ""492474"", ""home_te..."
2,2,"{""id"": ""3"", ""match_api_id"": ""492475"", ""home_te..."
3,3,"{""id"": ""4"", ""match_api_id"": ""492476"", ""home_te..."
4,4,"{""id"": ""5"", ""match_api_id"": ""492477"", ""home_te..."


## Questao 4
- Encontre uma relação de dados entre as tabelas League, Country, Team_Attributes e Player, crie uma nova tabela chamada 'Relations'
    - Exemplo: Esquematize uma relação entre a altura e peso dos jogadores de forma isolada, com seu rendimento.
    - Dica: Criar 3 intervalos entre os 6 valores com maiores contagens para estes dois parâmetros.
---
- Não encontrei um vinculo com a tabela **Player** e as demais.

Tomei a liberdade de pesquisar na internet de qual país e liga é cada time e vou adicionar para ligar as tabelas **Team_Attributes_Modified**, **Country** e **League**.

In [63]:
query = 'SELECT * FROM Team_Attributes_Modified'
df_player_attributes = sql_df(query)
team_to_country_map = {
    'AAR': 1,
    'ABE': 1729,
    'AJA': 13274,
    'ACM': 10257,
    'ACA': 4769,
    'ALM': 15722,
    'AND': 17642,
    'ARK': 21518,
    'ARL': 19694,
    'BIE': 24558,
    'ARO': 15722,
    'ARS': 1729,
    'AVL': 1729,
    'ATA': 10257,
    'BIL': 1729,
    'AUG': 7809,
    'AUX': 4769,
    'ALK': 7809,
    'BAR': 21518,
    'BAS': 7809,
    'LEV': 7809,
    'BMU': 15722,
    'BAC': 24558,
    'B-M': 15722,
    'BEL': 1729,
    'BEN': 21518,
    'BIR': 1729,
    'BLB': 1729,
    'BLA': 1729,
    'BOA': 21518,
    'BOL': 15722,
    'BOR': 1729,
    'DOR': 7809,
    'GLA': 1729,
    'BOU': 1729,
    'BRA': 1729,
    'BRE': 21518,
    'BUR': 1729,
    'CAG': 10257,
    'CAM': 4769,
    'CAR': 1729,
    'CAP': 21518,
    'CAT': 1729,
    'CEL': 15722,
    'CES': 7809,
    'CHE': 1729,
    'CHI': 7809,
    'CLB': 1729,
    'COR': 7809,
    'CKR': 15722,
    'CRY': 1729,
    'DAR': 1729,
    'GRA': 10257,
    'DIJ': 4769,
    'DUU': 15722,
    'DUN': 19694,
    'EIB': 21518,
    'EFR': 1729,
    'ELC': 1729,
    'EMP': 10257,
    'COT': 1729,
    'ESP': 21518,
    'EUP': 1729,
    'EVE': 1729,
    'ETG': 4769,
    'EXC': 1729,
    'FAL': 21518,
    'FCK': 1729,
    'GRO': 4769,
    'POR': 21518,
    'UTR': 24558,
    'VAD': 21518,
    'ZUR': 24558,
    'FEY': 13274,
    'FIO': 10257,
    'FDU': 1729,
    'FRE': 4769,
    'FRO': 1729,
    'FUL': 1729,
    'GEN': 10257,
    'GAJ': 7809,
    'GV': 10257,
    'GAE': 21518,
    'LEC': 1729,
    'GOR': 1729,
    'GRE': 1729,
    'GRF': 21518,
    'GUI': 7809,
    'HAM': 1729,
    'HAN': 7809,
    'HER': 7809,
    'HBE': 21518,
    'HIB': 19694,
    'HOF': 7809,
    'HUL': 1729,
    'ING': 1729,
    'INT': 10257,
    'BIA': 10257,
    'JUV': 10257,
    'KAI': 7809,
    'KAR': 1729,
    'KIL': 1729,
    'KKI': 7809,
    'KOR': 7809,
    'MEC': 21518,
    'LAS': 21518,
    'LAU': 21518,
    'LAZ': 10257,
    'LEH': 15722,
    'LEM': 21518,
    'POZ': 15722,
    'LGD': 4769,
    'LEG': 4769,
    'LEI': 1729,
    'LEN': 1729,
    'LIE': 7809,
    'LIL': 4769,
    'LIV': 1729,
    'LOD': 21518,
    'LOK': 7809,
    'LOR': 21518,
    'LUZ': 21518,
    'LYO': 4769,
    'MAI': 4769,
    'MAL': 21518,
    'MUN': 7809,
    'MAR': 21518,
    'MET': 21518,
    'MID': 1729,
    'MON': 4769,
    'MOT': 15722,
    'NAC': 13274,
    'NAN': 1729,
    'NAV': 21518,
    'NEW': 1729,
    'NIC': 15722,
    'NOR': 1729,
    'NOV': 21518,
    'NUR': 1729,
    'NUM': 21518,
    'ODR': 21518,
    'OOS': 21518,
    'OSA': 21518,
    'O-H': 21518,
    'FER': 21518,
    'PAL': 10257,
    'PSG': 4769,
    'PAR': 4769,
    'ZWO': 21518,
    'PEN': 21518,
    'PIG': 21518,
    'POD': 21518,
    'POG': 21518,
    'POB': 21518,
    'PSV': 21518,
    'QPR': 1729,
    'SAN': 10257,
    'RAN': 21518,
    'REA': 1729,
    'BET': 21518,
    'SOC': 21518,
    'HUE': 21518,
    'REG': 21518,
    'REI': 21518,
    'REN': 4769,
    'RA': 21518,
    'RKC': 21518,
    'SIE': 10257,
    'ROD': 1729,
    'ROS': 21518,
    'ROM': 10257,
    'CHO': 4769,
    'ETI': 15722,
    'SAM': 10257,
    'SAS': 10257,
    'HEE': 21518,
    'S04': 7809,
    'SER': 1729,
    'SEV': 21518,
    'SLA': 21518,
    'SOU': 1729,
    'SPA': 21518,
    'SCP': 21518,
    'SPG': 21518,
    'NAP': 10257,
    'GAL': 10257,
    'MIR': 21518,
    'STP': 21518,
    'JOH': 21518,
    'STL': 21518,
    'STK': 1729,
    'SUN': 1729,
    'THU': 21518,
    'TOR': 21518,
    'TOT': 1729,
    'TOU': 4769,
    'TRO': 21518,
    'TWE': 24558,
    'UDI': 10257,
    'ULE': 21518,
    'VAL': 1729,
    'VER': 21518,
    'STU': 15722,
    'VIL': 21518,
    'VIT': 21518,
    'SET': 24558,
    'VEN': 21518,
    'WAA': 21518,
    'WAT': 1729,
    'WBR': 1729,
    'WBA': 1729,
    'WHU': 1729,
    'WES': 1729,
    'WID': 21518,
    'WIG': 1729,
    'WII': 1729,
    'WIS': 1729,
    'WOL': 10257,
    'XAM': 1729,
    'XER': 21518,
    'ZAG': 21518,
    'ZAR': 21518,
    'ZAW': 21518,
    'ZUL': 21518
}
df_player_attributes["country_id"] = df_player_attributes["team_short_name"].map(team_to_country_map)# type:ignore
query = '''
    SELECT 
        League.id, 
        League.country_id,
        League.name as league_name,
        Country.name as country_name
    FROM Country
    JOIN League ON League.country_id = Country.id;
'''
df_league_country = sql_df(query)

# Merge dos Dataframes pela coluna "country_id" mantendo apenas a coluna com o nome do país
df_relations = pd.merge(df_player_attributes, df_league_country[['country_id', 'league_name', 'country_name']], left_on='country_id', right_on='country_id', how='inner')
df_relations.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass,team_long_name,team_short_name,country_id,league_name,country_name
0,1,434,9930,2010-02-22 00:00:00.000000,60,Balanced,,Little,50,Mixed,...,55,Press,45,Normal,Cover,FC Aarau,AAR,1,Belgium Jupiler League,Belgium
1,2,434,9930,2014-09-19 00:00:00.000000,52,Balanced,48.0,Normal,56,Mixed,...,44,Press,54,Normal,Cover,FC Aarau,AAR,1,Belgium Jupiler League,Belgium
2,3,434,9930,2015-09-10 00:00:00.000000,47,Balanced,41.0,Normal,54,Mixed,...,44,Press,54,Normal,Cover,FC Aarau,AAR,1,Belgium Jupiler League,Belgium
3,4,77,8485,2010-02-22 00:00:00.000000,70,Fast,,Little,70,Long,...,70,Double,70,Wide,Cover,Aberdeen,ABE,1729,England Premier League,England
4,5,77,8485,2011-02-22 00:00:00.000000,47,Balanced,,Little,52,Mixed,...,47,Press,52,Normal,Cover,Aberdeen,ABE,1729,England Premier League,England


Algumas análises

Evolução dos melhores jogadores

In [64]:
query = '''
    WITH cte_best AS (
        SELECT player_name
        FROM Player_Attributes_Modified
        GROUP BY 1
        ORDER BY AVG(overall_rating) DESC
        LIMIT 5
    )
    SELECT Player_Attributes_Modified.player_name, date, AVG(Player_Attributes_Modified.overall_rating) AS overall_rating
    FROM Player_Attributes_Modified
    INNER JOIN cte_best ON cte_best.player_name = Player_Attributes_Modified.player_name
    GROUP BY Player_Attributes_Modified.player_name, date
    ORDER BY overall_rating DESC
'''
player_h = sql_df(query)
player_h

Unnamed: 0,player_name,date,overall_rating
0,Lionel Messi,2011-08-30,94.0
1,Lionel Messi,2012-02-22,94.0
2,Lionel Messi,2012-08-31,94.0
3,Lionel Messi,2013-02-15,94.0
4,Lionel Messi,2013-03-08,94.0
...,...,...,...
116,Franck Ribery,2007-02-22,85.0
117,Lionel Messi,2007-02-22,85.0
118,Zlatan Ibrahimovic,2010-08-30,85.0
119,Andres Iniesta,2007-08-30,84.0


In [65]:
player_h['date'] = pd.to_datetime(player_h['date'])
player_h = player_h.sort_values(by=['player_name', 'date'])
traces = []
for player_name, data in player_h.groupby('player_name'):
    trace = go.Scatter(
        x=data['date'],
        y=data['overall_rating'],
        mode='lines',
        name=player_name
    )
    traces.append(trace)

layout = go.Layout(
    title='Evolução da média do desempenho do 5 melhores jogadores',
    xaxis=dict(title='Data de avaliação'),
    yaxis=dict(title='Desempenho'),
)

fig = go.Figure(data=traces, layout=layout)
fig.show()

Saldo de gols por país

In [66]:
query = '''
        SELECT
            name,
            SUM(home_team_goal + away_team_goal) AS total_goal
        FROM Match
        JOIN Country ON Country.id = Match.country_id
        GROUP BY 1
        ORDER BY total_goal DESC
    '''
gols = sql_df(query)
gols.head()

Unnamed: 0,name,total_goal
0,Spain,8412
1,England,8240
2,Italy,7895
3,Netherlands,7542
4,France,7427


In [67]:
fig7 =  px.bar(gols, x='name', y='total_goal',
             title='Países com maior quantidade de gols marcados.',
             labels={'total_goal': 'Quantidade de Gols', 'name': 'País'},
             color_discrete_sequence=['#2937fe'])
fig7.show()

Melhores jogadores por pé

In [68]:
query = '''
    SELECT player_name, AVG(overall_rating) AS overall_rating, preferred_foot
    FROM Player_Attributes_Modified
    GROUP BY 1, 3
    ORDER BY AVG(overall_rating) DESC
'''
player_r_l = sql_df(query)
right_footed = player_r_l[player_r_l['preferred_foot'] == 'right']
left_footed = player_r_l[player_r_l['preferred_foot'] == 'left']
left_footed


Unnamed: 0,player_name,overall_rating,preferred_foot
0,Lionel Messi,92.192308,left
5,Arjen Robben,87.840000,left
8,Iker Casillas,86.954545,left
10,David Silva,86.538462,left
12,Robin van Persie,86.473684,left
...,...,...,...
11996,David Caiado,46.000000,left
11998,Joao Real,46.000000,left
12002,Piotr Wisniewski,45.333333,left
12004,Alain Wiss,45.000000,left


In [69]:
total = len(player_attributes)
total_r = player_attributes[player_attributes['preferred_foot'] == 'right'].preferred_foot.count()
total_l = player_attributes[player_attributes['preferred_foot'] == 'left'].preferred_foot.count()

percentual_l = total_l/ total
percentual_r = total_r/ total

print("Total de dados: ", total)
print("Total de destros: ", total_r)
print("Total de canhotos: ", total_l)
print("Percentual de destros: ", (round(percentual_r, 2)*100), "%")
print("Percentual de canhotos: ", (round(percentual_l, 2)*100), "%")

Total de dados:  180354
Total de destros:  136247
Total de canhotos:  44107
Percentual de destros:  76.0 %
Percentual de canhotos:  24.0 %


In [70]:
labels = ['Destro', 'Canhoto']
values = player_attributes['preferred_foot'].value_counts()

pie_trace = go.Pie(labels=labels, values=values, hoverinfo='label+percent', textinfo='percent+label', 
                   marker=dict(colors=['#2937fe', '#f8d124']), hole=0.3)

layout = go.Layout(title='Distribuição entre destros e canhotos',
                   annotations=[{'text': f"Percentual de destros: {(round(percentual_r, 2) * 100)}%",
                                 'showarrow': False, 'x': .5, 'y': 1.1
                    }], showlegend=False)

fig4 = go.Figure(data=[pie_trace], layout=layout)
fig4.show()

In [71]:
player_r_l = sql_df(query)
right_footed = player_r_l[player_r_l['preferred_foot'] == 'right'].head(5).sort_values(by='overall_rating', ascending=True)
left_footed = player_r_l[player_r_l['preferred_foot'] == 'left'].head(5).sort_values(by='overall_rating', ascending=True)

fig5 = make_subplots(rows=1, cols=2, subplot_titles=("Destros", "Canhotos"))

fig5.add_trace(go.Bar(
    y=right_footed['player_name'],
    x=right_footed['overall_rating'],
    name='Média de desempenho de destros',
    marker_color='#2937fe',
    orientation='h',
    showlegend=False
), row=1, col=1)

fig5.add_trace(go.Bar(
    y=left_footed['player_name'],
    x=left_footed['overall_rating'],
    name='Média de desempenho de canhotos',
    marker_color='#f8d124',
    orientation='h',
    showlegend=False
), row=1, col=2)

for i in range(1, 2):
    fig5.update_xaxes(title_text='Pé preferencial', row=1, col=i)
    fig5.update_yaxes(title_text='Média de desempenho', row=1, col=i)

fig5.update_layout(title='Desempenho dos melhores jogadores destros e canhotos')
fig5.show()

- Para a construção da query utilizei a tabela criada anteriormente **Team_Attributes_modified** onde ja fiz a leitura do **json** e join con a tabe **team**.
---
- buildUpPlaySpeed : Velocidade de construção do jogo do time.
- buildUpPlayPassing : Passe de construção do time.
- chanceCreationPassing : Passe na criação de oportunidades do time.
- chanceCreationCrossing : Cruzamento na criação de oportunidades do time.
- chanceCreationShooting : Chute na criação de oportunidades do time.
- defencePressure : Pressão defensiva do time.
- defenceAggression : Agressividade defensiva do time.

In [72]:
query = '''
    SELECT
        team_long_name,
        SUM(
            buildUpPlaySpeed +
            buildUpPlayPassing +
            chanceCreationPassing +
            chanceCreationCrossing +
            chanceCreationShooting +
            defencePressure +
            defenceAggression
        ) AS total_organization_score
    FROM Relations
    GROUP BY 1
    ORDER BY total_organization_score DESC;
'''
df_best_teams = sql_df(query)
df_best_teams

Unnamed: 0,team_long_name,total_organization_score
0,Widzew Łódź,3032
1,Borussia Dortmund,2540
2,Standard de Liège,2423
3,Celtic,2422
4,TSG 1899 Hoffenheim,2396
...,...,...
247,FC Vaduz,321
248,Boavista FC,319
249,FC Penafiel,314
250,Leixões SC,285


In [73]:
fig6 = make_subplots(rows=1, cols=2, subplot_titles=("5 Melhores Jogadores", "5 Melhores Times"))
player_best = player_r_l.head(5).sort_values(by='overall_rating', ascending=False)
fig6.add_trace(go.Bar(
    x=player_best['player_name'],
    y=player_best['overall_rating'],
    name='Média de desempenho dos melhores jogadores',
    marker_color='#2937fe',
    showlegend=False
), row=1, col=1)

df_best_teams = df_best_teams.head(5).sort_values(by='total_organization_score', ascending=False)
fig6.add_trace(go.Bar(
    x=df_best_teams['team_long_name'],
    y=df_best_teams['total_organization_score'],
    name='Total de desempenho dos melhores times',
    marker_color='#f8d124',
    showlegend=False
), row=1, col=2)

fig6.update_layout(title='5 Melhores jogadores')
fig6.show()

In [74]:
query = '''
    SELECT
        team_long_name,
        team_short_name,
        country_name,
        league_name,
        SUM(
            buildUpPlaySpeed +
            buildUpPlayPassing +
            chanceCreationPassing +
            chanceCreationCrossing +
            chanceCreationShooting +
            defencePressure +
            defenceAggression
        ) AS total_organization_score
    FROM Relations
    GROUP BY 1, 2, 3, 4
    ORDER BY total_organization_score DESC;
'''
df_best_teams2 = sql_df(query)
df_best_teams2

Unnamed: 0,team_long_name,team_short_name,country_name,league_name,total_organization_score
0,Borussia Dortmund,DOR,Germany,Germany 1. Bundesliga,2540
1,Standard de Liège,STL,Spain,Spain LIGA BBVA,2423
2,Celtic,CEL,Poland,Poland Ekstraklasa,2422
3,TSG 1899 Hoffenheim,HOF,Germany,Germany 1. Bundesliga,2396
4,Burnley,BUR,England,England Premier League,2390
...,...,...,...,...,...
249,FC Vaduz,VAD,Spain,Spain LIGA BBVA,321
250,Boavista FC,BOA,Spain,Spain LIGA BBVA,319
251,FC Penafiel,PEN,Spain,Spain LIGA BBVA,314
252,Leixões SC,LEI,England,England Premier League,285


In [75]:
df_teams = df_best_teams2.sort_values(by='total_organization_score', ascending=True).head(5)
df_country_scores = df_best_teams2.groupby('country_name')['total_organization_score'].sum().reset_index()
df_country_scores = df_country_scores.sort_values(by='total_organization_score', ascending=True).head(5)

fig2 = go.Figure()
fig2.add_trace(go.Bar(
    y=df_teams['team_long_name'],
    x=df_teams['total_organization_score'],
    name='Top 5 Times',
    orientation='h',
    text=df_teams['total_organization_score'].astype(str) + ', ' + df_teams['country_name'],
    marker=dict(color='#2937fe')
))

fig2.update_layout(
    title='O desempenho dos 5 Melhores Times',
    xaxis_title='Total de desempenho',
    yaxis_title='Time',
    showlegend=False
)

fig3 = go.Figure()
fig3.add_trace(go.Bar(
    y=df_country_scores['country_name'],
    x=df_country_scores['total_organization_score'],
    name='Top 5 países',
    orientation='h',
    text=df_country_scores['total_organization_score'],
    marker=dict(color='#f8d124')
))

fig3.update_layout(
    title='O desempenhos dos países com o melhores times',
    xaxis_title='Total de desempenho',
    yaxis_title='País',
    showlegend=False
)
fig2.show()
fig3.show()

- Notamos que por mais que os 5 melhores times são da Espanha ou Inglaterra, esses países não aparecem no 5 país com score mais alto de todos os times
- Sugere muitas investigações futuras, pelo tempo não conseguirei me aprofundar.

Para construção da query utilizei a tabela criada **Player_Attributes_Modified** que contém os dados de peso e altura.

In [76]:
query = '''
    WITH height AS (
        SELECT
            height,
            (ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) + 1) / 2 as interval,
            ROUND(AVG(weight), 2) AS avg_weight,
            COUNT(*) AS count_height,
            AVG(overall_rating) AS overall_rating_by_height,
            ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) as row_number_height
        FROM Player_Attributes_Modified
        GROUP BY height
        ORDER BY count_height DESC
        LIMIT 6
    ),
    weight AS (
        SELECT
            weight,
            (ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) + 1) / 2 as interval,
            ROUND(AVG(height), 2) AS avg_height,
            COUNT(*) AS count_weight,
            AVG(overall_rating) AS overall_rating_by_weight,
            ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) as row_number_weight
        FROM Player_Attributes_Modified
        GROUP BY weight
        ORDER BY count_weight DESC
        LIMIT 6
    )
    SELECT
        height.interval,
        height.height,
        height.avg_weight,
        height.count_height,
        height.overall_rating_by_height,
        weight.weight,
        weight.avg_height,
        weight.count_weight,
        weight.overall_rating_by_weight
    FROM height
    JOIN weight ON height.row_number_height = weight.row_number_weight;
'''
df_height_weight = sql_df(query)
df_height_weight

Unnamed: 0,interval,height,avg_weight,count_height,overall_rating_by_height,weight,avg_height,count_weight,overall_rating_by_weight
0,1,182.88,170.44,32170,68.413385,165,180.99,12826,68.13471
1,1,177.8,161.04,24283,68.200265,176,184.91,11409,68.964248
2,2,180.34,165.78,22901,68.267336,172,183.32,10774,68.718893
3,2,187.96,179.62,21826,68.749517,154,177.64,10574,67.716818
4,3,185.42,174.92,21343,68.540084,168,181.97,10216,68.30059
5,3,175.26,156.36,19590,68.769101,159,178.17,10031,68.511067


In [77]:
fig = make_subplots(rows=1, cols=3, subplot_titles=("Intervalo 1", "Intervalo 2", "Interval0 3"))

# Filtrar os dados para cada intervalo
interval1 = df_height_weight[df_height_weight['interval'] == 1]
interval2 = df_height_weight[df_height_weight['interval'] == 2]
interval3 = df_height_weight[df_height_weight['interval'] == 3]

fig.add_trace(go.Bar(
    x=interval1['height'],
    y=interval1['overall_rating_by_height'],
    name='Overall Rating por Height',
    marker_color='#2937fe',
    showlegend=False
), row=1, col=1)

fig.add_trace(go.Bar(
    x=interval1['weight'],
    y=interval1['overall_rating_by_weight'],
    name='Overall Rating por Weight',
    marker_color='#f8d124',
    showlegend=False
), row=1, col=1)

fig.add_trace(go.Bar(
    x=interval2['height'],
    y=interval2['overall_rating_by_height'],
    name='Overall Rating por Height',
    marker_color='#2937fe',
    showlegend=False
), row=1, col=2)

fig.add_trace(go.Bar(
    x=interval2['weight'],
    y=interval2['overall_rating_by_weight'],
    name='Overall Rating por Weight',
    marker_color='#f8d124',
    showlegend=False
), row=1, col=2)

fig.add_trace(go.Bar(
    x=interval3['height'],
    y=interval3['overall_rating_by_height'],
    name='Overall Rating por Height',
    marker_color='#2937fe',
    showlegend=False
), row=1, col=3)

fig.add_trace(go.Bar(
    x=interval3['weight'],
    y=interval3['overall_rating_by_weight'],
    name='Overall Rating por Weight',
    marker_color='#f8d124',
    showlegend=False
), row=1, col=3)

for i in range(1, 4):
    fig.update_xaxes(title_text='Peso/Altura', row=1, col=i)
    fig.update_yaxes(title_text='Média de desempenho', row=1, col=i)

fig.update_layout(title='Desempenho por Altura e Peso')

# Criar subplots para os 3 gráficos de barras agrupadas para count_weight
fig1 = make_subplots(rows=1, cols=3, subplot_titles=("Grupo 1", "Grupo 2", "Grupo 3"))

fig1.add_trace(go.Bar(
    x=interval1['height'],
    y=interval1['count_height'],
    name='Quantidade por Height',
    marker_color='#2937fe'
), row=1, col=1)

fig1.add_trace(go.Bar(
    x=interval1['weight'],
    y=interval1['count_weight'],
    name='Quantidade por Weight',
    marker_color='#f8d124'
), row=1, col=1)

fig1.add_trace(go.Bar(
    x=interval2['height'],
    y=interval2['count_height'],
    name='Quantidade por Height',
    marker_color='#2937fe',
    showlegend=False
), row=1, col=2)

fig1.add_trace(go.Bar(
    x=interval2['weight'],
    y=interval2['count_weight'],
    name='Quantidade por Weight',
    marker_color='#f8d124',
    showlegend=False
), row=1, col=2)

fig1.add_trace(go.Bar(
    x=interval3['height'],
    y=interval3['count_height'],
    name='Quantidade por Height',
    marker_color='#2937fe',
    showlegend=False
), row=1, col=3)

fig1.add_trace(go.Bar(
    x=interval3['weight'],
    y=interval3['count_weight'],
    name='Quantidade por Weight',
    marker_color='#f8d124',
    showlegend=False
), row=1, col=3)

for i in range(1, 4):
    fig1.update_xaxes(title_text='Peso/Altura', row=1, col=i)
    fig1.update_yaxes(title_text='Quantidade', row=1, col=i)

fig1.update_layout(title='Quantidade de Altura e Peso')
fig1.show()
fig.show()

- Notamos que a média de desempenho por peso e altura é bem parecida nos 6 pesos e no grupo 1 uma altura destoa das demais mais encontrados entre os jogadores. A quantidade encontrada em 128.88

Questão 5 - Utiize SQL-CTE

Semanalmente o gerente da Fifa solicita a média de gols dos times mandantes, porém como você gosta de entregar mais do que lhe pedem, você resolveu montar uma CTE para entregar outras métricas para seu gerente. Como você faria, em SQL, para salvar ou automatizar essa query? Envie-nos seu código.

- Setar uma data para testes da query mas a automatization sera com now() ou getdate() de dependendo do banco **SELECT date('now') AS today, date('now', '-7 days') AS seven_days_ago**
---
- Query com média geral de gols, média de gols por liga e média de gols por teme nos últimos 7 dias, para o time mandante e visitante, os respectivos times no período. 
---
- Para agendar eu usaria o Azure Data Factory, criaria um pipeline conectando as bases ou api's externas, neste exemplo específico seria conectado a apenas uma única base, e adicionaria uma trigger semanalmente e retry caso algum problema.

In [78]:
query = '''
    WITH dates AS (
        SELECT date('2008-08-17') AS today, date('2008-08-17', '-7 days') AS seven_days_ago
    ),
    cte_match_avg AS (
        SELECT
            MAX(season) AS season,
            ROUND(AVG(home_team_goal), 2) AS home_team_goal_avg,
            ROUND(AVG(away_team_goal), 2) AS away_team_goal_avg
        FROM Match
        WHERE date BETWEEN (SELECT seven_days_ago FROM dates) AND (SELECT today FROM dates)
            AND (home_team_api_id IS NOT NULL OR away_team_api_id IS NOT NULL)
    ),
    cte_match_by_country_avg AS (
        SELECT
            MAX(season) AS season,
            ROUND(AVG(home_team_goal), 2) AS home_team_goal_avg_by_league,
            ROUND(AVG(away_team_goal), 2) AS away_team_goal_avg_by_league,
            league_id
        FROM Match
        WHERE date BETWEEN (SELECT seven_days_ago FROM dates) AND (SELECT today FROM dates)
            AND (home_team_api_id IS NOT NULL OR away_team_api_id IS NOT NULL)
        GROUP BY 4
    ),
    cte_home_teams AS (
        SELECT
            home_team_api_id,
            away_team_api_id,
            (SELECT team_long_name FROM Relations WHERE Relations.team_api_id = home_team_api_id) AS home_team_long_name,
            (SELECT team_long_name FROM Relations WHERE Relations.team_api_id = away_team_api_id) AS away_team_long_name,
            league_id,
            MAX(season) AS season,
            ROUND(AVG(home_team_goal), 2) AS home_team_goal_avg
        FROM Match
        WHERE Match.date BETWEEN (SELECT seven_days_ago FROM dates) AND (SELECT today FROM dates)
            AND (
                (SELECT team_long_name FROM Relations WHERE Relations.team_api_id = home_team_api_id) IS NOT NULL
            )
        GROUP BY 1, 2, 3, 4
    ),
    cte_away_teams AS (
        SELECT
            home_team_api_id,
            away_team_api_id,
            (SELECT team_long_name FROM Relations WHERE Relations.team_api_id = home_team_api_id) AS home_team_long_name,
            (SELECT team_long_name FROM Relations WHERE Relations.team_api_id = away_team_api_id) AS away_team_long_name,
            league_id,
            MAX(season) AS season,
            ROUND(AVG(away_team_goal), 2) AS away_team_goal_avg
        FROM Match
        WHERE Match.date BETWEEN (SELECT seven_days_ago FROM dates) AND (SELECT today FROM dates)
            AND (
                (SELECT team_long_name FROM Relations WHERE Relations.team_api_id = away_team_api_id) IS NOT NULL
            )
        GROUP BY 1, 2, 3, 4
    )
    SELECT
        cte_home_teams.home_team_long_name AS home_team_long_name,
        cte_home_teams.home_team_goal_avg AS home_team_goal_avg,
        cte_match_by_country_avg.home_team_goal_avg_by_league,
        (SELECT home_team_goal_avg FROM cte_match_avg) AS home_total_avg,
        cte_home_teams.away_team_long_name AS away_team_long_name,
        cte_away_teams.away_team_goal_avg,
        cte_match_by_country_avg.away_team_goal_avg_by_league,
        (SELECT away_team_goal_avg FROM cte_match_avg) AS away_total_avg
    FROM cte_home_teams
    LEFT JOIN cte_match_by_country_avg ON cte_match_by_country_avg.league_id = cte_home_teams.league_id
    LEFT JOIN cte_away_teams ON cte_away_teams.home_team_api_id =  cte_home_teams.home_team_api_id
        AND cte_away_teams.away_team_api_id = cte_home_teams.away_team_api_id
    WHERE cte_away_teams.away_team_long_name IS NOT NULL
'''
df_avg_weekly = sql_df(query)
df_avg_weekly

Unnamed: 0,home_team_long_name,home_team_goal_avg,home_team_goal_avg_by_league,home_total_avg,away_team_long_name,away_team_goal_avg,away_team_goal_avg_by_league,away_total_avg
0,Ruch Chorzów,2.0,0.9,1.47,Lechia Gdańsk,1.0,0.8,1.18
1,Jagiellonia Białystok,0.0,0.9,1.47,Lech Poznań,3.0,0.8,1.18
2,Cracovia,2.0,0.9,1.47,Widzew Łódź,0.0,0.8,1.18
3,Neuchâtel Xamax,1.0,1.17,1.47,Grasshopper Club Zürich,1.0,2.17,1.18
4,Polonia Bytom,0.0,0.9,1.47,Ruch Chorzów,0.0,0.8,1.18
5,Polonia Bytom,1.0,0.9,1.47,Śląsk Wrocław,1.0,0.8,1.18
6,Bayer 04 Leverkusen,2.0,1.44,1.47,Borussia Dortmund,3.0,1.78,1.18
7,Odra Wodzisław,2.0,0.9,1.47,Legia Warszawa,0.0,0.8,1.18
8,Arka Gdynia,1.0,0.9,1.47,Jagiellonia Białystok,1.0,0.8,1.18
9,Arka Gdynia,1.0,0.9,1.47,Polonia Bytom,0.0,0.8,1.18
