# Análisis de datos: FIFA 23

In [1]:
import pandas as pd
import numpy as np

## Data

In [2]:
d_team = pd.read_csv("fullteam_df.csv")
d_player = pd.read_csv("players_fifa23.csv")

### Visualización y transformación para "d_team"

In [3]:
d_team.tail()

Unnamed: 0,ID,Name,Overall,Attack,Midfield,Defence,IntPrestige,Players,StartingAverageAge,AllTeamAverageAge
42,110081,Northern Ireland,70,66,70,72,2,26,28.6,28.7
43,1334,Finland,70,71,71,67,3,26,28.3,27.1
44,1413,China PR,70,71,69,70,3,26,30.3,30.4
45,1341,Iceland,69,69,68,68,2,26,25.2,26.4
46,111473,New Zealand,67,69,66,67,2,26,25.6,25.2


In [4]:
d_team.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  47 non-null     int64  
 1   Name                47 non-null     object 
 2   Overall             47 non-null     int64  
 3   Attack              47 non-null     int64  
 4   Midfield            47 non-null     int64  
 5   Defence             47 non-null     int64  
 6   IntPrestige         47 non-null     int64  
 7   Players             47 non-null     int64  
 8   StartingAverageAge  47 non-null     float64
 9   AllTeamAverageAge   47 non-null     float64
dtypes: float64(2), int64(7), object(1)
memory usage: 3.8+ KB


In [5]:
d_team.nunique().reset_index()

Unnamed: 0,index,0
0,ID,47
1,Name,47
2,Overall,16
3,Attack,19
4,Midfield,18
5,Defence,16
6,IntPrestige,10
7,Players,2
8,StartingAverageAge,28
9,AllTeamAverageAge,26


In [6]:
d_team.rename(columns = {'Name':'Country'}, inplace = True)

In [7]:
d_team.head()

Unnamed: 0,ID,Country,Overall,Attack,Midfield,Defence,IntPrestige,Players,StartingAverageAge,AllTeamAverageAge
0,1370,Brazil,85,85,85,83,10,26,28.6,27.6
1,1337,Germany,84,84,85,82,10,26,27.4,26.5
2,1369,Argentina,84,86,84,82,9,26,29.4,27.6
3,1318,England,83,86,83,83,8,26,26.6,26.1
4,1325,Belgium,83,84,81,79,8,26,28.2,27.6


Cada fila del dataset "d_tema" representa las caracterísiticas futbolísticas de un pais definido por su "ID" único. No existen valores vacíos (null) y los tipos de variables están asignados correctamente.

### Visualización y transformación para "d_player"

In [8]:
d_player.head()

Unnamed: 0,ID,Name,FullName,Age,Height,Weight,PhotoUrl,Nationality,Overall,Potential,...,LMRating,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating
0,158023,L. Messi,Lionel Messi,35,169,67,https://cdn.sofifa.net/players/158/023/23_60.png,Argentina,91,91,...,91,87,91,67,65,67,62,53,62,22
1,165153,K. Benzema,Karim Benzema,34,187,81,https://cdn.sofifa.net/players/165/153/23_60.png,France,91,91,...,89,84,89,67,67,67,63,58,63,21
2,188545,R. Lewandowski,Robert Lewandowski,33,185,81,https://cdn.sofifa.net/players/188/545/23_60.png,Poland,91,91,...,86,83,86,67,69,67,64,63,64,22
3,190871,Neymar Jr,Neymar da Silva Santos Jr.,30,175,68,https://cdn.sofifa.net/players/190/871/23_60.png,Brazil,91,91,...,91,85,91,69,66,69,65,54,65,23
4,192119,T. Courtois,Thibaut Courtois,30,199,96,https://cdn.sofifa.net/players/192/119/23_60.png,Belgium,91,92,...,34,36,34,32,34,32,32,32,32,91


Dimensiones del dataset

In [9]:
d_player.shape

(1221, 82)

No existen valores vacíos (null)

In [10]:
d_player.isnull().sum().sum()

0

Dado que el número de valores únicos en la columna ID (1220) no es el mismo que el número de filas del dataset (1221), entonces existe una valor que está duplicado.

In [11]:
d_player.nunique().reset_index()

Unnamed: 0,index,0
0,ID,1220
1,Name,1213
2,FullName,1220
3,Age,24
4,Height,41
...,...,...
77,RWBRating,68
78,LBRating,68
79,CBRating,69
80,RBRating,68


In [12]:
d_player[d_player["ID"].duplicated(keep = False)]

Unnamed: 0,ID,Name,FullName,Age,Height,Weight,PhotoUrl,Nationality,Overall,Potential,...,LMRating,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating
64,241461,Ferran Torres,Ferran Torres García,22,184,77,https://cdn.sofifa.net/players/241/461/23_60.png,Spain,85,90,...,86,79,86,65,61,65,61,52,61,22
85,241461,Ferran Torres,Ferran Torres García,22,184,77,https://cdn.sofifa.net/players/241/461/23_60.png,Spain,85,90,...,86,79,86,65,61,65,61,52,61,22


Eliminar registros duplicados del dataset

In [13]:
d_player.drop_duplicates(subset = 'ID', keep = 'first', inplace = True, ignore_index = True)

Nueva verificación

In [14]:
d_player.shape

(1220, 82)

In [15]:
d_player['ID'].nunique()

1220

### Análisis para "d_team"

In [79]:
d_team.tail()

Unnamed: 0,ID,Country,Overall,Attack,Midfield,Defence,IntPrestige,Players,StartingAverageAge,AllTeamAverageAge
42,110081,Northern Ireland,70,66,70,72,2,26,28.6,28.7
43,1334,Finland,70,71,71,67,3,26,28.3,27.1
44,1413,China PR,70,71,69,70,3,26,30.3,30.4
45,1341,Iceland,69,69,68,68,2,26,25.2,26.4
46,111473,New Zealand,67,69,66,67,2,26,25.6,25.2


TAREAS
- El puntaje pueden ser representado en un diagrama de dispersión.
- Los paises pueden agruparse en un conjuntos más grandes, por ejemplo: continentes.

In [66]:
(d_team['IntPrestige'].value_counts().
        reset_index().
        rename(columns = {'index':'IntPrestige','IntPrestige':'Count'}).
        sort_values(by = 'Count', ascending = False)
)

Unnamed: 0,IntPrestige,Count
0,5,11
1,4,8
2,6,6
3,3,5
4,8,4
5,2,4
6,10,3
7,9,3
8,7,2
9,1,1


#### Top 5 según valoraciones

In [18]:
d_team['Country'].nunique()

47

In [87]:
col_title = d_team.loc[:,'Overall':'Defence'].columns.tolist()

d_team01 = pd.DataFrame([])

top = 5

for i in col_title:
    
    aux01 = (d_team.loc[:,['Country',i]].
                 sort_values(by = i, ascending = False, ignore_index = True).
                 head(top))
    
    aux02 = pd.melt(frame = aux01, id_vars = ['Country'], value_vars = i, var_name = 'Type').reset_index()

    d_team01 = pd.concat([d_team01,aux02], ignore_index = True)
        
print(type(d_team01))
d_team01

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,index,Country,Type,value
0,0,Brazil,Overall,85
1,1,Argentina,Overall,84
2,2,Germany,Overall,84
3,3,England,Overall,83
4,4,Belgium,Overall,83
5,0,Argentina,Attack,86
6,1,England,Attack,86
7,2,France,Attack,86
8,3,Brazil,Attack,85
9,4,Belgium,Attack,84


Paises que figuran en el top 5 según el parámetro de calificación

In [81]:
d_team01['Country'].nunique()

10

In [82]:
d_team01['Country'].value_counts()

Brazil         4
Argentina      3
Germany        2
England        2
Belgium        2
Spain          2
Italy          2
France         1
Portugal       1
Netherlands    1
Name: Country, dtype: int64

Es posible crear tablas dinámicas y mapas de calor

In [56]:
pd.pivot_table(d_team01,
               index = ['Country'],
               columns = ['Type'],
               values = ['index'],
               aggfunc = [np.sum])

Unnamed: 0_level_0,sum,sum,sum,sum
Unnamed: 0_level_1,index,index,index,index
Type,Attack,Defence,Midfield,Overall
Country,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Argentina,0.0,,4.0,1.0
Belgium,4.0,,,4.0
Brazil,3.0,1.0,0.0,0.0
England,1.0,,,3.0
France,2.0,,,
Germany,,,2.0,2.0
Italy,,2.0,3.0,
Netherlands,,3.0,,
Portugal,,0.0,,
Spain,,4.0,1.0,


¿Por qué Alemania se encuentra en el tercer puesto del ranking general si solo destaca en el medio campo, mientras que España tiene más reconocimientos (y mejores) y no figura en dicho ranking?  

Los argumentos de la pregunta son legítimos y la primera hipóstesis es que la asignación de puestos en el ranking general no solo es el resultado de evaluar el ataque, defensa o medio campo, existen más variables por analizar.

TAREA

¿Cuál es el método estadístico más adecuado para saber si una puntuación es significativa para asegurar un puesto alto en el ranking general?

In [85]:
d_team.head()

Unnamed: 0,ID,Country,Overall,Attack,Midfield,Defence,IntPrestige,Players,StartingAverageAge,AllTeamAverageAge
0,1370,Brazil,85,85,85,83,10,26,28.6,27.6
1,1337,Germany,84,84,85,82,10,26,27.4,26.5
2,1369,Argentina,84,86,84,82,9,26,29.4,27.6
3,1318,England,83,86,83,83,8,26,26.6,26.1
4,1325,Belgium,83,84,81,79,8,26,28.2,27.6


In [96]:
col_title = d_team.loc[:,'Overall':'Defence'].columns.tolist()

d_team01 = pd.DataFrame([])

top = 5

for i in col_title:
    
    aux01 = (d_team.loc[:,['Country','IntPrestige',i]].
                 sort_values(by = i, ascending = False, ignore_index = True).
                 head(top))
    
    aux02 = pd.melt(frame = aux01, id_vars = ['Country','IntPrestige'], value_vars = i, var_name = 'Type').reset_index()

    d_team01 = pd.concat([d_team01,aux02], ignore_index = True)

print(d_team01.head())

pd.pivot_table(d_team01,
               index = ['Country','IntPrestige'],
               columns = ['Type'],
               values = ['index'],
               aggfunc = [np.sum]).sort_values(by = 'IntPrestige', ascending = False)

   index    Country  IntPrestige     Type  value
0      0     Brazil           10  Overall     85
1      1  Argentina            9  Overall     84
2      2    Germany           10  Overall     84
3      3    England            8  Overall     83
4      4    Belgium            8  Overall     83


Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,index,index,index,index
Unnamed: 0_level_2,Type,Attack,Defence,Midfield,Overall
Country,IntPrestige,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Brazil,10,3.0,1.0,0.0,0.0
France,10,2.0,,,
Germany,10,,,2.0,2.0
Argentina,9,0.0,,4.0,1.0
Italy,9,,2.0,3.0,
Spain,9,,4.0,1.0,
Belgium,8,4.0,,,4.0
England,8,1.0,,,3.0
Netherlands,8,,3.0,,
Portugal,8,,0.0,,
