In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from helpers import clean_column, count_positions

In [2]:
# Load the raw player-ratings CSV and take a quick look at the first rows.
df = pd.read_csv(filepath_or_buffer='data/EAFC26-Men_raw.csv')
df.head(n=5)

Unnamed: 0,ID,Rank,Name,GENDER,OVR,PAC,SHO,PAS,DRI,DEF,...,League,Team,play style,url,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,card
0,209331,1,Mohamed Salah,M,91,89,88,86,90,45,...,Premier League,Liverpool,"['Finesse Shot+', 'First Touch', 'Gamechanger'...",https://www.ea.com/games/ea-sports-fc/ratings/...,,,,,,https://ratings-images-prod.pulse.ea.com/FC26/...
1,231747,3,Kylian Mbappé,M,91,97,90,81,92,37,...,LALIGA EA SPORTS,Real Madrid,"['Quick Step+', 'Acrobatic', 'Finesse Shot', '...",https://www.ea.com/games/ea-sports-fc/ratings/...,,,,,,https://ratings-images-prod.pulse.ea.com/FC26/...
2,231443,5,Ousmane Dembélé,M,90,91,88,83,93,50,...,Ligue 1 McDonald's,Paris SG,"['Rapid+', 'Inventive', 'Low Driven Shot', 'Pi...",https://www.ea.com/games/ea-sports-fc/ratings/...,,,,,,https://ratings-images-prod.pulse.ea.com/FC26/...
3,231866,6,Rodri,M,90,65,80,86,84,86,...,Premier League,Manchester City,"['Tiki Taka+', 'Aerial Fortress', 'Bruiser', '...",https://www.ea.com/games/ea-sports-fc/ratings/...,,,,,,https://ratings-images-prod.pulse.ea.com/FC26/...
4,203376,8,Virgil van Dijk,M,90,73,60,72,72,90,...,Premier League,Liverpool,"['Intercept+', 'Aerial Fortress', 'Anticipate'...",https://www.ea.com/games/ea-sports-fc/ratings/...,,,,,,https://ratings-images-prod.pulse.ea.com/FC26/...


In [3]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16228 entries, 0 to 16227
Data columns (total 59 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     16228 non-null  int64  
 1   Rank                   16228 non-null  int64  
 2   Name                   16228 non-null  object 
 3   GENDER                 16228 non-null  object 
 4   OVR                    16228 non-null  int64  
 5   PAC                    16228 non-null  int64  
 6   SHO                    16228 non-null  int64  
 7   PAS                    16228 non-null  int64  
 8   DRI                    16228 non-null  int64  
 9   DEF                    16228 non-null  int64  
 10  PHY                    16228 non-null  int64  
 11  Acceleration           16228 non-null  int64  
 12  Sprint Speed           16228 non-null  int64  
 13  Positioning            16228 non-null  int64  
 14  Finishing              16228 non-null  int64  
 15  Sh

In [4]:
# Missing-value check: count nulls per column and report only the columns
# that actually contain some.
missing_counts = df.isna().sum()
print('Valores Faltantes:')
print(missing_counts.loc[missing_counts.gt(0)])

Valores Faltantes:
Alternative positions     5743
GK Diving                14412
GK Handling              14412
GK Kicking               14412
GK Positioning           14412
GK Reflexes              14412
dtype: int64


In [5]:
# Drop the columns that are not needed for the rest of the notebook.
df = df.drop(
    labels=["ID", "card", "url", "GENDER", "play style"],
    axis="columns",
)

In [6]:
# Goalkeeper-specific attribute columns.
gk_cols = [
    "GK Diving",
    "GK Handling",
    "GK Kicking",
    "GK Positioning",
    "GK Reflexes",
]

# Replace missing values in those columns with 0.
# NOTE(review): presumably these are missing for every non-goalkeeper row
# — confirm against the data.
df[gk_cols] = df.loc[:, gk_cols].fillna(value=0)


In [7]:
# Manual mapping from a player's position code to a pitch sector.
sector_map = {
    'GK': 'goalie',

    'CB': 'defense',
    'LB': 'defense',
    'RB': 'defense',

    'CDM': 'midfielder',
    'CM': 'midfielder',
    'CAM': 'midfielder',
    'RM': 'midfielder',
    'LM': 'midfielder',

    'ST': 'forward',
    'LW': 'forward',
    'RW': 'forward'
}

# Create the "sector" column from each player's position.
df['sector'] = df['Position'].map(sector_map)

# Series.map yields NaN for any position that is not a key of sector_map.
# Fail loudly here instead of letting those rows continue without a sector
# (they would later get all-zero sector dummies).
if df['sector'].isna().any():
    raise ValueError(
        "Positions without a sector mapping: "
        f"{sorted(df.loc[df['sector'].isna(), 'Position'].astype(str).unique())}"
    )


In [8]:
# Versatility is defined as the number of other positions the player can
# play, derived from "Alternative positions" by the count_positions helper.
# NOTE(review): that column contains missing values; presumably
# count_positions handles them — confirm in helpers.
df = df.assign(
    versatility=df['Alternative positions'].apply(count_positions)
)


In [9]:
# Keep only the numeric part of height (the digits before "cm") and weight
# (the digits before "kg") as integers, then rename both columns so the unit
# is part of the column name.
# expand=False makes str.extract return a Series for the single capture group.
df = (
    df.assign(
        Height=df['Height'].str.extract(r'(\d+)\s*cm', expand=False).astype(int),
        Weight=df['Weight'].str.extract(r'(\d+)\s*kg', expand=False).astype(int),
    )
    .rename(columns={'Height': 'Height(cm)', 'Weight': 'Weight(kg)'})
)

In [10]:
# Encode the preferred foot as a binary flag: 1 when the value is "Right",
# 0 for any other value.
df['Preferred foot'] = df['Preferred foot'].eq("Right").astype(int)

In [11]:
# Normalize every column name with the clean_column helper.
# NOTE(review): later cells read 'sho', 'def' and 'dri', so clean_column
# presumably lowercases names — confirm in helpers.
df.columns = list(map(clean_column, df.columns))

In [12]:
# One-hot encode the categorical "sector" feature as integer 0/1 columns
# named sector_<value> (e.g. sector_goalie).
df = pd.get_dummies(df, columns=['sector'], dtype=int)

In [13]:
# 1. Mask selecting every player that is NOT a goalkeeper.
nao_goleiro = df['sector_goalie'].ne(1)

# 2. Offensive flag: `sho` OR `dri` exceeds 1.2 x `def`.
# The goalkeeper mask forces goalkeepers to 0.
df['offensive'] = (
    nao_goleiro
    & (
        df['sho'].gt(df['def'].mul(1.2))
        | df['dri'].gt(df['def'].mul(1.2))
    )
).astype(int)

# 3. Defensive flag: `def` exceeds 1.2 x `sho` AND 1.2 x `dri`
# (goalkeepers again forced to 0).
df['defensive'] = (
    nao_goleiro
    & df['def'].gt(df['sho'].mul(1.2))
    & df['def'].gt(df['dri'].mul(1.2))
).astype(int)

# 4. All-around flag: neither offensive nor defensive.
# The goalkeeper mask is required here too; otherwise a goalkeeper (0 in both
# flags above) would be marked as all-around.
df['all_around'] = (
    nao_goleiro
    & df[['offensive', 'defensive']].eq(0).all(axis=1)
).astype(int)

In [14]:
# Pickle file to save the cleaned data.
df.to_pickle(path='data/df_cleaned.pkl')

# CSV copy for anyone who wants to read it quickly (row index not written).
df.to_csv(path_or_buf='data/df_cleaned.csv', index=False)