In [130]:
import pandas as pd
import numpy as np

In [131]:
# File the raw dataset is read from.
CSV_FILE_NAME = "fifa21_male2.csv"
# File the processed dataset is written to.
CSV_OUTPUT_NAME = "processed.csv"

In [132]:
# Load the raw dataset named by CSV_FILE_NAME into `df`, the frame that the
# cells below clean and transform.
# NOTE(review): low_memory=False is presumably there to avoid mixed-type
# column inference on this file - confirm against the pandas read_csv docs.
df = pd.read_csv(CSV_FILE_NAME, low_memory=False)

# Limpieza de datos

In [133]:
columns_to_remove = [
    # First, the columns that contribute nothing to the model: ID, name and
    # other very specific information about the player himself, unrelated to
    # the position he plays.
    #
    # Note: "Position" is removed as well because it lists the positions the
    # player currently plays, and we believe it could make the model overfit.
    "ID", "Name", "Club", "Player Photo", "Position", "Club Logo", "Flag Photo",
    "Team & Contract", "Joined", "Loan Date End", "Value", "Wage",
    "Release Clause", "Contract",

    # Columns 65 through 91: a breakdown of each player's performance in every
    # position. It cannot be a model input because it is precisely what the
    # model has to predict.
    "LS", "ST", "RS", "LW", "LF", "CF", "RF", "RW", "LAM",
    "CAM", "RAM", "LM", "LCM", "CM", "RCM", "RM", "LWB", "LDM",
    "CDM", "RDM", "RWB", "LB", "LCB", "CB", "RCB", "RB", "GK",
]

In [134]:
# Remove every unwanted column with a single call. Deleting them one at a
# time would leave `df` half-cleaned if a later name were missing; drop()
# raises KeyError before anything is removed, so `df` is either fully
# cleaned or left untouched.
df = df.drop(columns=columns_to_remove)

In [135]:
# Maps each imputed column name to the value used to fill its missing entries.
filling_values = {}

# Walk the remaining columns and impute the ones that contain missing values.
for column in df.columns:
    nans_count = df[column].isna().sum()
    if nans_count == 0:
        continue

    if df[column].dtype == "object":
        # Categorical column: fill with the mode (first one when tied).
        filling_value = df[column].mode().iloc[0]
    else:
        # Numeric column: fill with the mean.
        filling_value = df[column].mean()

    print(f"{column} - {nans_count} NAN's - Filling value: {filling_value}")
    filling_values[column] = filling_value

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on `df[column]`: that form is chained assignment, which only updates a
    # temporary (and therefore does nothing) once pandas Copy-on-Write is on.
    df[column] = df[column].fillna(filling_value)

# Sanity check: the total number of missing values over all columns is 0.
assert df.isna().sum().sum() == 0, "some missing values were not imputed"

Volleys - 58 NAN's - Filling value: 45.00574207535009
Curve - 58 NAN's - Filling value: 49.57467627585399
Agility - 58 NAN's - Filling value: 64.60227339309779
Balance - 58 NAN's - Filling value: 64.71559149235367
Jumping - 58 NAN's - Filling value: 65.17021151930626
Interceptions - 7 NAN's - Filling value: 47.08558242785372
Positioning - 7 NAN's - Filling value: 52.372765509989485
Vision - 58 NAN's - Filling value: 55.44360461709732
Composure - 423 NAN's - Filling value: 59.94096515387379
Sliding Tackle - 58 NAN's - Filling value: 46.09972461475362
A/W - 89 NAN's - Filling value: Medium
D/W - 89 NAN's - Filling value: Medium


# Preprocessing

* #### Height - convert feet and inches to centimeters

In [136]:
def ft_to_cm(meassure):
    """
    Convert a height written in feet and inches to centimeters.

    Parameters
    ----------
    meassure : str
        Height formatted as F'I" where F is the number of feet and I the
        number of inches, e.g. 5'9" (5 feet and 9 inches). Surrounding
        whitespace is ignored, the closing double quote is optional and a
        missing inches part (as in 6') counts as 0 inches.

    Returns
    -------
    float
        The height in centimeters, rounded to 2 decimals.

    Raises
    ------
    ValueError
        If the feet/inches separator is missing or either part is not an
        integer.
    """
    FOOT_TO_CM_PROPORTION = 30.48
    INCH_TO_CM_PROPORTION = 2.54

    feet_text, separator, inches_text = meassure.strip().partition("'")
    if not separator:
        raise ValueError(
            f"Height {meassure!r} lacks the feet/inches separator (')"
        )

    # Remove the closing quote only when it is really there. Blindly cutting
    # the last character would turn 5'11 into 5 feet 1 inch.
    inches_text = inches_text.strip().rstrip('"').strip()

    feet = int(feet_text)
    inches = int(inches_text) if inches_text else 0

    height_in_cm = feet * FOOT_TO_CM_PROPORTION + inches * INCH_TO_CM_PROPORTION

    return round(height_in_cm, 2)

# Convert the 'Height' column value by value. Mapping over the column itself
# avoids building a full row object per player just to read one field.
heights_in_cm = df['Height'].map(ft_to_cm)
df['Height'] = heights_in_cm

* #### Weight - convert pounds to kilograms

In [137]:
def lb_to_kg(meassure):
    """
    Convert a weight expressed in pounds to kilograms.

    Expects text such as 159lbs: the number of pounds followed by the lbs
    unit. Whatever precedes the first lbs is parsed as an integer, and the
    result is returned in kilograms rounded to 2 decimals.
    """
    KG_PER_POUND = 0.453592

    pounds_text, _, _ = meassure.partition("lbs")
    return round(int(pounds_text) * KG_PER_POUND, 2)

# Convert the 'Weight' column value by value. Mapping over the column itself
# avoids building a full row object per player just to read one field.
weights_in_kg = df['Weight'].map(lb_to_kg)
df['Weight'] = weights_in_kg

* #### Hits - convert to a number (a trailing K means thousands)

In [138]:
def hit_to_int(hit):
    """
    Convert a hit counter to a number.

    Parameters
    ----------
    hit : str or number
        Either plain digits ("372") or a figure abbreviated with a K/k
        suffix meaning thousands ("1.6K"). Values that are already numeric
        are accepted too, so the conversion still works when the column was
        parsed as numbers or the cell is run a second time.

    Returns
    -------
    float
        The number of hits, with the thousands suffix expanded.

    Raises
    ------
    ValueError
        If the text before the suffix is not a valid number.
    """
    if not isinstance(hit, str):
        return float(hit)

    # Upper-casing first lets a single branch handle both 'K' and 'k'.
    number_text, suffix, _ = hit.upper().partition('K')
    multiplier = 1000.0 if suffix else 1.0

    return float(number_text) * multiplier
    
# Convert the 'Hits' column value by value. Mapping over the column itself
# avoids building a full row object per player just to read one field.
hits_as_numeric = df['Hits'].map(hit_to_int)
df['Hits'] = hits_as_numeric

* #### remove ★ icons from columns

In [139]:
def remove_star(messure):
    """
    Turn a star rating such as "1 ★" into the integer it shows.

    Every ★ character is dropped and the remaining text is parsed with int().
    """
    digits_only = "".join(messure.split('★'))
    return int(digits_only)

# The three star-rated columns get the same treatment, so convert them in one
# loop. Mapping over each column avoids building a full row object per player
# just to read one field.
for star_column in ('SM', 'IR', 'W/F'):
    df[star_column] = df[star_column].map(remove_star)

# Save processed CSV

In [140]:
# Persist the processed frame to the file named by CSV_OUTPUT_NAME;
# index=False keeps the DataFrame's row index out of the written file.
df.to_csv(CSV_OUTPUT_NAME, index=False)