In [940]:
# imports

import pandas as pd

In [941]:
# Import file
# The path is relative, so the CSV is looked up in the kernel's working directory.
data=pd.read_csv('fifa21_train.csv')
# Trailing expression: the cell displays the (rows, columns) tuple of the loaded frame.
data.shape

(11701, 101)

In [942]:
# Standardize headers

def standardize_header(dataframe):
    """Normalize the column labels of ``dataframe`` in place.

    Every label is lower-cased and each space is replaced by an underscore.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    None
        The frame is modified in place; nothing is returned.
    """
    # Read the labels from the argument itself rather than from a notebook-level
    # variable, so the helper works on any frame it is given.
    dataframe.columns = [c.lower().replace(' ', '_') for c in dataframe.columns]

# Called for its side effect: the helper assigns new column labels on `data` and returns None.
standardize_header(data)

In [943]:
# Drop duplicate columns

def drop_duplicate_columns(dataframe):
    """Return ``dataframe`` without columns whose values repeat an earlier column.

    Columns are compared by content (not by label): the frame is transposed so
    that each column becomes a row, and rows already seen are marked as
    duplicates. The first occurrence of each distinct column is kept.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Selection of the non-duplicated columns, in their original order.
    """
    is_repeat = dataframe.T.duplicated(keep='first')
    keep_mask = ~is_repeat
    return dataframe.loc[:, keep_mask]

# The helper returns a new selection instead of editing its argument, hence the reassignment.
data = drop_duplicate_columns(data)
# Trailing expression: the cell displays the shape after the duplicate columns are removed.
data.shape

(11701, 85)

In [944]:
# Standardize headers kept after drop of duplicate columns

def map_columns(dataframe):
    """Return a copy of ``dataframe`` with ten abbreviated labels renamed.

    The labels ``ls, lw, lf, lam, lm, lcm, lwb, ldm, lb, lcb`` are replaced by
    longer names; labels that are absent from the frame are ignored by
    ``DataFrame.rename`` and all other columns keep their name.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns.
    """
    short_labels = ('ls', 'lw', 'lf', 'lam', 'lm', 'lcm', 'lwb', 'ldm', 'lb', 'lcb')
    long_labels = (
        'striker', 'wing', 'forward', 'att_mid', 'side_mid',
        'center_mid', 'back_wing', 'def_wing', 'def_laterals', 'center_back',
    )
    # Pair the two tuples position by position to build the rename table.
    renames = dict(zip(short_labels, long_labels))
    return dataframe.rename(columns=renames)

# `rename` returns a new frame, so the result has to be bound back to `data`.
data = map_columns(data)

In [945]:
# Drop useless columns

def drop_useless_cols(dataframe):
    """Return ``dataframe`` without a fixed list of thirteen columns.

    Every listed label must exist in the frame; ``DataFrame.drop`` raises
    ``KeyError`` otherwise.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New frame without the listed columns.
    """
    unwanted = [
        'id', 'name', 'age', 'nationality', 'club', 'position',
        'team_&_contract', 'joined', 'loan_date_end', 'value', 'wage',
        'release_clause', 'contract',
    ]
    return dataframe.drop(columns=unwanted)
    
# `drop` returns a new frame, so the result has to be bound back to `data`.
data = drop_useless_cols(data)
# Trailing expression: the cell displays the shape after the columns are dropped.
data.shape

(11701, 72)

In [946]:
# Convert 'height' into cm

def convert_into_cm(dataframe):
    """Convert the ``height`` column from feet/inches text to centimetres, in place.

    Values are expected to look like ``5'10"``: a whole number of feet, an
    apostrophe, and an optional whole number of inches. Feet and inches are
    parsed separately and converted with 1 ft = 30.48 cm and 1 in = 2.54 cm,
    then rounded to one decimal. Reading ``5'10"`` as the decimal ``5.10`` feet
    would be wrong: it treats inches as a decimal fraction and makes ``5'1"``
    and ``5'10"`` the same height.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string column ``height``; missing values stay missing.

    Returns
    -------
    None
        ``dataframe['height']`` is overwritten with float centimetres.

    Raises
    ------
    ValueError
        If a non-missing height does not match the feet/inches pattern.
    """
    raw = dataframe['height']
    parts = raw.str.extract(r"^\s*(\d+)\s*'\s*(\d+)?")

    # Fail loudly on text we cannot parse instead of silently producing NaN.
    unparsed = raw.notna() & parts[0].isna()
    if unparsed.any():
        raise ValueError(
            f"cannot parse height value(s): {raw[unparsed].unique().tolist()}"
        )

    feet = parts[0].astype(float)
    # A height written without inches (e.g. "6'") counts as zero inches.
    inches = parts[1].astype(float).fillna(0.0)
    dataframe['height'] = (feet * 30.48 + inches * 2.54).round(1)

# Called for its side effect: the helper overwrites data['height'] and returns None.
convert_into_cm(data)

In [947]:
# Convert 'weight' into kg

def convert_int_kg(dataframe):
    """Convert the ``weight`` column from pound strings to kilograms, in place.

    The ``lbs`` suffix is stripped, the remainder is parsed as a float,
    multiplied by 0.453592 (kilograms per pound) and rounded to one decimal.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string column ``weight``.

    Returns
    -------
    None
        ``dataframe['weight']`` is overwritten with float kilograms.
    """
    pounds = dataframe['weight'].str.replace('lbs', '').astype(float)
    kilograms = pounds * 0.453592
    dataframe['weight'] = kilograms.round(1)

# Called for its side effect: the helper overwrites data['weight'] and returns None.
convert_int_kg(data)

In [948]:
# Standardize 'foot'

def standardize_foot(dataframe):
    """Encode the ``foot`` column as integers, in place.

    ``'Left'`` becomes 0 and ``'Right'`` becomes 1. Any other value is turned
    into NaN by ``Series.map``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``foot`` column.

    Returns
    -------
    None
        ``dataframe['foot']`` is overwritten.
    """
    foot_codes = {'Left': 0, 'Right': 1}
    dataframe['foot'] = dataframe['foot'].map(foot_codes)
    
# Called for its side effect: the helper overwrites data['foot'] and returns None.
standardize_foot(data)

In [949]:
# Fill nan values for 'volleys', 'curve', 'agility', 'balance', 'jumping', 'vision', 'sliding_tackle', 'interceptions', 'positioning', 'composure'

def fill_nan_subset_mean(dataframe):
    """Fill missing values in ten numeric columns with the mean of the row's ``bp`` group.

    For each target column, every ``bp`` value that has at least one missing
    entry is visited; the missing entries of that group receive the mean of
    the group's observed values for the column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``bp`` and all ten target columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    targets = ['volleys', 'curve', 'agility', 'balance', 'jumping', 'vision', 'sliding_tackle', 'interceptions', 'positioning', 'composure']
    for name in targets:
        missing = dataframe[name].isna()
        # Only groups that actually contain a gap need a mean.
        for group in dataframe.loc[missing, 'bp'].unique():
            in_group = dataframe['bp'] == group
            group_mean = dataframe.loc[in_group, name].mean()
            dataframe.loc[missing & in_group, name] = group_mean

    return dataframe

# The helper fills `data` in place and also returns it, so `data` stays bound to the same object.
data = fill_nan_subset_mean(data)

In [950]:
# Fill nan values for 'a/w' & 'd/w'

def fill_nan_subset_mode(dataframe):
    """Fill missing ``a/w`` and ``d/w`` values with the most frequent value of the row's ``bp`` group.

    For each of the two columns, every ``bp`` value that has at least one
    missing entry is visited; the missing entries of that group receive the
    group's mode (the first one when several values tie). A group with no
    observed value at all has no mode, so its entries are left missing rather
    than raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``bp``, ``a/w`` and ``d/w``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    column_names = ['a/w', 'd/w']
    for column in column_names:
        missing = dataframe[column].isna()
        for bp in dataframe.loc[missing, 'bp'].unique():
            in_group = dataframe['bp'] == bp
            modes = dataframe.loc[in_group, column].mode()
            # `mode()` is empty when the group holds only NaN (or when `bp`
            # itself is NaN and matches no row); indexing it would raise.
            if modes.empty:
                continue
            dataframe.loc[missing & in_group, column] = modes.iloc[0]

    return dataframe
            
# The helper fills `data` in place and also returns it, so `data` stays bound to the same object.
data = fill_nan_subset_mode(data)

In [951]:
# Replace 'a/w' & 'd/w' with int values

def map_string_into_int(dataframe):
    """Encode the ``a/w`` and ``d/w`` columns as integers, in place.

    ``'Low'`` becomes 0, ``'Medium'`` becomes 1 and ``'High'`` becomes 2. Any
    other value is turned into NaN by ``Series.map``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``a/w`` and ``d/w`` columns.

    Returns
    -------
    None
        Both columns are overwritten.
    """
    level_codes = {'Low': 0, 'Medium': 1, 'High': 2}
    for name in ('a/w', 'd/w'):
        dataframe[name] = dataframe[name].map(level_codes)
        
# Called for its side effect: the helper overwrites data['a/w'] and data['d/w'] and returns None.
map_string_into_int(data)

In [952]:
# Remove stars in 'w/f', 'sm' & 'ir'

def remove_stars(dataframe):
    """Strip the star glyph from ``w/f``, ``sm`` and ``ir`` and cast them to int, in place.

    Each value has every ``★`` and every space removed; what remains is
    converted with ``astype(int)``, which raises if it is not an integer
    literal.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with string columns ``w/f``, ``sm`` and ``ir``.

    Returns
    -------
    None
        The three columns are overwritten.
    """
    for name in ('w/f', 'sm', 'ir'):
        without_star = dataframe[name].str.replace('★', '')
        digits_only = without_star.str.replace(' ', '')
        dataframe[name] = digits_only.astype(int)
        
# Called for its side effect: the helper overwrites data['w/f'], data['sm'] and data['ir'] and returns None.
remove_stars(data)

In [953]:
# Standardize 'hits'

def internal_standardize_hits(x):
    """Convert one raw ``hits`` value to an integer count.

    Accepted inputs:

    * a string with a ``K`` (or ``k``) suffix, e.g. ``'1.6K'`` -> ``1600``;
    * a string holding a plain integer, e.g. ``'372'`` -> ``372``;
    * a number (Python or numpy int/float), converted with ``int``;
    * a missing value (NaN/None), returned unchanged so it can be handled by
      a later imputation step.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    int
        The count, or ``x`` itself when ``x`` is missing.

    Raises
    ------
    ValueError
        If a string is neither an integer literal nor a number followed by K.
    """
    # Anything that is not text cannot carry a 'K' suffix; testing `'K' in x`
    # on a float or numpy scalar would raise TypeError.
    if not isinstance(x, str):
        if pd.isna(x):
            return x
        return int(x)

    text = x.strip().upper()
    if 'K' in text:
        # round() before int(): the float product can land just below the
        # intended integer, and plain truncation would then be off by one.
        return int(round(float(text.replace('K', '')) * 1000))
    return int(text)
    
def standardize_hits(dataframe):
    """Convert every value of the ``hits`` column with ``internal_standardize_hits``, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``hits`` column.

    Returns
    -------
    None
        ``dataframe['hits']`` is overwritten with the converted values.
    """
    converted = dataframe['hits'].apply(internal_standardize_hits)
    dataframe['hits'] = converted

# Called for its side effect: the helper overwrites data['hits'] and returns None.
standardize_hits(data)

In [954]:
# Evaluate 'striker', 'wing', 'forward', 'att_mid', 'side_mid', 'center_mid', 'back_wing', 'def_wing', 'def_laterals', 'center_back'

def evaluate_columns(dataframe):
    """Replace ten string columns by the result of evaluating each cell, in place.

    Every cell of the listed columns is passed to the built-in ``eval``, so a
    cell such as ``'60+2'`` becomes ``62``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing all ten listed columns, holding expression strings.

    Returns
    -------
    None
        The ten columns are overwritten with the evaluated values.
    """
    targets = (
        'striker', 'wing', 'forward', 'att_mid', 'side_mid',
        'center_mid', 'back_wing', 'def_wing', 'def_laterals', 'center_back',
    )
    for name in targets:
        # NOTE(review): eval() runs whatever Python expression the cell
        # contains. That is only acceptable for a trusted CSV; consider a
        # restricted parser before using this on files from other sources.
        evaluated = dataframe[name].apply(eval)
        dataframe[name] = evaluated

# Called for its side effect: the helper overwrites the ten listed columns of `data` and returns None.
evaluate_columns(data)

In [955]:
# Check nan values

def check_nan_values(dataframe):
    """Print the number of missing values of every column.

    One line is written per column, in column order, formatted as
    ``<label> : <count>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    None
    """
    for label in dataframe.columns:
        n_missing = dataframe[label].isna().sum()
        print(f'{label} : {n_missing}')
        
# Prints one "<column> : <NaN count>" line per column of `data`.
check_nan_values(data)

bp : 0
height : 0
weight : 0
foot : 0
growth : 0
attacking : 0
crossing : 0
finishing : 0
heading_accuracy : 0
short_passing : 0
volleys : 0
skill : 0
dribbling : 0
curve : 0
fk_accuracy : 0
long_passing : 0
ball_control : 0
movement : 0
acceleration : 0
sprint_speed : 0
agility : 0
reactions : 0
balance : 0
power : 0
shot_power : 0
jumping : 0
stamina : 0
strength : 0
long_shots : 0
mentality : 0
aggression : 0
interceptions : 0
positioning : 0
vision : 0
penalties : 0
composure : 0
defending : 0
marking : 0
standing_tackle : 0
sliding_tackle : 0
goalkeeping : 0
gk_diving : 0
gk_handling : 0
gk_kicking : 0
gk_positioning : 0
gk_reflexes : 0
total_stats : 0
base_stats : 0
w/f : 0
sm : 0
a/w : 0
d/w : 0
ir : 0
pac : 0
sho : 0
pas : 0
dri : 0
def : 0
phy : 0
hits : 0
striker : 0
wing : 0
forward : 0
att_mid : 0
side_mid : 0
center_mid : 0
back_wing : 0
def_wing : 0
def_laterals : 0
center_back : 0
gk : 0
ova : 0


In [956]:
# Check columns type

def check_columns_type(dataframe):
    """Print the dtype of every column.

    One line is written per column, in column order, formatted as
    ``<label> : <dtype>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    None
    """
    for label in dataframe.columns:
        column_dtype = dataframe[label].dtypes
        print(f'{label} : {column_dtype}')
        
# Prints one "<column> : <dtype>" line per column of `data`.
check_columns_type(data)

bp : object
height : float64
weight : float64
foot : int64
growth : int64
attacking : int64
crossing : int64
finishing : int64
heading_accuracy : int64
short_passing : int64
volleys : float64
skill : int64
dribbling : int64
curve : float64
fk_accuracy : int64
long_passing : int64
ball_control : int64
movement : int64
acceleration : int64
sprint_speed : int64
agility : float64
reactions : int64
balance : float64
power : int64
shot_power : int64
jumping : float64
stamina : int64
strength : int64
long_shots : int64
mentality : int64
aggression : int64
interceptions : float64
positioning : float64
vision : float64
penalties : int64
composure : float64
defending : int64
marking : int64
standing_tackle : int64
sliding_tackle : float64
goalkeeping : int64
gk_diving : int64
gk_handling : int64
gk_kicking : int64
gk_positioning : int64
gk_reflexes : int64
total_stats : int64
base_stats : int64
w/f : int64
sm : int64
a/w : int64
d/w : int64
ir : int64
pac : int64
sho : int64
pas : int64
dri : in

In [957]:
# Re-read the CSV so that `data` is the untouched file again; the cells above
# have already transformed the previous `data`, and the pipeline below starts from raw input.
data=pd.read_csv('fifa21_train.csv')

def preprocess(dataframe):
    """Run the whole cleaning pipeline on a raw frame and return the cleaned frame.

    The steps run in this fixed order:

    1. ``standardize_header`` (rewrites the column labels of the frame that
       was passed in);
    2. ``drop_duplicate_columns``, ``map_columns``, ``drop_useless_cols``
       (each returns a new frame);
    3. ``convert_into_cm``, ``convert_int_kg``, ``standardize_foot`` (in place);
    4. ``fill_nan_subset_mean``, ``fill_nan_subset_mode`` (fill and return);
    5. ``map_string_into_int``, ``remove_stars``, ``standardize_hits``,
       ``evaluate_columns`` (in place).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw frame as loaded from the CSV. Its column labels are modified by
        step 1.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Step 1: label clean-up happens on the caller's object.
    standardize_header(dataframe)

    # Step 2: structural changes; each call hands back a new frame.
    dataframe = drop_useless_cols(map_columns(drop_duplicate_columns(dataframe)))

    # Step 3: unit and encoding conversions that edit the frame directly.
    for in_place_step in (convert_into_cm, convert_int_kg, standardize_foot):
        in_place_step(dataframe)

    # Step 4: imputation; mean-based columns first, then mode-based ones.
    dataframe = fill_nan_subset_mode(fill_nan_subset_mean(dataframe))

    # Step 5: remaining text-to-number conversions, again in place.
    for in_place_step in (map_string_into_int, remove_stars, standardize_hits, evaluate_columns):
        in_place_step(dataframe)

    return dataframe

# Rebind `data` to the frame returned by the full pipeline.
data = preprocess(data)

# Check nan values
# Prints one "<column> : <NaN count>" line per column between the two banner lines.
print('==> NaN value check <==')
check_nan_values(data)
print('=========================')

# Check columns type
# Prints one "<column> : <dtype>" line per column between the two banner lines.
print('==> Column type check <==')
check_columns_type(data)
print('=========================')

==> NaN value check <==
bp : 0
height : 0
weight : 0
foot : 0
growth : 0
attacking : 0
crossing : 0
finishing : 0
heading_accuracy : 0
short_passing : 0
volleys : 0
skill : 0
dribbling : 0
curve : 0
fk_accuracy : 0
long_passing : 0
ball_control : 0
movement : 0
acceleration : 0
sprint_speed : 0
agility : 0
reactions : 0
balance : 0
power : 0
shot_power : 0
jumping : 0
stamina : 0
strength : 0
long_shots : 0
mentality : 0
aggression : 0
interceptions : 0
positioning : 0
vision : 0
penalties : 0
composure : 0
defending : 0
marking : 0
standing_tackle : 0
sliding_tackle : 0
goalkeeping : 0
gk_diving : 0
gk_handling : 0
gk_kicking : 0
gk_positioning : 0
gk_reflexes : 0
total_stats : 0
base_stats : 0
w/f : 0
sm : 0
a/w : 0
d/w : 0
ir : 0
pac : 0
sho : 0
pas : 0
dri : 0
def : 0
phy : 0
hits : 0
striker : 0
wing : 0
forward : 0
att_mid : 0
side_mid : 0
center_mid : 0
back_wing : 0
def_wing : 0
def_laterals : 0
center_back : 0
gk : 0
ova : 0
==> Column type check <==
bp : object
height : float