# Data Preprocessing

Drop irrelevant fields: name, ID, photo links, club logo, etc.

Normalize or scale numerical columns (MinMax or Z-score).

Create physical features: BMI = weight (kg) / height (m)² — the dataset stores height in centimetres (`height_cm`), so convert to metres first.

Categorize skills: defensive, passing, shooting, dribbling, etc.

Possibly use dimensionality reduction (PCA) for visualization.

## Import Required Libraries

In [67]:
import pandas as pd
import numpy as np

## Load the Dataset

In [68]:
# Load the raw player export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which is what pandas recommends when it reports
# mixed-type columns.
# NOTE(review): the captured output under this cell looks like the tail of such
# a DtypeWarning - confirm, and consider passing an explicit dtype= mapping.
players_df = pd.read_csv("../data/raw/male_players (legacy).csv", low_memory=False)

  players_df = pd.read_csv("../data/raw/male_players (legacy).csv")


## Drop Unnecessary Columns

In [69]:
# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Report the shape of the data as loaded.
rows_before, cols_before = players_df.shape
print("-----BEFORE CLEANING-----")
print(f"Rows: {rows_before}")
print(f"Columns: {cols_before}\n")

# Select columns to drop - each one is either unnecessary or gives away the
# target variable.
columns_to_drop = [
    "player_positions", "nation_position", "club_jersey_number",
    "nation_jersey_number",
    "ls", "st", "rs", "lw", "lf", "cf", "rf", "rw", "lam", "cam", "ram",
    "lm", "lcm", "cm", "rcm", "rm", "lwb", "ldm", "cdm", "rdm", "rwb",
    "lb", "lcb", "cb", "rcb", "rb", "gk",
    "goalkeeping_diving", "goalkeeping_handling", "goalkeeping_kicking",
    "goalkeeping_positioning", "goalkeeping_reflexes", "goalkeeping_speed",
    "player_face_url", "player_url", "player_tags", "player_traits",
    "dob", "club_joined_date", "potential", "player_id", "fifa_version",
    "fifa_update", "fifa_update_date", "long_name",
    "league_id", "league_name", "league_level",
    "club_team_id", "club_name", "club_loaned_from",
    "club_contract_valid_until_year",
    "nationality_id", "nationality_name", "nation_team_id",
    "international_reputation", "real_face", "release_clause_eur",
    "mentality_composure",
]

# Remove the listed columns, then discard every row that still has a missing
# value in any of the remaining columns.
players_df = players_df.drop(columns=columns_to_drop).dropna()

# Report the shape again so the effect of the cleaning is visible.
rows_after, cols_after = players_df.shape
print("-----AFTER CLEANING-----")
print(f"Rows: {rows_after}")
print(f"Columns: {cols_after}\n")

-----BEFORE CLEANING-----
Rows: 161583
Columns: 110

-----AFTER CLEANING-----
Rows: 141822
Columns: 47



## Update Data Types

In [70]:
# The club position column is the label used from here on, so give it the
# name the rest of the notebook refers to.
players_df.rename(columns={"club_position": "specific_position"}, inplace=True)

# Ensure every text column really holds Python strings.
text_columns = [
    "specific_position",
    "short_name",
    "preferred_foot",
    "work_rate",
    "body_type",
]
for text_column in text_columns:
    players_df[text_column] = players_df[text_column].astype(str)

# Display the resulting dtypes for a visual check.
players_df.dtypes

short_name                      object
overall                          int64
value_eur                      float64
wage_eur                       float64
age                              int64
height_cm                        int64
weight_kg                        int64
specific_position               object
preferred_foot                  object
weak_foot                        int64
skill_moves                      int64
work_rate                       object
body_type                       object
pace                           float64
shooting                       float64
passing                        float64
dribbling                      float64
defending                      float64
physic                         float64
attacking_crossing               int64
attacking_finishing              int64
attacking_heading_accuracy       int64
attacking_short_passing          int64
attacking_volleys                int64
skill_dribbling                  int64
skill_curve              

## Simplify Categorical Variable Columns

In [71]:
# Print the unique values of each string column, one banner per column.
# The leading "\n" on all but the first banner separates the sections.
unique_value_sections = (
    ("-----UNIQUE POSITION VALUES-----", "specific_position"),
    ("\n-----UNIQUE PREFERRED FOOT VALUES-----", "preferred_foot"),
    ("\n-----UNIQUE WORK RATE VALUES-----", "work_rate"),
    ("\n-----UNIQUE BODY TYPE VALUES-----", "body_type"),
)
for section_banner, section_column in unique_value_sections:
    print(section_banner)
    print(players_df[section_column].unique())

-----UNIQUE POSITION VALUES-----
['CF' 'LW' 'SUB' 'ST' 'LCM' 'RES' 'RS' 'LM' 'RCM' 'LCB' 'CAM' 'RW' 'LS'
 'LDM' 'RCB' 'CB' 'CDM' 'CM' 'RDM' 'RB' 'RM' 'LB' 'RAM' 'LAM' 'RWB' 'RF'
 'LF' 'LWB']

-----UNIQUE PREFERRED FOOT VALUES-----
['Left' 'Right']

-----UNIQUE WORK RATE VALUES-----
['Medium/Low' 'High/Low' 'High/Medium' 'High/High' 'Medium/High'
 'Medium/Medium' 'Low/Medium' 'Low/High' 'Low/Low']

-----UNIQUE BODY TYPE VALUES-----
['Normal (170-)' 'Normal (185+)' 'Normal (170-185)' 'Lean (170-185)'
 'Lean (185+)' 'Stocky (170-185)' 'Stocky (170-)' 'Stocky (185+)'
 'Lean (170-)' 'Unique']


In [72]:
# Keep only rows whose position is an actual on-pitch role.
# "SUB" and "RES" are presumably substitutes and reserves - confirm.
bench_codes = ["SUB", "RES"]
is_bench_row = players_df["specific_position"].isin(bench_codes)
players_df = players_df[~is_bench_row]

# Simplified position label -> the detailed position codes folded into it.
specific_position_map = {
    "ST": ["CF", "ST", "RS", "LS"],
    "LW": ["LW", "LF"],
    "RW": ["RW", "RF"],
    "LM": ["LCM", "LM", "LDM", "LAM"],
    "RM": ["RCM", "RDM", "RM", "RAM"],
    "CB": ["LCB", "RCB", "CB"],
    "CM": ["CAM", "CDM", "CM"],
    "RB": ["RB", "RWB"],
    "LB": ["LB", "LWB"]
}

# Reverse index (detailed code -> simplified label), built once so that each
# per-row lookup is a single dict access instead of a scan over every group.
# setdefault keeps the first group that lists a code, which is the same answer
# a top-to-bottom scan of specific_position_map would give.
_position_lookup = {}
for _simplified_label, _detailed_codes in specific_position_map.items():
    for _detailed_code in _detailed_codes:
        _position_lookup.setdefault(_detailed_code, _simplified_label)


def map_position(pos):
    """Return the simplified position label for a detailed position code.

    Args:
        pos: A detailed position code such as "LCM" (must be hashable).

    Returns:
        The key of ``specific_position_map`` whose list contains ``pos``,
        or ``pos`` itself, unchanged, when no group lists it.
    """
    return _position_lookup.get(pos, pos)

# Collapse the detailed position codes into the simplified labels.
players_df["specific_position"] = players_df["specific_position"].apply(map_position)

# Simplified position -> broad role on the pitch.
generic_position_map = {
    **dict.fromkeys(("LB", "RB", "CB"), "defender"),
    **dict.fromkeys(("LM", "RM", "CM"), "midfielder"),
    **dict.fromkeys(("LW", "RW", "ST"), "attacker"),
}

# Derive the broad role from the simplified position; a position missing
# from the mapping becomes NaN.
players_df["generic_position"] = players_df["specific_position"].map(generic_position_map)

# Body type with its height band, e.g. "Lean (185+)" -> the bare build name.
# "Unique" carries no height band and maps to itself.
body_type_map = {
    f"{build} ({height_band})": build
    for build in ("Normal", "Lean", "Stocky")
    for height_band in ("170-", "170-185", "185+")
}
body_type_map["Unique"] = "Unique"

# Replace body types with the simplified categories; a value missing from
# the mapping becomes NaN.
players_df["body_type"] = players_df["body_type"].map(body_type_map)

# Print the unique values of every string column after simplification.
# Only the last three banners start with a blank line.
simplified_sections = (
    ("-----UNIQUE SPECIFIC POSITION VALUES-----", "specific_position"),
    ("-----UNIQUE GENERIC POSITION VALUES-----", "generic_position"),
    ("\n-----UNIQUE PREFERRED FOOT VALUES-----", "preferred_foot"),
    ("\n-----UNIQUE WORK RATE VALUES-----", "work_rate"),
    ("\n-----UNIQUE BODY TYPE VALUES-----", "body_type"),
)
for section_banner, section_column in simplified_sections:
    print(section_banner)
    print(players_df[section_column].unique())

-----UNIQUE SPECIFIC POSITION VALUES-----
['ST' 'LW' 'LM' 'RM' 'CB' 'CM' 'RW' 'RB' 'LB']
-----UNIQUE GENERIC POSITION VALUES-----
['attacker' 'midfielder' 'defender']

-----UNIQUE PREFERRED FOOT VALUES-----
['Left' 'Right']

-----UNIQUE WORK RATE VALUES-----
['Medium/Low' 'High/Low' 'High/Medium' 'High/High' 'Medium/High'
 'Medium/Medium' 'Low/Low' 'Low/High' 'Low/Medium']

-----UNIQUE BODY TYPE VALUES-----
['Normal' 'Lean' 'Stocky' 'Unique']


## One Hot Encode Categorical Variables

In [73]:
# Expand each categorical column into one indicator column per category.
# drop_first=False keeps every category, so no level is left implicit.
categorical_columns = ["preferred_foot", "work_rate", "body_type"]
players_df = pd.get_dummies(
    players_df,
    columns=categorical_columns,
    drop_first=False,
)

# Preview the first rows of the encoded frame.
players_df.head()

Unnamed: 0,short_name,overall,value_eur,wage_eur,age,height_cm,weight_kg,specific_position,weak_foot,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,generic_position,preferred_foot_Left,preferred_foot_Right,work_rate_High/High,work_rate_High/Low,work_rate_High/Medium,work_rate_Low/High,work_rate_Low/Low,work_rate_Low/Medium,work_rate_Medium/High,work_rate_Medium/Low,work_rate_Medium/Medium,body_type_Lean,body_type_Normal,body_type_Stocky,body_type_Unique
0,L. Messi,93,100500000.0,550000.0,27,169,67,ST,3,4,93.0,89.0,86.0,96.0,27.0,63.0,84,94,71,89,85,96,89,90,76,96,96,90,94,94,95,80,73,77,60,88,48,22,92,90,76,25,21,20,attacker,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False
1,Cristiano Ronaldo,92,79000000.0,375000.0,29,185,80,LW,4,5,93.0,93.0,81.0,91.0,32.0,79.0,83,95,86,82,87,93,88,79,72,92,91,94,93,90,63,94,94,89,79,93,63,24,91,81,85,22,31,23,attacker,False,True,False,True,False,False,False,False,False,False,False,False,True,False,False
3,Z. Ibrahimović,90,52500000.0,275000.0,32,195,95,ST,4,4,76.0,91.0,81.0,86.0,34.0,86.0,76,91,76,84,92,88,80,80,76,90,74,77,86,85,41,93,72,78,93,88,84,20,86,83,91,25,41,27,attacker,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False
5,Iniesta,89,36000000.0,250000.0,30,170,65,LM,4,4,75.0,72.0,89.0,91.0,59.0,63.0,85,73,54,93,74,92,80,70,89,94,76,75,83,90,86,65,54,78,59,75,58,68,87,93,71,57,57,56,midfielder,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False
7,R. van Persie,88,40500000.0,230000.0,30,187,71,ST,3,4,74.0,90.0,82.0,83.0,33.0,68.0,81,91,73,85,92,84,86,81,75,87,73,74,80,88,59,90,59,72,72,86,55,34,90,82,86,23,32,21,attacker,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False


## Encode Target Variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per target column. Refitting a single encoder on the second
# column would overwrite the classes learned for the first, so the integer
# codes of specific_position could not be decoded afterwards.
specific_position_encoder = LabelEncoder()
generic_position_encoder = LabelEncoder()

# Encode the specific_position column
players_df["encoded_specific_position"] = specific_position_encoder.fit_transform(
    players_df["specific_position"]
)

# Encode the generic_position column
players_df["encoded_generic_position"] = generic_position_encoder.fit_transform(
    players_df["generic_position"]
)

# Backward-compatible alias: this name previously ended up fitted on
# generic_position, so it keeps pointing at that encoder.
label_encoder = generic_position_encoder

In [None]:
# Show each position label next to its integer code for the first ten rows.
position_preview_columns = [
    "specific_position",
    "encoded_specific_position",
    "generic_position",
    "encoded_generic_position",
]
players_df[position_preview_columns].head(10)

Unnamed: 0,specific_position,encoded_specific_position,generic_position,encoded_generic_position
0,ST,8,attacker,0
1,LW,4,attacker,0
3,ST,8,attacker,0
5,LM,3,midfielder,2
7,ST,8,attacker,0
11,LM,3,midfielder,2
12,RM,6,midfielder,2
13,CB,0,defender,1
14,CB,0,defender,1
15,CM,1,midfielder,2


## Save Processed Dataframes

In [None]:
# Write the two processed datasets. Each step drops one text label column in
# place and then saves, so the second file lacks both text label columns.
# NOTE(review): encoded_specific_position and encoded_generic_position are
# still present in both files - confirm the unused one is excluded before
# training, otherwise it leaks the target.
# NOTE(review): the drops mutate players_df, so running this cell a second
# time raises KeyError because the columns are already gone.
processed_outputs = (
    ("generic_position", "../data/processed/specific_position.csv"),
    ("specific_position", "../data/processed/generic_position.csv"),
)
for label_to_drop, output_path in processed_outputs:
    players_df.drop(columns=[label_to_drop], inplace=True)
    players_df.to_csv(output_path, index=False)