Data Preprocessing

Import the libraries

In [163]:
import pandas as pd

Load the dataset

In [164]:
df = pd.read_csv("seria.csv")
print(df.head())

            Player   Team  Shirt Number Nation Position     Age  Minutes  \
0          Vitinha  Genoa             9    POR       FW  24-155       90   
1   Junior Messias  Genoa            10    BRA    FW,RW  33-096       90   
2  Morten Frendrup  Genoa            32    DEN       CM  23-132       90   
3     Milan Badelj  Genoa            47    CRO       CM  35-174       85   
4     Jeff Ekhator  Genoa            21    ITA       LW  17-280        5   

   Goals  Assists  Penalty Shoot on Goal  ...  Goal-Creating Actions  \
0      0        0                      0  ...                      0   
1      1        0                      0  ...                      1   
2      0        0                      0  ...                      0   
3      0        0                      0  ...                      0   
4      0        0                      0  ...                      0   

   Passes Completed  Passes Attempted  Pass Completion %  Progressive Passes  \
0                 8           

Replace commas with periods and convert to float in "Pass Completion %" column

In [165]:
df["Pass Completion %"] = df["Pass Completion %"].str.replace(",",".",regex=True).astype(float)
print(df.head())

            Player   Team  Shirt Number Nation Position     Age  Minutes  \
0          Vitinha  Genoa             9    POR       FW  24-155       90   
1   Junior Messias  Genoa            10    BRA    FW,RW  33-096       90   
2  Morten Frendrup  Genoa            32    DEN       CM  23-132       90   
3     Milan Badelj  Genoa            47    CRO       CM  35-174       85   
4     Jeff Ekhator  Genoa            21    ITA       LW  17-280        5   

   Goals  Assists  Penalty Shoot on Goal  ...  Goal-Creating Actions  \
0      0        0                      0  ...                      0   
1      1        0                      0  ...                      1   
2      0        0                      0  ...                      0   
3      0        0                      0  ...                      0   
4      0        0                      0  ...                      0   

   Passes Completed  Passes Attempted  Pass Completion %  Progressive Passes  \
0                 8           

Replace commas with and in "Postion" column

In [166]:
df["Position"] = df["Position"].str.replace(","," and ",regex=True)
print(df.head())

            Player   Team  Shirt Number Nation   Position     Age  Minutes  \
0          Vitinha  Genoa             9    POR         FW  24-155       90   
1   Junior Messias  Genoa            10    BRA  FW and RW  33-096       90   
2  Morten Frendrup  Genoa            32    DEN         CM  23-132       90   
3     Milan Badelj  Genoa            47    CRO         CM  35-174       85   
4     Jeff Ekhator  Genoa            21    ITA         LW  17-280        5   

   Goals  Assists  Penalty Shoot on Goal  ...  Goal-Creating Actions  \
0      0        0                      0  ...                      0   
1      1        0                      0  ...                      1   
2      0        0                      0  ...                      0   
3      0        0                      0  ...                      0   
4      0        0                      0  ...                      0   

   Passes Completed  Passes Attempted  Pass Completion %  Progressive Passes  \
0                 

Fill missing values in "Pass Completion %" with the mean

In [167]:
df['Pass Completion %'] = df['Pass Completion %'].fillna(df['Pass Completion %'].mean())
print(df.head())

            Player   Team  Shirt Number Nation   Position     Age  Minutes  \
0          Vitinha  Genoa             9    POR         FW  24-155       90   
1   Junior Messias  Genoa            10    BRA  FW and RW  33-096       90   
2  Morten Frendrup  Genoa            32    DEN         CM  23-132       90   
3     Milan Badelj  Genoa            47    CRO         CM  35-174       85   
4     Jeff Ekhator  Genoa            21    ITA         LW  17-280        5   

   Goals  Assists  Penalty Shoot on Goal  ...  Goal-Creating Actions  \
0      0        0                      0  ...                      0   
1      1        0                      0  ...                      1   
2      0        0                      0  ...                      0   
3      0        0                      0  ...                      0   
4      0        0                      0  ...                      0   

   Passes Completed  Passes Attempted  Pass Completion %  Progressive Passes  \
0                 

Clean the "Age" Column
The 'Age' column is in 'YY-DDD' format, so we extract the years and convert to integer

In [168]:
df['Age'] = df['Age'].apply(lambda x: int(x.split('-')[0]))
print(df.head())

            Player   Team  Shirt Number Nation   Position  Age  Minutes  \
0          Vitinha  Genoa             9    POR         FW   24       90   
1   Junior Messias  Genoa            10    BRA  FW and RW   33       90   
2  Morten Frendrup  Genoa            32    DEN         CM   23       90   
3     Milan Badelj  Genoa            47    CRO         CM   35       85   
4     Jeff Ekhator  Genoa            21    ITA         LW   17        5   

   Goals  Assists  Penalty Shoot on Goal  ...  Goal-Creating Actions  \
0      0        0                      0  ...                      0   
1      1        0                      0  ...                      1   
2      0        0                      0  ...                      0   
3      0        0                      0  ...                      0   
4      0        0                      0  ...                      0   

   Passes Completed  Passes Attempted  Pass Completion %  Progressive Passes  \
0                 8                1

Convert "Date" Column to datetime objects

In [169]:
df["Date"] = pd.to_datetime(df["Date"])
print(df.head())

            Player   Team  Shirt Number Nation   Position  Age  Minutes  \
0          Vitinha  Genoa             9    POR         FW   24       90   
1   Junior Messias  Genoa            10    BRA  FW and RW   33       90   
2  Morten Frendrup  Genoa            32    DEN         CM   23       90   
3     Milan Badelj  Genoa            47    CRO         CM   35       85   
4     Jeff Ekhator  Genoa            21    ITA         LW   17        5   

   Goals  Assists  Penalty Shoot on Goal  ...  Goal-Creating Actions  \
0      0        0                      0  ...                      0   
1      1        0                      0  ...                      1   
2      0        0                      0  ...                      0   
3      0        0                      0  ...                      0   
4      0        0                      0  ...                      0   

   Passes Completed  Passes Attempted  Pass Completion %  Progressive Passes  \
0                 8                1

Check for missing values

In [170]:
df_missing = df.isnull().sum()
print(df_missing)

Player                    0
Team                      0
Shirt Number              0
Nation                    0
Position                  0
Age                       0
Minutes                   0
Goals                     0
Assists                   0
Penalty Shoot on Goal     0
Penalty Shoot             0
Total Shoot               0
Shoot on Target           0
Yellow Cards              0
Red Cards                 0
Touches                   0
Dribbles                  0
Tackles                   0
Blocks                    0
Expected Goals (xG)       0
Non-Penalty xG (npxG)     0
Expected Assists (xAG)    0
Shot-Creating Actions     0
Goal-Creating Actions     0
Passes Completed          0
Passes Attempted          0
Pass Completion %         0
Progressive Passes        0
Carries                   0
Progressive Carries       0
Dribble Attempts          0
Successful Dribbles       0
Date                      0
dtype: int64


Check for duplicates 

In [171]:
df_duplicated = df.duplicated()
print(df_duplicated)

df_duplicates = df.drop_duplicates()
print(df_duplicates.head())

0       False
1       False
2       False
3       False
4       False
        ...  
3973    False
3974    False
3975    False
3976    False
3977    False
Length: 3978, dtype: bool
            Player   Team  Shirt Number Nation   Position  Age  Minutes  \
0          Vitinha  Genoa             9    POR         FW   24       90   
1   Junior Messias  Genoa            10    BRA  FW and RW   33       90   
2  Morten Frendrup  Genoa            32    DEN         CM   23       90   
3     Milan Badelj  Genoa            47    CRO         CM   35       85   
4     Jeff Ekhator  Genoa            21    ITA         LW   17        5   

   Goals  Assists  Penalty Shoot on Goal  ...  Goal-Creating Actions  \
0      0        0                      0  ...                      0   
1      1        0                      0  ...                      1   
2      0        0                      0  ...                      0   
3      0        0                      0  ...                      0   
4      0 

Rename columns for Clarity and Consistency

In [172]:
df.rename(columns={
    "Player":"player",
    "Team":"team",
    "Shirt Number":"shirt_number",
    "Nation":"nation",
    "Position":"position",
    "Age":"age",
    "Minutes":"minutes",
    "Goals":"goals",
    "Assists":"assists",
    "Penalty Shoot on Goal":"penalty_shoot_on_goal",
    "Penalty Shoot":"penalty_shoot",
    "Total Shoot":"total_shoot",
    "Shoot on Target ":"shoot_on_target",
    "Yellow Cards":"yellow_cards",
    "Red Cards":"red_cards",
    "Touches":"touches",
    "Dribbles":"dribbles",
    "Tackles":"tackles",
    "Blocks":"blocks",
    "Expected Goals (xG)":"expected_goals_xg",
    "Non-Penalty xG (npxG)":"non_penalty_xg_npxg",
    "Expected Assists (xAG)":"expected_assists_xag",
    "Shot-Creating Actions":"shot_creating_actions",
    "Goal-Creating Actions":"goal_creating_actions",
    "Passes Completed":"passes_completed",
    "Passes Attempted":"passes_attempted",
    "Pass Completion %":"pass_completed_%",
    "Progressive Passes":"progressive_passes",
    "Carries":"carries",
    "Progressive Carries":"progressive_carries",
    "Dribble Attempts":"dribble_attempts",
    "Successful Dribbles":"successful_dribbles",
    "Date":"date"
},inplace=True)
print(df.head())

            player   team  shirt_number nation   position  age  minutes  \
0          Vitinha  Genoa             9    POR         FW   24       90   
1   Junior Messias  Genoa            10    BRA  FW and RW   33       90   
2  Morten Frendrup  Genoa            32    DEN         CM   23       90   
3     Milan Badelj  Genoa            47    CRO         CM   35       85   
4     Jeff Ekhator  Genoa            21    ITA         LW   17        5   

   goals  assists  penalty_shoot_on_goal  ...  goal_creating_actions  \
0      0        0                      0  ...                      0   
1      1        0                      0  ...                      1   
2      0        0                      0  ...                      0   
3      0        0                      0  ...                      0   
4      0        0                      0  ...                      0   

   passes_completed  passes_attempted  pass_completed_%  progressive_passes  \
0                 8                13

Save the Cleaned data

In [173]:
df.to_csv("seria.cleaned.csv",index=False)
print("Saved Successfully")

Saved Successfully
