# Exploratory Data Analysis

In [1]:
import pandas as pd

In [2]:
# Load the dataset from 'database.csv' (path relative to the working directory)
# and preview the first rows.
df = pd.read_csv('database.csv')
df.head()

Unnamed: 0,Player,Team,#,Nation,Position,Age,Minutes,Goals,Assists,Penalty Shoot on Goal,...,Goal-Creating Actions,Passes Completed,Passes Attempted,Pass Completion %,Progressive Passes,Carries,Progressive Carries,Dribble Attempts,Successful Dribbles,Date
0,Bruno Fernandes,Manchester United,8,POR,"FW,AM",29-343,90,0,0,0,...,0,43,60,717,6,50,7,1,1,2024-08-16
1,Marcus Rashford,Manchester United,10,ENG,LW,26-290,90,0,0,0,...,0,22,30,733,2,34,2,6,0,2024-08-16
2,Amad Diallo,Manchester United,16,CIV,RW,22-036,60,0,0,0,...,0,28,33,848,1,22,0,1,1,2024-08-16
3,Alejandro Garnacho,Manchester United,17,ARG,RW,20-046,30,0,1,0,...,1,10,11,909,0,11,2,2,0,2024-08-16
4,Mason Mount,Manchester United,7,ENG,AM,25-219,60,0,0,0,...,0,11,15,733,1,16,0,2,0,2024-08-16


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(3349, 33)

In [4]:
# Per-column non-null counts and dtypes; used to spot missing values and mistyped columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3349 entries, 0 to 3348
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Player                  3349 non-null   object 
 1   Team                    3349 non-null   object 
 2   #                       3349 non-null   int64  
 3   Nation                  3349 non-null   object 
 4   Position                3349 non-null   object 
 5   Age                     3349 non-null   object 
 6   Minutes                 3349 non-null   int64  
 7   Goals                   3349 non-null   int64  
 8   Assists                 3349 non-null   int64  
 9   Penalty Shoot on Goal   3349 non-null   int64  
 10  Penalty Shoot           3349 non-null   int64  
 11  Total Shoot             3349 non-null   int64  
 12  Shoot on Target         3349 non-null   int64  
 13  Yellow Cards            3349 non-null   int64  
 14  Red Cards               3349 non-null   

- `26  Pass Completion %       3311 non-null   object`
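  - `Pass Completion %` has 38 missing values (3311 of 3349 non-null) that need to be handled, and its `object` dtype means it must also be converted to a numeric type.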

In [5]:
# Split Age into two numeric columns, AgeYears and AgeDays.
# Original format is "YY-DDD" (e.g. "29-343"); parts that cannot be parsed
# become NaN because of errors='coerce'.
#
# The work is guarded on 'Age' still being present so that re-running this
# cell after the column has been dropped is a no-op instead of a KeyError.
if 'Age' in df.columns:
    age_parts = df['Age'].str.split('-', expand=True)

    df['AgeYears'] = pd.to_numeric(age_parts[0], errors='coerce')
    df['AgeDays'] = pd.to_numeric(age_parts[1], errors='coerce')

    # Rebind instead of inplace=True so the statement stays chain-friendly.
    df = df.drop(columns='Age')

df[['AgeYears', 'AgeDays']].head()

Unnamed: 0,AgeYears,AgeDays
0,29,343
1,26,290
2,22,36
3,20,46
4,25,219


In [6]:
# Cleaning Pass Completion %:
# swap ',' for '.' in the text values, then parse them as numbers
# (anything unparseable becomes NaN via errors='coerce').
pass_pct_text = df['Pass Completion %'].str.replace(',', '.', regex=False)
df['Pass Completion %'] = pd.to_numeric(pass_pct_text, errors='coerce')

# Frequency of each distinct percentage after conversion.
df['Pass Completion %'].value_counts()

100.0    199
80.0     105
75.0     104
50.0      99
66.7      97
        ... 
95.1       1
90.8       1
52.8       1
97.6       1
57.8       1
Name: Pass Completion %, Length: 386, dtype: int64

### Handling Null Values

In [7]:
# Count missing values per column.
df.isnull().sum()

Player                     0
Team                       0
#                          0
Nation                     0
Position                   0
Minutes                    0
Goals                      0
Assists                    0
Penalty Shoot on Goal      0
Penalty Shoot              0
Total Shoot                0
Shoot on Target            0
Yellow Cards               0
Red Cards                  0
Touches                    0
Dribbles                   0
Tackles                    0
Blocks                     0
Expected Goals (xG)        0
Non-Penalty xG (npxG)      0
Expected Assists (xAG)     0
Shot-Creating Actions      0
Goal-Creating Actions      0
Passes Completed           0
Passes Attempted           0
Pass Completion %         38
Progressive Passes         0
Carries                    0
Progressive Carries        0
Dribble Attempts           0
Successful Dribbles        0
Date                       0
AgeYears                   0
AgeDays                    0
dtype: int64

In [8]:
# Pull out the rows whose Pass Completion % is missing for inspection.
# NOTE(review): in the previewed rows Passes Attempted is 0, which would make
# the percentage undefined rather than unknown — confirm for all such rows.
missing_pct_mask = df['Pass Completion %'].isna()
null_rows = df.loc[missing_pct_mask]
null_rows

Unnamed: 0,Player,Team,#,Nation,Position,Minutes,Goals,Assists,Penalty Shoot on Goal,Penalty Shoot,...,Passes Attempted,Pass Completion %,Progressive Passes,Carries,Progressive Carries,Dribble Attempts,Successful Dribbles,Date,AgeYears,AgeDays
24,Jay Stansfield,Fulham,28,ENG,"AM,DM",1,0,0,0,0,...,0,,0,0,0,0,0,2024-08-16,21,266
356,Beto,Everton,14,GNB,FW,19,0,0,0,0,...,0,,0,2,1,0,0,2024-08-24,26,206
417,Jean-Clair Todibo,West Ham United,25,FRA,CB,2,0,0,0,0,...,0,,0,0,0,0,0,2024-08-24,24,238
447,Ali Al Hamadi,Ipswich Town,16,IRQ,FW,8,0,0,0,0,...,0,,0,0,0,0,0,2024-08-24,22,176
514,Reiss Nelson,Arsenal,24,ENG,RW,3,0,0,0,0,...,0,,0,1,0,0,0,2024-08-24,24,258
527,Daniel Jebbison,Bournemouth,21,ENG,FW,6,0,0,0,0,...,0,,0,0,0,0,0,2024-08-25,21,12
538,Adam Smith,Bournemouth,15,ENG,RB,5,0,0,0,0,...,0,,0,0,0,0,0,2024-08-25,33,118
717,Taiwo Awoniyi,Nottingham Forest,9,NGA,DM,1,0,0,0,0,...,0,,0,0,0,0,0,2024-08-31,27,19
764,Harrison Reed,Fulham,6,ENG,DM,4,0,0,0,0,...,0,,0,0,0,0,0,2024-08-31,29,217
837,Miguel Almirón,Newcastle United,24,PAR,LW,1,0,0,0,0,...,0,,0,0,0,0,0,2024-09-01,30,204


In [None]:
# Median Pass Completion % per Position (a Series indexed by Position);
# impute_pass_completion uses it as its fallback value.
pass_pct_by_position = df.groupby('Position')['Pass Completion %']
position_medians = pass_pct_by_position.median()

# Row-wise imputer for missing Pass Completion %: player median first,
# position median as the fallback.
def impute_pass_completion(row, frame=None, fallback_medians=None):
    """Return the Pass Completion % for ``row``, imputing it when missing.

    A non-missing value is returned unchanged. A missing value is replaced by:

    1. the median of the same player's known Pass Completion % values in
       ``frame``; every row of that player is used — no Date filter is
       applied, so later matches contribute as well as earlier ones;
    2. otherwise the median for the row's Position from ``fallback_medians``;
    3. otherwise NaN (Position not present in ``fallback_medians``), so the
       value stays visibly missing instead of raising ``KeyError``.

    Parameters
    ----------
    row : pandas.Series
        One record with 'Player', 'Position' and 'Pass Completion %' entries.
    frame : pandas.DataFrame, optional
        Data to search for the player's other records. Defaults to the
        notebook-level ``df``.
    fallback_medians : mapping or pandas.Series, optional
        Position -> median Pass Completion %. Defaults to the notebook-level
        ``position_medians``.

    Returns
    -------
    float
        The original or imputed percentage (possibly NaN, see above).
    """
    current = row['Pass Completion %']
    if not pd.isnull(current):
        return current

    # Resolve defaults at call time so ``df.apply(impute_pass_completion, axis=1)``
    # keeps working exactly as before.
    if frame is None:
        frame = df
    if fallback_medians is None:
        fallback_medians = position_medians

    same_player = frame['Player'] == row['Player']
    known_values = frame.loc[same_player, 'Pass Completion %'].dropna()
    if not known_values.empty:
        return known_values.median()

    return fallback_medians.get(row['Position'], float('nan'))

# Run the row-wise imputer over the whole frame and store the result.
df['Pass Completion %'] = df.apply(impute_pass_completion, axis=1)

# How many values are still missing afterwards (0 means fully imputed).
final_missing_values = df['Pass Completion %'].isna().sum()

# First few rows of the relevant columns for a quick look.
cleaned_sample = df.loc[:, ['Player', 'Position', 'Pass Completion %']].head()

print("Missing Values After Final Imputation:", final_missing_values)
print(cleaned_sample)