Cleaning and Filtering the Master Data File

In [11]:
# Import libraries:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Graph output styling: apply matplotlib's 'fivethirtyeight' style sheet to the
# figures created after this call:
plt.style.use('fivethirtyeight')

In [2]:
# Global variables:
# NOTE(review): os.getcwd() is the kernel's current working directory; it only
# equals the notebook's directory when the kernel is started there — confirm.
repo_dir = os.getcwd()
# Folder inside repo_dir that holds the source data (master_file.csv is read from it):
source_data_dir = os.path.join(repo_dir, 'fbref-dw-merges')

In [32]:
# Function that removes unnamed columns:
def remove_unnamed_cols(df):
    """Return a new DataFrame without the columns whose label contains "Unnamed".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``df`` except those whose string
        label contains the substring "Unnamed". Row order and index are kept.
    """
    # Only string labels can carry the "Unnamed" marker. Checking the type first
    # keeps non-string labels (e.g. integer column names) from raising a
    # TypeError in the substring test; such columns are simply kept.
    unnamed_cols = [
        col for col in df.columns
        if isinstance(col, str) and "Unnamed" in col
    ]

    # drop() returns a new frame, so the caller's df is left untouched:
    return df.drop(columns=unnamed_cols)


# Function that slices the master file based on an optimal playing time cutoff:
def playing_time_slice(df, cutoff : float = 8, column : str = '90s_r'):
    """Keep only the rows whose playing-time value reaches the cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; must contain ``column``.
    cutoff : float, default 8
        Minimum value (inclusive) a row needs in ``column`` to be kept.
    column : str, default '90s_r'
        Column the cutoff is compared against. The default keeps the original
        behavior; other playing-time columns can be passed instead.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` where ``df[column] >= cutoff``, with their original
        index labels.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    return df[df[column] >= cutoff]

In [4]:
# Load the master CSV from the source-data directory, then take an independent
# deep copy to work on so the frame read from disk is left untouched:
master_df = pd.read_csv(
    os.path.join(source_data_dir, 'master_file.csv')
)
master_df_copy = master_df.copy()  # DataFrame.copy() is deep by default
master_df_copy

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,league,season,team,player_name,nationality,position,age,YOB,...,On-Off,onxG,onxGA,xG+/-,xG+/-90,On-Off.1,team_id,player_id,league_id,market_value_in_eur
0,0,0,ENG-Premier League,1718,Arsenal,Alex Oxlade-Chamberlain,ENG,DF,23.0,1993.0,...,-1.86,4.2,4.6,-0.4,-0.14,-0.74,31,143424,GB1,3.500000e+07
1,1,1,ENG-Premier League,1718,Liverpool,Alex Oxlade-Chamberlain,ENG,"MF,FW",23.0,1993.0,...,0.21,30.7,14.9,15.8,0.95,-0.14,31,143424,GB1,3.500000e+07
2,2,2,ENG-Premier League,1718,Arsenal,Granit Xhaka,SUI,MF,24.0,1992.0,...,0.04,64.4,46.9,17.5,0.48,-1.20,11,111455,GB1,4.166667e+07
3,3,3,ENG-Premier League,1718,Arsenal,Mohamed Elneny,EGY,MF,25.0,1992.0,...,0.02,16.8,12.2,4.7,0.49,-0.07,11,160438,GB1,1.000000e+07
4,4,4,ENG-Premier League,1718,Arsenal,Per Mertesacker,GER,DF,32.0,1984.0,...,-0.13,7.3,5.1,2.1,0.53,-0.01,11,6710,GB1,1.750000e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7116,1758,1758,ITA-Serie A,2223,Udinese,Roberto Pereyra,ARG,"DF,MF",31.0,1991.0,...,-0.03,37.7,42.7,-5.0,-0.16,-0.18,410,112302,IT1,6.500000e+06
7117,1759,1759,ITA-Serie A,2223,Udinese,Rodrigo Becão,BRA,DF,26.0,1996.0,...,1.02,35.2,37.7,-2.5,-0.09,0.14,410,410158,IT1,1.000000e+07
7118,1760,1760,ITA-Serie A,2223,Udinese,Simone Pafundi,ITA,"MF,FW",16.0,2006.0,...,-1.06,1.3,1.7,-0.3,-0.37,-0.24,410,876400,IT1,1.933333e+06
7119,1761,1761,ITA-Serie A,2223,Udinese,Tolgay Arslan,GER,MF,31.0,1990.0,...,-0.42,16.7,17.7,-1.1,-0.08,0.07,410,53536,IT1,2.000000e+06


Analyze starts, minutes played, and 90s played to select a playing-time cutoff point:

In [None]:
# Plot the distribution of each playing-time column (histogram with a KDE
# overlay) and mark that column's candidate cutoff with a dashed red line:
candidate_cutoffs = {'Min': 720, '90s_r': 8}  # any other column is marked at 10
playing_time_cols = ['Starts', 'Min', '90s_r']
for col in playing_time_cols:
    fig, ax = plt.subplots(figsize=(9, 5))
    sns.histplot(data=master_df_copy, x=col, kde=True, ax=ax)
    ax.set_title(f'{col} KDE Plot')
    ax.axvline(x=candidate_cutoffs.get(col, 10), color='r', linestyle='--')
    plt.show()

In [28]:
# Test the proposed playing time cutoff measures.
# Each loss is the percentage of rows of master_df_copy that the cutoff removes.
# The total is taken from the frame itself rather than a hard-coded row count,
# so the percentages stay correct if the master file changes:
total_rows = master_df_copy.shape[0]

test1 = master_df_copy[master_df_copy['Starts'] >= 10]
test1_loss = ((total_rows - test1.shape[0]) / total_rows)*100
print(f'\nData Loss 1st from Cutoff: {test1_loss:.2f}%')

test2 = master_df_copy[master_df_copy['Min'] >= 720]
test2_loss = ((total_rows - test2.shape[0]) / total_rows)*100
print(f'\nData Loss 2nd from Cutoff: {test2_loss:.2f}%')

test3 = master_df_copy[master_df_copy['90s_r'] >= 8]
test3_loss = ((total_rows - test3.shape[0]) / total_rows)*100
# Label fixed: this is the third cutoff, it was previously printed as "2nd".
print(f'\nData Loss 3rd from Cutoff: {test3_loss:.2f}%')

### CONCLUSION: WILL USE THE 90s_r >= 8 cutoff since it's associated with the least amount of data loss ###


Data Loss 1st from Cutoff: 36.72%

Data Loss 2nd from Cutoff: 31.15%

Data Loss 2nd from Cutoff: 30.98%


Test temporary cleaning/slicing measures:

In [None]:
# Run the two cleaning steps back to back: drop the "Unnamed" columns first,
# then keep only the rows that pass the default playing-time cutoff:
cleaned_master_df = (
    master_df_copy
    .pipe(remove_unnamed_cols)
    .pipe(playing_time_slice)
)
# cleaned_master_df