Cleaning and Filtering the Master Data File

In [None]:
# Import libraries:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Graph output styling from matplotlib:
plt.style.use('fivethirtyeight')
# plt.style.use('dark_background')

In [71]:
# Global configuration shared by the cells below.

# Paths: the source data folder is resolved relative to the kernel's current
# working directory (normally the folder that holds this notebook).
repo_dir = os.getcwd()  # Directory of the notebook
source_data_dir = os.path.join(repo_dir, 'fbref-dw-merges')

# League id -> clean league name. The dict is written out once and the two
# parallel lists are derived from it (dicts preserve insertion order).
league_names_dict = {
    'ENG-Premier League': 'Premier League',
    'ESP-La Liga': 'La Liga',
    'FRA-Ligue 1': 'Ligue 1',
    'ITA-Serie A': 'Serie A',
}
league_ids = list(league_names_dict)
league_names = list(league_names_dict.values())

# Clubs flagged by the "Big Team" dummy variable:
big_teams = [
    'Arsenal',
    'Manchester City',
    'Manchester Utd',
    'Tottenham',
    'Chelsea',
    'Liverpool',
    'Real Madrid',
    'Barcelona',
    'Atlético Madrid',
    'Paris S-G',
    'Juventus',
    'Milan',
    'Inter',
]

# Duplicate playing time columns that get dropped from the master file:
playing_time_duplicates = [
    'Min.1',
    'Mn/MP',
    'Min%',
    '90s',
    'Starts.1',
    'Mn/Start',
    'Compl',
    'Subs',
    'Mn/Sub',
    'unSub',
]

In [72]:
# Function that writes the output data to a CSV file:
def make_csv(dir, df, file_name):
    """Write ``df`` to ``<dir>/<file_name>.csv``.

    Parameters
    ----------
    dir : str
        Folder the CSV is written into.
    df : pandas.DataFrame
        Frame to export; the index is written too (``to_csv`` default).
    file_name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    Whatever ``DataFrame.to_csv`` returns for a path target (``None``).
    """
    destination = os.path.join(dir, '{}.csv'.format(file_name))
    outcome = df.to_csv(destination)
    return outcome


# Function that removes unnamed columns:
def remove_unnamed_cols(df):
    """Return a copy of ``df`` without columns whose label contains "Unnamed".

    Only string labels are inspected. Non-string labels (for example the
    integer season labels produced by ``pd.get_dummies``) are always kept;
    the plain ``"Unnamed" in label`` test would raise ``TypeError`` on them.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified.
    """
    # Collect the labels to drop, skipping anything that is not a string:
    unnamed_cols = [
        col for col in df.columns
        if isinstance(col, str) and "Unnamed" in col
    ]

    return df.drop(columns=unnamed_cols)


# Function that removes YOB column so that age can be standardized later:
def remove_YOB_col(df):
    """Return a copy of ``df`` without the ``'YOB'`` column.

    Raises ``KeyError`` (from ``DataFrame.drop``) when the column is absent.
    """
    without_yob = df.drop('YOB', axis=1)
    return without_yob


# Function that removes unnecessary DW player/team ID columns:
def remove_dw_info(df):
    """Return a copy of ``df`` without the DW identifier columns.

    Drops ``'team_id'``, ``'player_id'`` and ``'league_id'``; raises
    ``KeyError`` (from ``DataFrame.drop``) if any of them is missing.
    """
    id_columns = ['team_id', 'player_id', 'league_id']
    return df.drop(id_columns, axis=1)


# Function that removes duplicate FBref playing time columns:
def remove_playing_time_cols(df):
    """Return a copy of ``df`` without the duplicate playing time columns.

    The labels to drop come from the module-level ``playing_time_duplicates``
    list; ``DataFrame.drop`` raises ``KeyError`` if any of them is missing.
    """
    trimmed = df.drop(playing_time_duplicates, axis=1)
    return trimmed


# Function that slices the master file based on an optimal playing time cutoff:
def playing_time_slice(df, cutoff : int = 8):
    """Keep the rows whose ``'90s_r'`` value is at least ``cutoff``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'90s_r'`` column.
    cutoff : int, default 8
        Minimum ``'90s_r'`` value (inclusive) for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index. Rows whose ``'90s_r'``
        is NaN are dropped because the comparison evaluates to False.
    """
    # drop=True discards the old index outright. The earlier
    # reset_index().drop(columns='index') round-trip deleted a real data
    # column called 'index' and raised KeyError when the index had a name.
    return df[df['90s_r'] >= cutoff].reset_index(drop=True)


# Function that adds the log(market value) column to the data:
def add_log_mkt_val_col(df):
    """Return a copy of ``df`` with a ``'log_mkt_val'`` column added.

    ``'log_mkt_val'`` is the natural log (``np.log``) of
    ``'market_value_in_eur'``. The input frame is left untouched, matching
    the ``remove_*`` helpers, so re-running a cell cannot silently alter a
    frame that an earlier cell produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'market_value_in_eur'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra column appended last.
    """
    # NOTE(review): zero or negative market values yield -inf / NaN here;
    # confirm the data has none before relying on this column.
    return df.assign(log_mkt_val=np.log(df['market_value_in_eur']))


# Function that adds the "Big Team" dummy variable to the data:
def add_big_team_col(df, teams=None):
    """Return a copy of ``df`` with a 0/1 ``'Big Team'`` dummy column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'team'`` column.
    teams : iterable of str, optional
        Team names that count as "big". Defaults to the module-level
        ``big_teams`` list, which preserves the original behaviour.

    Returns
    -------
    pandas.DataFrame
        New frame; ``'Big Team'`` is 1 where ``'team'`` is in ``teams`` and
        0 otherwise. The input frame is not modified.
    """
    if teams is None:
        teams = big_teams
    flag = df['team'].isin(teams).astype(int)
    # The column label contains a space, so it must go through a dict
    # rather than a plain keyword argument.
    return df.assign(**{'Big Team': flag})


# Function that adds a clean league name column based on league id:
def add_league_name_col(df, name_map=None):
    """Return a copy of ``df`` with a ``'league_name'`` column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'league'`` column holding league ids.
    name_map : dict, optional
        Maps league id to clean league name. Defaults to the module-level
        ``league_names_dict``, which preserves the original behaviour.

    Returns
    -------
    pandas.DataFrame
        New frame; league ids missing from ``name_map`` become NaN
        (``Series.map`` semantics). The input frame is not modified.
    """
    if name_map is None:
        name_map = league_names_dict
    return df.assign(league_name=df['league'].map(name_map))

In [73]:
# Read the master CSV from the source data folder, then take a deep copy to
# work on so the frame as loaded stays available for comparison:
master_file_path = os.path.join(source_data_dir, 'master_file.csv')
master_df = pd.read_csv(master_file_path)
master_df_copy = master_df.copy(deep=True)
# master_df_copy

Analyze matches played, 90s, and minutes played to select a cutoff point:

In [None]:
# Draw one histogram (with KDE overlay) per playing time variable and mark the
# candidate cutoff for that variable with a dashed red vertical line.
playing_time_cols = ['Starts', 'Min', '90s_r']

# Cutoff line position per column; any column not listed falls back to 10
# (which is what 'Starts' uses).
cutoff_line_by_col = {'Min': 720, '90s_r': 8}

for col in playing_time_cols:
    plt.figure(figsize=(9,5))
    sns.histplot(data=master_df_copy, x=col, kde=True)
    plt.title(f'{col} KDE Plot')
    plt.axvline(x=cutoff_line_by_col.get(col, 10), color='r', linestyle='--')
    plt.show()

In [None]:
# Test the proposed playing time cutoff measures by reporting how many rows
# each one would discard, as a percentage of the rows in the master file.
playing_time_dict = {'Starts' : 10, 'Min' : 720, '90s_r' : 8} # use this to alter the playing time metric cutoffs if necessary

# Row total is taken from the frame itself rather than a hard-coded 7121, so
# the percentages stay correct if the master file is regenerated.
total_rows = master_df_copy.shape[0]

for key, val in playing_time_dict.items():

    # Slice the master file by the cutoff value (NaN rows fail the comparison
    # and therefore count as lost):
    cut_df = master_df_copy[master_df_copy[key] >= val]

    # Compute and print the data loss (as a % of the original row total):
    data_loss = ((total_rows - cut_df.shape[0]) / total_rows)*100
    print(f'\nData Loss from the {key} Column Cutoff: {data_loss:.2f}%')

### CONCLUSION: WILL USE THE 90s_r >= 8 cutoff since it's associated with the least amount of data loss ###

Test temporary cleaning/slicing measures:

In [74]:
# Apply the cleaning helpers in sequence; each step receives the previous
# step's output. playing_time_slice runs with its default cutoff.
cleaned_master_df = (
    master_df_copy
    .pipe(remove_unnamed_cols)
    .pipe(remove_YOB_col)
    .pipe(remove_dw_info)
    .pipe(remove_playing_time_cols)
    .pipe(playing_time_slice)
)
# cleaned_master_df

Add the necessary columns to the cleaned master file:

In [None]:
# cleaned_master_df = add_big_team_col(cleaned_master_df)
# cleaned_master_df = add_league_name_col(cleaned_master_df)
# cleaned_master_df = add_log_mkt_val_col(cleaned_master_df)
# cleaned_master_df

Fit Dummy Variables to Covariates, Other Columns:

In [75]:
# Iterate the get_dummies() process over the variables of interest:
# dummy_cols = ['league_name', 'team', 'season', 'nationality', 'position']
dummy_cols = ['team', 'season', 'nationality', 'position'] # removed the league name variable for PCA prep

# Initialize the list of dataframes for concatenation, starting with the clean master file:
dfs_to_concat = [cleaned_master_df]

for col in dummy_cols:

    # Create the dummy variable dataframe. dtype=int pins the indicators to
    # numeric 0/1; pandas >= 2.0 would otherwise emit booleans (True/False).
    # NOTE(review): no prefix is used, so the dummy columns are labelled by the
    # raw category values (e.g. integer season labels) — equal values across
    # two variables would produce duplicate column labels.
    dummy_df = pd.get_dummies(cleaned_master_df[col], drop_first=False, dtype=int)

    # Add it to the list of dataframes to concat:
    dfs_to_concat.append(dummy_df)

# Concatenate the dataframes side by side (original columns first, then the
# dummy blocks in dummy_cols order):
cleaned_master_with_dummies_df = pd.concat(dfs_to_concat, axis=1)
cleaned_master_with_dummies_df


Unnamed: 0,league,season,team,player_name,nationality,position,age,MP,Starts,Min,...,DF,"DF,FW","DF,MF",FW,"FW,DF","FW,MF",GK,MF,"MF,DF","MF,FW"
0,ENG-Premier League,1718,Liverpool,Alex Oxlade-Chamberlain,ENG,"MF,FW",23.0,32.0,14.0,1493.0,...,0,0,0,0,0,0,0,0,0,1
1,ENG-Premier League,1718,Arsenal,Granit Xhaka,SUI,MF,24.0,38.0,37.0,3260.0,...,0,0,0,0,0,0,0,1,0,0
2,ENG-Premier League,1718,Arsenal,Mohamed Elneny,EGY,MF,25.0,13.0,11.0,867.0,...,0,0,0,0,0,0,0,1,0,0
3,ENG-Premier League,1718,Arsenal,Rob Holding,ENG,DF,21.0,12.0,9.0,822.0,...,1,0,0,0,0,0,0,0,0,0
4,ENG-Premier League,1718,Bournemouth,Adam Smith,ENG,"DF,MF",26.0,27.0,22.0,2071.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4910,ITA-Serie A,2223,Udinese,Nehuén Pérez,ARG,DF,22.0,34.0,33.0,2818.0,...,1,0,0,0,0,0,0,0,0,0
4911,ITA-Serie A,2223,Udinese,Roberto Pereyra,ARG,"DF,MF",31.0,34.0,33.0,2820.0,...,0,0,1,0,0,0,0,0,0,0
4912,ITA-Serie A,2223,Udinese,Rodrigo Becão,BRA,DF,26.0,28.0,28.0,2484.0,...,1,0,0,0,0,0,0,0,0,0
4913,ITA-Serie A,2223,Udinese,Tolgay Arslan,GER,MF,31.0,36.0,12.0,1211.0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Determine number of dummy variable categories for dimension verification:
# The three lists below are hard-coded literals (they look like pasted output of
# the .unique() calls shown in the comments — confirm), kept so their lengths
# can be compared with the number of dummy columns created per variable.

# cleaned_master_df['nationality'].unique()

# NOTE(review): this list contains the string 'nan' — presumably the missing
# nationality value written as text; verify against the data.
nationality_list = ['ENG', 'SUI', 'EGY', 'ISR', 'ITA', 'CMR', 'GER', 'IRL', 'SCO',
       'WAL', 'ESP', 'FRA', 'BEL', 'GHA', 'ARG', 'CIV', 'SEN', 'USA',
       'COD', 'AUT', 'NGA', 'ALG', 'JAM', 'BRA', 'NED', 'POR', 'ECU',
       'SWE', 'SVK', 'COL', 'KEN', 'PER', 'URU', 'NIR', 'NZL', 'POL',
       'ROU', 'CAN', 'GUA', 'PHI', 'SRB', 'DEN', 'GUI', 'PAR', 'NOR',
       'MEX', 'ZIM', 'CUB', 'FIN', 'GRE', 'MLI', 'UKR', 'BFA', 'MAR',
       'ANG', 'SKN', 'GAB', 'JPN', 'GRN', 'ZAM', 'VEN', 'AUS', 'CRC',
       'SVN', 'CHI', 'TOG', 'RUS', 'CPV', 'MKD', 'CRO', 'CTA', 'TUR',
       'HON', 'EQG', 'nan', 'MOZ', 'ARM', 'ALB', 'KVX', 'MNE', 'GEO', 'GNB',
       'MTN', 'BEN', 'CGO', 'HAI', 'TUN', 'RSA', 'LUX', 'MTQ', 'GLP',
       'COM', 'NCL', 'NIG', 'TRI', 'GUF', 'MAD', 'LBY', 'CZE', 'HUN',
       'EST', 'GAM', 'BIH', 'CYP', 'BUL', 'SUR', 'SLE', 'UZB']
# len(nationality_list)
# this length was equal to 108

# cleaned_master_df['team'].unique()

team_list = ['Liverpool', 'Arsenal', 'Bournemouth', 'Brighton', 'Burnley',
       'Everton', 'Chelsea', 'Crystal Palace', 'Huddersfield',
       'Leicester City', 'Southampton', 'Manchester City',
       'Manchester Utd', 'Newcastle Utd', 'Stoke City', 'Swansea City',
       'Tottenham', 'Watford', 'West Brom', 'West Ham', 'Cardiff City',
       'Fulham', 'Wolves', 'Aston Villa', 'Norwich City', 'Sheffield Utd',
       'Leeds United', 'Brentford', "Nott'ham Forest", 'Alavés',
       'Athletic Club', 'Real Sociedad', 'Leganés', 'Atlético Madrid',
       'Barcelona', 'Betis', 'Celta Vigo', 'Eibar', 'Espanyol', 'Getafe',
       'Málaga', 'Girona', 'La Coruña', 'Las Palmas', 'Levante',
       'Real Madrid', 'Sevilla', 'Valencia', 'Villarreal', 'Huesca',
       'Rayo Vallecano', 'Valladolid', 'Osasuna', 'Granada', 'Mallorca',
       'Cádiz', 'Elche', 'Almería', 'Amiens', 'Angers', 'Bordeaux',
       'Caen', 'Dijon', 'Guingamp', 'Lille', 'Lyon', 'Marseille', 'Metz',
       'Monaco', 'Paris S-G', 'Montpellier', 'Nantes', 'Nice', 'Rennes',
       'Saint-Étienne', 'Strasbourg', 'Toulouse', 'Troyes', 'Nîmes',
       'Reims', 'Brest', 'Lens', 'Lorient', 'Clermont Foot', 'Ajaccio',
       'Auxerre', 'Atalanta', 'Benevento', 'Sampdoria', 'Bologna',
       'Cagliari', 'Crotone', 'Chievo', 'Fiorentina', 'Sassuolo', 'Genoa',
       'Hellas Verona', 'Torino', 'Inter', 'Juventus', 'Lazio', 'Milan',
       'Napoli', 'Roma', 'SPAL', 'Udinese', 'Frosinone', 'Empoli',
       'Parma', 'Lecce', 'Brescia', 'Spezia', 'Salernitana', 'Venezia',
       'Cremonese', 'Monza']
# len(team_list)
# this length was equal to 116

# NOTE(review): unlike the two .unique() calls above, the next line is not
# commented out. It is not the last expression of the cell, so its result is
# computed and then discarded without being displayed.
cleaned_master_df['position'].unique()
all_positions = ['MF,FW', 'MF', 'DF', 'DF,MF', 'FW', 'FW,MF', 'GK', 'MF,DF',
       'FW,DF', 'DF,FW']
# len(all_positions)
# this length was equal to 10

In [77]:
# Remove every row that has a NaN in any column, then renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly; the previous
# reset_index().drop(columns='index') round-trip would delete a genuine data
# column if one were ever labelled 'index' (dummy columns are labelled by raw
# category values).
final_master_df = cleaned_master_with_dummies_df.dropna().reset_index(drop=True)
final_master_df

Unnamed: 0,league,season,team,player_name,nationality,position,age,MP,Starts,Min,...,DF,"DF,FW","DF,MF",FW,"FW,DF","FW,MF",GK,MF,"MF,DF","MF,FW"
0,ENG-Premier League,1718,Liverpool,Alex Oxlade-Chamberlain,ENG,"MF,FW",23.0,32.0,14.0,1493.0,...,0,0,0,0,0,0,0,0,0,1
1,ENG-Premier League,1718,Arsenal,Granit Xhaka,SUI,MF,24.0,38.0,37.0,3260.0,...,0,0,0,0,0,0,0,1,0,0
2,ENG-Premier League,1718,Arsenal,Mohamed Elneny,EGY,MF,25.0,13.0,11.0,867.0,...,0,0,0,0,0,0,0,1,0,0
3,ENG-Premier League,1718,Bournemouth,Adam Smith,ENG,"DF,MF",26.0,27.0,22.0,2071.0,...,0,0,1,0,0,0,0,0,0,0
4,ENG-Premier League,1718,Bournemouth,Andrew Surman,ENG,MF,30.0,25.0,20.0,1856.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4137,ITA-Serie A,2223,Udinese,Nehuén Pérez,ARG,DF,22.0,34.0,33.0,2818.0,...,1,0,0,0,0,0,0,0,0,0
4138,ITA-Serie A,2223,Udinese,Roberto Pereyra,ARG,"DF,MF",31.0,34.0,33.0,2820.0,...,0,0,1,0,0,0,0,0,0,0
4139,ITA-Serie A,2223,Udinese,Rodrigo Becão,BRA,DF,26.0,28.0,28.0,2484.0,...,1,0,0,0,0,0,0,0,0,0
4140,ITA-Serie A,2223,Udinese,Tolgay Arslan,GER,MF,31.0,36.0,12.0,1211.0,...,0,0,0,0,0,0,0,1,0,0


Standardize Quantitative Data for the Principal Component Analysis (PCA):

In [78]:
# Print each column's position, label and dtype so the numeric block that needs
# standardizing can be located by position.
# Iterating over .dtypes (rather than final_master_df[column].dtype) also works
# when two columns share a label, which un-prefixed dummy columns can cause.
print('Index')
for index, (column, dtype) in enumerate(final_master_df.dtypes.items()):
    print(f"{index}\t{column}: {dtype}")

# NOTE: text columns such as player name and league show up as 'object' because
# that is the dtype pandas uses by default for columns of Python strings.

# Column ranges used for standardization (positions are inclusive here; the
# matching Python slices are given in brackets):
#   columns 0-5    identifiers / categoricals, not standardized   [:6]
#   columns 6-145  quantitative columns, standardized             [6:146]
#   columns 146+   dummy variables, not standardized              [146:]

Index
0	league: object
1	season: int64
2	team: object
3	player_name: object
4	nationality: object
5	position: object
6	age: float64
7	MP: float64
8	Starts: float64
9	Min: float64
10	90s_r: float64
11	Gls: float64
12	Ast: float64
13	G+A: float64
14	G-PK: float64
15	PK: float64
16	PKatt: float64
17	CrdY: float64
18	CrdR: float64
19	xG: float64
20	npxG: float64
21	xAG: float64
22	npxG+xAG: float64
23	PrgC: float64
24	PrgP: float64
25	PrgR: float64
26	Gls.1: float64
27	Ast.1: float64
28	G+A.1: float64
29	G-PK.1: float64
30	G+A-PK: float64
31	xG.1: float64
32	xAG.1: float64
33	xG+xAG: float64
34	npxG.1: float64
35	npxG+xAG.1: float64
36	Gls.2: float64
37	Sh: float64
38	SoT: float64
39	SoT%: float64
40	Sh/90: float64
41	SoT/90: float64
42	G/Sh: float64
43	G/SoT: float64
44	Dist: float64
45	FK: float64
46	PK.1: float64
47	PKatt.1: float64
48	xG.2: float64
49	npxG.2: float64
50	npxG/Sh: float64
51	G-xG: float64
52	np:G-xG: float64
53	Cmp: float64
54	Att: float64
55	Cmp%: float64
56	TotDist: fl

In [80]:
# Standardize the quantitative columns with scikit-learn's StandardScaler and
# re-attach the dummy columns, which are left as they are.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Columns at positions 6..145 are the ones that get scaled:
quant_block = final_master_df.iloc[:, 6:146]
data_to_scale = quant_block.values

# NOTE(review): the scaler is fit on every row here, before the train/test
# split done in a later cell, so test rows contribute to the scaling
# statistics — consider fitting on the training rows only.
scaled_data = scaler.fit_transform(data_to_scale)

# Wrap the scaled array back into a DataFrame carrying the original labels:
scaled_df_only = pd.DataFrame(scaled_data, columns=quant_block.columns)

# Columns from position 146 onward are carried over unscaled. The first six
# columns are deliberately left out of the PCA input because they hold strings.
right_df = final_master_df.iloc[:, 146:]

# Dimension check (optional): both pieces must have the same number of rows
# before they are placed side by side.
for index, df in enumerate((scaled_df_only, right_df)):
    print(f'Row Count of DF {index}: {len(df)}\n')

# Join the scaled block and the untouched block column-wise:
master_df_scaled = pd.concat([scaled_df_only, right_df], axis=1)
master_df_scaled

# Send standardized master file to a CSV in the source directory:
# make_csv(source_data_dir, master_df_scaled, 'master_file_standardized')

Row Count of DF 0: 4142

Row Count of DF 1: 4142



Unnamed: 0,age,MP,Starts,Min,90s_r,Gls,Ast,G+A,G-PK,PK,...,DF,"DF,FW","DF,MF",FW,"FW,DF","FW,MF",GK,MF,"MF,DF","MF,FW"
0,-0.834318,0.703365,-0.898098,-0.588397,-0.586948,0.023752,2.139161,0.915099,0.118197,-0.297491,...,0,0,0,0,0,0,0,0,0,1
1,-0.590853,1.565937,1.844768,1.913962,1.911313,-0.465708,2.139161,0.555757,-0.455407,-0.297491,...,0,0,0,0,0,0,0,1,0,0
2,-0.347388,-2.028113,-1.255864,-1.474915,-1.479184,-0.710438,-0.429817,-0.701940,-0.742208,-0.297491,...,0,0,0,0,0,0,0,1,0,0
3,-0.103922,-0.015445,0.055942,0.230145,0.228811,-0.465708,0.426509,-0.162927,-0.455407,-0.297491,...,0,0,1,0,0,0,0,0,0,0
4,0.869938,-0.302969,-0.182568,-0.074330,-0.077099,-0.220978,1.282835,0.376086,-0.168605,-0.297491,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4137,-1.077783,0.990889,1.367748,1.288018,1.286747,-0.220978,-0.857980,-0.522269,-0.168605,-0.297491,...,1,0,0,0,0,0,0,0,0,0
4138,1.113404,0.990889,1.367748,1.290850,1.286747,0.513212,2.139161,1.274441,0.691800,-0.297491,...,0,0,1,0,0,0,0,0,0,0
4139,-0.103922,0.128317,0.771473,0.815020,0.815137,-0.220978,-0.429817,-0.342598,-0.168605,-0.297491,...,1,0,0,0,0,0,0,0,0,0
4140,1.113404,1.278413,-1.136609,-0.987755,-0.982081,-0.465708,-0.857980,-0.701940,-0.455407,-0.297491,...,0,0,0,0,0,0,0,1,0,0


PCA Experimenting Below:

In [81]:
# Split the standardized dataframe into training and testing sets.
from sklearn.model_selection import train_test_split

# Target: the 'market_value_in_eur' column (not its natural log).
# NOTE(review): the column is read from master_df_scaled, so it appears to be
# the standardized market value rather than raw euros — confirm.
target_col = 'market_value_in_eur'
y = master_df_scaled[target_col]

# Features: every other column, with all labels converted to strings so the
# PCA fit receives uniformly typed column names.
X = master_df_scaled.drop(columns=target_col)
X.columns = X.columns.map(str)

# 80/20 split; random_state is the seed that makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [82]:
# Fit a PCA on the training features only.
from sklearn.decomposition import PCA

# A fractional n_components asks for the minimum number of components such
# that 95% of the variance is retained; fit() returns the fitted estimator.
pca = PCA(n_components=0.95).fit(X_train)

# Share of variance explained by each retained component:
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

Explained Variance Ratio: [0.26911844 0.22467705 0.06804154 0.05274036 0.04017558 0.02570434
 0.02454625 0.02168879 0.01859857 0.01549081 0.01329503 0.01081347
 0.01037567 0.00998143 0.00882944 0.00850612 0.00821765 0.00762409
 0.00710376 0.00672283 0.0064417  0.00610407 0.00595329 0.00575763
 0.00537502 0.0049814  0.00475572 0.0045596  0.00431144 0.0041518
 0.00400711 0.00365517 0.00345573 0.00339715 0.00320752 0.00301366
 0.00294131 0.00285635 0.00262418 0.00244297 0.00241054 0.00226815
 0.00219173 0.00207067 0.0019299  0.00185286 0.00181786]


In [84]:
# Project both feature sets onto the components learned from the training data:
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)
# X_train_pca

In [85]:
# Regress the target on the training principal components with OLS.
import statsmodels.api as sm

# Add an intercept column to both component matrices (train and test):
X_train_pca_with_constant, X_test_pca_with_constant = (
    sm.add_constant(components) for components in (X_train_pca, X_test_pca)
)

# Build and fit the model: training target against training components.
train_ols_model = sm.OLS(endog=y_train, exog=X_train_pca_with_constant)
train_ols_results = train_ols_model.fit()

# Regression output:
print(train_ols_results.summary())

                             OLS Regression Results                            
Dep. Variable:     market_value_in_eur   R-squared:                       0.597
Model:                             OLS   Adj. R-squared:                  0.591
Method:                  Least Squares   F-statistic:                     102.7
Date:                 Mon, 15 Apr 2024   Prob (F-statistic):               0.00
Time:                         19:30:46   Log-Likelihood:                -3130.6
No. Observations:                 3313   AIC:                             6357.
Df Residuals:                     3265   BIC:                             6650.
Df Model:                           47                                         
Covariance Type:             nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0020      0.011     -0.18

In [88]:
# Score the fitted OLS model on the held-out test set.
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predictions for the test rows (the model itself was fit on the training rows):
ols_test_pred = train_ols_results.predict(X_test_pca_with_constant)

# Error metrics; RMSE is the square root of MSE.
ols_mae = mean_absolute_error(y_test, ols_test_pred)
ols_mse = mean_squared_error(y_test, ols_test_pred)
ols_rmse = np.sqrt(ols_mse)
ols_errors = [ols_mae, ols_mse, ols_rmse]

for name, var in zip(('MAE', 'MSE', 'RMSE'), ols_errors):
    print(f'\nOLS with PCA Component Predictors - {name}: {var:.4f}')



OLS with PCA Component Predictors - MAE: 0.4415

OLS with PCA Component Predictors - MSE: 0.4685

OLS with PCA Component Predictors - RMSE: 0.6844
