Cleaning merged league data from FBref:

In [115]:
# Import libraries:
import pandas as pd
import os
import seaborn as so
import matplotlib.pyplot as plt

Global variables and functions:

In [116]:
# Global variables:

# Current working directory of the running kernel (assumed to be the repository root)
repo_dir = os.getcwd()

# Folder that holds the merged league workbooks
merged_data_dir = os.path.join(repo_dir, "Merged Data")

# League identifiers, used for league-level iteration
league_ids = [
    'ENG-Premier League',
    'ESP-La Liga',
    'FRA-Ligue 1',
    'GER-Bundesliga',
    'ITA-Serie A',
]

# Premier League workbook on its own, for testing on EPL data only
epl_file_path = os.path.join(merged_data_dir, "ENG-Premier League_full_merge.xlsx")

# Graph output styling from matplotlib:
# switch pyplot to the 'fivethirtyeight' style sheet before any figure is drawn.
plt.style.use('fivethirtyeight')

In [117]:
# Function that loads one merged-league workbook into pandas:
def import_merged_data(file_path):
    """Read the Excel file at ``file_path`` and return it as a DataFrame.

    ``pd.read_excel`` is called with its defaults only.
    """
    return pd.read_excel(file_path)


# Function that removes unnamed columns:
def remove_unnamed_cols(df):
    """Drop every "Unnamed" column and index the result by league.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'league' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without any column whose label contains "Unnamed",
        with the 'league' column moved into the index.

    Raises
    ------
    KeyError
        If ``df`` has no 'league' column.
    """
    # Column labels are not guaranteed to be strings (e.g. numeric header
    # cells). A bare `"Unnamed" in col` raises TypeError on those, so the
    # label is converted to text before the substring test.
    unnamed_cols = [col for col in df.columns if "Unnamed" in str(col)]

    # drop() and set_index() both return new frames, so the caller's
    # frame is left untouched.
    return df.drop(columns=unnamed_cols).set_index('league')


# Function that saves a dataframe as an Excel workbook:
def make_xl(path, df, file_name):
    """Write ``df`` to ``<path>/<file_name>.xlsx`` with the index included.

    Returns whatever ``df.to_excel`` returns.
    """
    workbook_name = f'{file_name}.xlsx'
    destination = os.path.join(path, workbook_name)

    # Original author's note: remove index=True if getting a permission error.
    return df.to_excel(destination, index=True)


# Function that builds a per-team statistics table for one position and one season:
def key_stats_table(df, stat_dict, pos, season):
    """Aggregate ``df`` per team after filtering on position and season.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'position', 'season' and 'team' columns plus every
        column named in ``stat_dict``.
    stat_dict : dict
        Maps a column name to the aggregation applied to it.
    pos : str
        Pattern handed to ``Series.str.contains`` on the 'position' column,
        so a row matches when its position text contains it.
    season : object
        Value the 'season' column must equal.

    Returns
    -------
    pandas.DataFrame
        One row per team, with 'team' as a regular column.
    """
    plays_position = df['position'].str.contains(pos)
    in_season = df['season'] == season

    selected_rows = df[plays_position & in_season]
    per_team = selected_rows.groupby('team').aggregate(stat_dict)

    return per_team.reset_index()
    

Import EPL data for preliminary cleaning and exploring:

In [None]:
# Import the EPL data for experimental cleaning:
# reads the workbook at epl_file_path into a dataframe via import_merged_data.
epl_df = import_merged_data(epl_file_path)

In [None]:
# Make a deep copy of the imported dataframe to work on
# (presumably so epl_df stays as imported and the workbook need not be re-read):
epl_df_copy = epl_df.copy(deep=True)

In [None]:
# Strip the unnamed columns (remove_unnamed_cols also indexes the frame by league),
# then discard only the rows whose 'position' is missing:
clean_epl_df_copy = (
    remove_unnamed_cols(epl_df_copy)
    .dropna(subset=['position'])
)

# Display the cleaned frame:
clean_epl_df_copy

In [None]:
# Get column names and number them by position.
# The numbering is derived from the actual column count: with a hard-coded
# range, zip() would silently cut the reference off after that many columns.
col_names = clean_epl_df_copy.columns
col_nums = list(range(len(col_names)))

# Create a {position: column name} dictionary and convert to a pandas DF
# (orient='index' makes each dictionary key one row):
col_names_dict = dict(zip(col_nums, col_names))
columns_df = pd.DataFrame.from_dict(col_names_dict, orient='index')

# Send to an XL doc for reference:
make_xl(path=repo_dir, df=columns_df, file_name='Merged Data Columns')

In [None]:
# Per-position summary: count of 'season' entries (the row total)
# plus the mean of 'age' and of 'MP'.
pos_agg_eply_df = (
    clean_epl_df_copy
    .groupby('position')
    .agg(row_total=('season', 'count'), age=('age', 'mean'), MP=('MP', 'mean'))
    .reset_index()
)

# Display the summary:
pos_agg_eply_df

In [None]:
# Per-season summary: count of 'position' entries (the row total)
# plus the mean of 'age' and of 'MP'.
season_agg_eply_df = (
    clean_epl_df_copy
    .groupby('season')
    .agg(row_total=('position', 'count'), age=('age', 'mean'), MP=('MP', 'mean'))
    .reset_index()
)

# Create a bar chart of the row total by season:
so.barplot(data=season_agg_eply_df, x='season', y='row_total')

In [None]:
# Keep only the rows whose position text contains "MF", then summarise each
# of those position labels (row total, mean 'age', mean 'MP'):
midfield_epl_df = (
    clean_epl_df_copy
    .loc[lambda frame: frame['position'].str.contains("MF")]
    .groupby('position')
    .agg(row_total=('season', 'count'), age=('age', 'mean'), MP=('MP', 'mean'))
    .reset_index()
)

# Display the summary:
midfield_epl_df

In [None]:
# Plot row totals by MIDFIELD sub-positions:
# open a new 9x5 figure first so the bar plot is drawn onto it.
plt.figure(figsize=(9,5))
so.barplot(x='position', y='row_total', data=midfield_epl_df, color='darkcyan')
# Set the axis labels after plotting so they replace the default ones.
plt.xlabel('Midfield Sub-Position')
plt.ylabel('Number of Players')

Aggregate at the team-position level:

In [None]:
# Aggregate on the team level: row total plus the per-row mean of each
# listed statistic, ordered from the highest mean 'Gls' to the lowest.
team_agg_eply_df = (
    clean_epl_df_copy
    .reset_index()
    .groupby('team')
    .agg({
        'league': 'count',
        'age': 'mean',
        'Gls': 'mean',
        'SoT%': 'mean',
        'SCA90': 'mean',
    })
    .rename(columns={'league': 'row_total'})
    .reset_index()
    .sort_values('Gls', ascending=False)
)

# Horizontal barplot of the mean 'Gls' by team:
plt.figure(figsize=(12, 10))
so.barplot(data=team_agg_eply_df, x='Gls', y='team')

In [None]:
# Key statistics to summarise; every one of them is aggregated with 'mean'.
# The descriptions are the original author's labels for the columns.
test_stat_dict = dict.fromkeys(
    [
        'age',
        'Gls.1',   # goals/90
        'Cmp%.1',  # medium pass completion rate
        'SoT%',    # shot-on-target %
        'SCA90',   # shot-creating actions per 90
        'Tkl%',    # successful tackle %
        'Succ%',   # successful take-on %
    ],
    'mean',
)

# Per-team table of these statistics for the EPL frame, positions containing 'MF', season 2122:
test_df = key_stats_table(
    df=clean_epl_df_copy,
    stat_dict=test_stat_dict,
    pos='MF',
    season=2122,
)
test_df

Loop over the Merged Data folder to get all five leagues cleaned and into pandas:

In [None]:
# Import one dataframe per league, in the order given by league_ids:
league_df_list = [
    import_merged_data(os.path.join(merged_data_dir, f"{league}_full_merge.xlsx"))
    for league in league_ids
]

# Clean a deep copy of each import: strip the unnamed columns, then drop
# the rows whose 'position' is missing.
cleaned_league_df_list = [
    remove_unnamed_cols(raw_df.copy(deep=True)).dropna(subset=['position'])
    for raw_df in league_df_list
]

In [None]:
# One per-team summary table per league: positions containing 'MF', season 2223.
mid_2223_stat_df_list = [
    key_stats_table(df=league_frame, stat_dict=test_stat_dict, pos='MF', season=2223)
    for league_frame in cleaned_league_df_list
]

# Output the table at list position 1 (the second league in the list):
mid_2223_stat_df_list[1]