In [2]:
import os
import pandas as pd
from fuzzywuzzy import process



## Grab Player Metadata

In [3]:
def create_player_meta_csv(season:str, 
                           save_dir:str=os.path.join('clean_data', 'meta')):
    """
    Create a CSV of player metadata (name, position, team) for one season.

    Reads every ``*.csv`` file in ``raw_data/<season>/gws``, keeps only the
    ``name``, ``position`` and ``team`` columns, and writes one row per
    distinct ``name`` to ``<save_dir>/player_meta_<season>.csv``.

    Files are processed in a deterministic order (shorter filenames first,
    then alphabetical, so ``gw2.csv`` comes before ``gw10.csv``). When a name
    appears in several files, the row from the first file in that order is
    the one that is kept.

    If the season's ``gws`` folder is missing, or contains no CSV files, a
    message is printed and nothing is written.

    :param str season: Which season of EPL data to process metadata for.
        Should follow the format 20XX-X(X+1), e.g. ``'2020-21'``.
    :param str save_dir: Directory the metadata CSV is written into. It is
        created if it does not already exist.

    :returns: Nothing
    :rtype: None
    """
    RAW_DATA_DIR = 'raw_data'
    META_COLUMNS = ['name', 'position', 'team']

    # Bail out early if raw_data/<season>/gws is not there.
    season_dir = os.path.join(RAW_DATA_DIR, season, 'gws')
    if not os.path.isdir(season_dir):
        print(f"No data found for {season}/gws. Exiting.")
        return

    # os.listdir() returns entries in arbitrary order. Because duplicates are
    # resolved with keep='first' below, the file order decides which row
    # survives for each player, so it must be reproducible. Sorting by
    # (length, name) also puts gw2.csv ahead of gw10.csv.
    csv_filenames = sorted(
        (entry for entry in os.listdir(season_dir) if entry.endswith(".csv")),
        key=lambda entry: (len(entry), entry),
    )
    if not csv_filenames:
        # pd.concat() raises ValueError on an empty list; report it instead.
        print(f"No CSV files found in {season_dir}. Exiting.")
        return

    # One trimmed frame per file.
    meta_data_list = [
        pd.read_csv(os.path.join(season_dir, filename))[META_COLUMNS]
        for filename in csv_filenames
    ]

    # Concatenate all gw metadata (we do this because players may transfer
    # in and out of the league), then keep a single row per player name.
    # Only the first sighting of each name is retained; later changes of
    # position or team within the season are discarded.
    meta_data_df = (
        pd.concat(meta_data_list, axis=0, ignore_index=True)
        .drop_duplicates(subset=['name'], keep='first')
    )

    # Write the cleaned dataframe to a CSV file.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f'player_meta_{season}.csv')
    meta_data_df.to_csv(save_path, index=False)
    print(f"Player metadata saved to {save_path}")

In [4]:
# Build the per-season player metadata file used by the later cells.
create_player_meta_csv(season='2020-21')

Player metadata saved to clean_data/meta/player_meta_2020-21.csv


In [15]:
def append_player_metadata(season: str, 
                            save_dir: str = os.path.join('clean_data', 'players'),
                            verbose:bool = False):
    """
    Append player metadata columns to each player's gw-by-gw CSV.

    For every sub-folder of ``raw_data/<season>/players`` this loads that
    folder's ``gw.csv``, fuzzy-matches the folder name (underscores replaced
    by spaces) against the ``name`` column of
    ``clean_data/meta/player_meta_<season>.csv``, copies the matched
    metadata row's values onto every row of the gameweek data, and writes
    the result to disk.

    :param str season: Which season of EPL data to process metadata for.
        Should follow the format 20XX-X(X+1), e.g. ``'2020-21'``.
    :param str save_dir: Data folder used when building the output path for
        each player's augmented CSV.
    :param bool verbose: If True, print the destination directory for each
        player as it is saved.

    :returns: Nothing
    :rtype: None
    :raises ValueError: If two player folders fuzzy-match the same metadata
        name.
    """
    RAW_DATA_DIR = 'raw_data'
    PLAYER_META_CSV_PATH = os.path.join('clean_data', 'meta', f'player_meta_{season}.csv')

    # Load player metadata CSV
    player_meta_df = pd.read_csv(PLAYER_META_CSV_PATH)

    # Collect every player folder under raw_data/<season>/players. Sorted so
    # that processing order (and therefore which folder trips the
    # duplicate-match check) does not depend on filesystem ordering.
    players_root = os.path.join(RAW_DATA_DIR, season, 'players')
    player_dirs = sorted(
        entry for entry in os.listdir(players_root)
        if os.path.isdir(os.path.join(players_root, entry))
    )

    # Metadata names already assigned to a folder; a set gives O(1) lookups.
    used_names = set()

    for player_folder in player_dirs:
        # Load player gw CSV
        player_csv_path = os.path.join(players_root, player_folder, 'gw.csv')
        player_gw_df = pd.read_csv(player_csv_path)

        # Fuzzy match the folder name to the closest entry in the metadata
        # 'name' column. The first element of the result is the matched name.
        matches = process.extractOne(player_folder.replace('_', ' '),
                                     player_meta_df['name'])
        top_match = matches[0]

        if top_match in used_names:
            raise ValueError(f"Attempted to assign an already matched player name. Player Folder: {player_folder}. Top Match: {top_match}")

        player_metadata = player_meta_df[player_meta_df['name'] == top_match]

        # Broadcast each metadata value onto every gameweek row. Assigning the
        # one-row Series directly would align on index labels, filling at most
        # the single row whose label equals the metadata row's label and
        # leaving every other row NaN, so take the scalar instead.
        for col in player_metadata.columns:
            player_gw_df[col] = player_metadata[col].iloc[0]

        # Save the updated player_gw_df.
        # NOTE(review): the output lands at
        # <season>/<save_dir>/<player_folder>/<player_folder> -- season comes
        # before save_dir and the file has no .csv extension, unlike
        # create_player_meta_csv which writes directly under its save_dir.
        # Left unchanged so existing outputs do not move; confirm the
        # intended layout.
        full_save_dir = os.path.join(season, save_dir, player_folder)
        os.makedirs(full_save_dir, exist_ok=True)

        if verbose:
            print(f"Saving appended player data to: {full_save_dir}")
        player_gw_df.to_csv(os.path.join(full_save_dir, player_folder),
                            index=False)

        # Mark the player name as used
        used_names.add(top_match)

In [None]:
# Merge the season's player metadata into each player's gameweek data,
# printing every destination directory as it is written.
append_player_metadata(season='2020-21', verbose=True)