In [48]:
import os
import pandas as pd
from fuzzywuzzy import process
import warnings
from urllib.parse import unquote   #for cleanup of bad folder player names from encoded URLS
import re

### Helper Func for Cleaning Player Folder Names for Fuzzy Match

In [49]:
def decode_url(url):
    # Define a regular expression pattern to match URL-encoded sequences
    pattern = re.compile('%[0-9a-fA-F]{2}')

    # Use a lambda function to replace each match with its decoded equivalent
    decoded_url = pattern.sub(lambda x: unquote(x.group(0)), url)

    return decoded_url

## Grab Player Metadata

In [50]:
def create_player_meta_csv(season:str, 
                           save_dir:str=os.path.join('clean_data', 'meta')):
    """ 
    Create a csv of player metadata.

    :param str season: Which season of EPL data to process metadata for. Should 
        follow the format 20XX-X(X+1).
    :param str save_dir: path within repo to desired data folder for the metadata
      /path/to/data

    :returns: Nothing
    :rtype: None
    """
    RAW_DATA_DIR = 'raw_data'
    meta_data_list = []

    # Check if raw_data/season/gws exists
    season_dir = os.path.join(RAW_DATA_DIR, season, 'gws')
    if not os.path.exists(season_dir):
        print(f"No data found for {season}/gws. Exiting.")
        return

    # Iterate through all CSV files in gws folder
    for filename in os.listdir(season_dir):
        if filename.endswith(".csv"):
            csv_path = os.path.join(season_dir, filename)
            df = pd.read_csv(csv_path)
            df = df.loc[:,['name', 'position', 'team']]

            meta_data_list.append(df)

    # Concatenate all gw metadata (we do this because players may transfer
    # in and out of the league)
    meta_data_df = pd.concat(meta_data_list, axis=0, ignore_index=True)
    meta_data_df = meta_data_df.drop_duplicates(subset=['name'], keep='first')
    
    # Check if 'name', 'position', 'team' are consistent for each 'name'
    #assert meta_data_df.groupby('name')['position'].nunique().eq(1).all()
    #assert meta_data_df.groupby('name')['team'].nunique().eq(1).all()

    # Write the cleaned dataframe to a CSV file
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, f'player_meta_{season}.csv')
    meta_data_df.to_csv(save_path, index=False)
    print(f"Player metadata saved to {save_path}")

    return

## Reorder and Clean Cols

In [51]:
def reorder_and_limit_gwdata_cols(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    Reorders and limits columns of the provided dataframe to the following 
    format:

    cols = name, position, team, opponent_team, minutes, goals_scored, assists, 
    goals_conceded, clean_sheets, bps, yellow_cards, red_cards, own_goals, 
    saves, penalties_missed, penalties_saved, ict_index, influence, creativity, 
    threat, was_home, total_points 
    """
    df = df.copy()
    # Define the desired column order
    desired_cols = ['name', 'position', 'team', 'opponent_team', 'minutes', 
                    'goals_scored', 'assists', 'goals_conceded', 'clean_sheets', 
                    'bps', 'yellow_cards', 'red_cards', 'own_goals', 'saves', 
                    'penalties_missed', 'penalties_saved', 'ict_index', 
                    'influence', 'creativity', 'threat', 'was_home', 
                    'total_points']

    # Reorder and limit the columns
    df = df[desired_cols]

    return df

def fill_in_opponent_team(df: pd.DataFrame, season: str = '2020-21') -> pd.DataFrame:
    """ 
    Fills in the actual opponent team name from the team ID.

    The `master_team_list.csv` file must be downloaded (see `get_master_team_list`
    in `scrape_fpl_data`).
    """
    df = df.copy() 
    
    CLEAN_DATA_DIR = 'clean_data'
    MASTER_TEAM_LIST_PATH = os.path.join(CLEAN_DATA_DIR, 'master_team_list.csv')

    # Load the master_team_list.csv
    master_team_list = pd.read_csv(MASTER_TEAM_LIST_PATH)

    # Create a mapping dictionary using both 'season' and 'team' columns
    mapping_df = master_team_list[(master_team_list['season'] == season)][['team', 'team_name']]
    team_mapping = dict(list(zip(mapping_df['team'], mapping_df['team_name'])))
    #print(team_mapping)

    # Map the opponent_team column using the created mapping
    df['opponent_team'] = df['opponent_team'].apply(lambda team: team_mapping[team])
    
    return df

In [52]:
def clean_player_data(season: str, 
                    save_dir: str = os.path.join('clean_data'),
                    verbose:bool = False):
    """
    Appends player metadata into player gw-by-gw CSVs. Also cleans player data 
    cols.

    :param str season: Which season of EPL data to process metadata for. Should 
        follow the format 20XX-X(X+1).
    :param str save_dir: path within repo to desired data folder for the metadata
        /path/to/data

    :returns: Nothing
    :rtype: None
    """
    RAW_DATA_DIR = 'raw_data'
    PLAYER_META_CSV_PATH = os.path.join('clean_data', 'meta', f'player_meta_{season}.csv')

    # Load player metadata CSV
    player_meta_df = pd.read_csv(PLAYER_META_CSV_PATH)

    # Iterate through all player_name_folders in the raw_data/players directory
    player_dirs = [f for f in os.listdir(os.path.join(RAW_DATA_DIR, season, 'players')) 
                   if os.path.isdir(os.path.join(RAW_DATA_DIR, season, 'players', f))]
    
    used_names = []

    for player_folder in player_dirs:
        player_name = decode_url(player_folder)
        
        if verbose:
            print(f"Appending metadata via fuzzy match for player: \n{player_folder}\n")
        player_folder_path = os.path.join(RAW_DATA_DIR, season, 'players', player_folder)
        player_csv_path = os.path.join(player_folder_path, 'gw.csv')

        # Load player gw CSV
        if verbose:
            print(f"Reading player data for {player_csv_path}")
        player_gw_df = pd.read_csv(player_csv_path)

        # Fuzzy match based on the closest name in PLAYER_META_CSV
        # with player name extracted via raw data player directory name. 
        # column name to match on is 'name'.
        matches = process.extractOne(player_name.replace('_', ' '), 
                                     player_meta_df['name'])
        top_match = matches[0]

        if top_match in used_names:
            warnings.warn(("Attempted to assign an already matched" 
                          f"player name. Player Folder: {player_folder}." 
                          f"Top Match: {matches[0]}. Skipping this player."))
            continue
        else:
            player_metadata = player_meta_df[player_meta_df['name'] == matches[0]]

            # Append player_metadata columns to player_gw_df
            for col in player_metadata.columns:
                player_gw_df[col] = player_metadata[col].values[0]

            #Clean up player_gw_df columns
            player_gw_df = reorder_and_limit_gwdata_cols(player_gw_df)

            #Fill in Opponent Team
            player_gw_df = fill_in_opponent_team(player_gw_df)

            # Save the updated player_gw_df to the appropriate path
            full_save_dir = os.path.join(save_dir, season, 'players', player_name)
            if not os.path.exists(full_save_dir):
                os.makedirs(full_save_dir, exist_ok=True)


            if verbose: 
                print(f"Saving appended player data to: {full_save_dir}")
            player_gw_df.to_csv(os.path.join(full_save_dir, 'gw.csv'), 
                                index=False)
            
            # Mark the player name as used
            used_names.append(top_match)

    return

## Organize Player Data by Position

In [53]:
def organize_data_by_pos(data_dir: str,
                         verbose: bool = False):
    """
    Organizes player data by position, creating new folders for GKP, DEF, MID, FWD.

    :param str data_dir: Path to the top-level directory containing season and player data.

    :returns: None
    :rtype: None
    """
    POSITIONS = ['GK', 'DEF', 'MID', 'FWD']

    unique_positions = set()

    # Iterate through all seasons (top-level folders)
    top_level_folders = os.listdir(data_dir)
    season_folders = [folder for folder in top_level_folders if not folder == 'meta']
    for season_folder in season_folders:
        season_path = os.path.join(data_dir, season_folder)

        if os.path.isdir(season_path):
            # Iterate through all players within each season
            players_path = os.path.join(season_path, 'players')
            for player_folder in os.listdir(players_path):
                player_path = os.path.join(season_path, 'players', player_folder)

                if os.path.isdir(player_path):
                    # Read in the .csv file in the player's folder
                    player_csv = os.path.join(player_path, "gw.csv")

                    if os.path.isfile(player_csv):
                        player_data = pd.read_csv(player_csv)
                        position = player_data.loc[0, 'position']

                        # Place each player file back into position/season/playername/gw.csv
                        player_pos_folder = os.path.join(season_path, position)
                        os.makedirs(os.path.join(player_pos_folder, player_folder), exist_ok=True)
                        new_player_csv = os.path.join(player_pos_folder, player_folder, "gw.csv")

                        if verbose: 
                            print(f'Copying {player_csv} into {new_player_csv}')
                        player_data.to_csv(new_player_csv, index=False)

    return

In [54]:
create_player_meta_csv('2020-21')
clean_player_data('2020-21')

Player metadata saved to clean_data/meta/player_meta_2020-21.csv




In [55]:
create_player_meta_csv('2021-22')
clean_player_data('2021-22')

Player metadata saved to clean_data/meta/player_meta_2021-22.csv




In [56]:
organize_data_by_pos('clean_data', verbose=True)

Copying clean_data/2021-22/players/Nathaniel_Clyne_493/gw.csv into clean_data/2021-22/DEF/Nathaniel_Clyne_493/gw.csv
Copying clean_data/2021-22/players/Jarell_Quansah_660/gw.csv into clean_data/2021-22/DEF/Jarell_Quansah_660/gw.csv
Copying clean_data/2021-22/players/Shane_Duffy_56/gw.csv into clean_data/2021-22/DEF/Shane_Duffy_56/gw.csv
Copying clean_data/2021-22/players/Bryan_Gil Salvatierra_487/gw.csv into clean_data/2021-22/MID/Bryan_Gil Salvatierra_487/gw.csv
Copying clean_data/2021-22/players/Tariqe_Fosu-Henry_79/gw.csv into clean_data/2021-22/MID/Tariqe_Fosu-Henry_79/gw.csv
Copying clean_data/2021-22/players/Bruno_Guimarães Rodriguez Moura_697/gw.csv into clean_data/2021-22/MID/Bruno_Guimarães Rodriguez Moura_697/gw.csv
Copying clean_data/2021-22/players/Moussa_Djenepo_350/gw.csv into clean_data/2021-22/MID/Moussa_Djenepo_350/gw.csv
Copying clean_data/2021-22/players/Josh_Martin_330/gw.csv into clean_data/2021-22/MID/Josh_Martin_330/gw.csv
Copying clean_data/2021-22/players/Ben_M