In [1]:
%pip install pandas regex -q
import os
import pandas as pd
import re

Note: you may need to restart the kernel to use updated packages.


In [2]:
def clean_data(file_path: str, directory: str = '') -> pd.DataFrame:
    '''
    clean_data(file_path: str, directory: str = '')

    Clean the data frame to fix column names and types

    Parameters
    ----------
    file_path: str
        Path to the raw dataframe scraped from Pro Football Reference
    directory: str, optional
        Directory to write output CSV files. It is created when missing; the
        default '' writes into the current working directory.

    Returns
    -------
    ff_year: DataFrame
        Dataframe with cleaned column names and coerced types

    Raises
    ------
    ValueError
        If a column that should be numeric holds a value pd.to_numeric cannot parse

    Notes
    -----
    Also outputs processed CSV files into specified directory. The output file
    is named after the input file, with '_cleaned.csv' replacing its extension.
    '''
    ff_year = pd.read_csv(file_path)

    # The first data row carries the second half of every column name; the
    # parsed header carries the first half (the prefix).
    header_row = ff_year.iloc[0]
    ff_year.columns = [
        _fix_feature_name(prefix, name)
        for prefix, name in zip(header_row.index, header_row)
    ]
    # Drop that row and every repeat of it: they all hold the header text
    # (the original name of the first column) in the 'rk' column.
    ff_year = ff_year[ff_year['rk'] != header_row.iloc[0]].reset_index(drop=True)

    # Remove trailing '+', '*' and '^' markers from the player column.
    # The vectorised form leaves missing names as NaN instead of raising.
    ff_year['player'] = ff_year['player'].str.replace(r'[+*^]+$', '', regex=True)

    # Coerce numeric columns to float, int
    factor_columns = ['tm', 'fantpos']
    numeric_columns = ff_year.drop(columns=['player'] + factor_columns).columns
    for col in numeric_columns:
        ff_year[col] = pd.to_numeric(ff_year[col])
    # Coerce categorical columns to factors
    for col in factor_columns:
        ff_year[col] = ff_year[col].astype('category')
    # NA values treated as 0 (label slice: every column from 'games_g' through 'fantasy_vbd')
    ff_year.loc[:, 'games_g':'fantasy_vbd'] = ff_year.loc[:, 'games_g':'fantasy_vbd'].fillna(0)

    # Derive the output name with os.path rather than a regex so that names
    # containing '-' or paths without a '/' are handled too.
    file_stub = os.path.splitext(os.path.basename(file_path))[0]
    if directory:
        os.makedirs(directory, exist_ok=True)
    # os.path.join works whether or not `directory` ends with a separator.
    ff_year.to_csv(os.path.join(directory, file_stub + '_cleaned.csv'), index=False)
    return ff_year


def _fix_feature_name(prefix: str, name: str) -> str:
    '''
    _fix_feature_name(prefix: str, name: str)

    Adjust column names to be more descriptive (collapse prefix and name into one)

    Parameters
    ----------
    prefix: str
        The first part of the column name, with periods and 'Unnamed'
    name: str
        The second part of the column name

    Returns
    -------
    str
        Lower-cased column name: prefix and name separated by '_', or the name
        alone when the prefix contains 'Unnamed'
    '''
    # Only the text before the first period is kept as the prefix.
    group = prefix.split('.')[0]
    if 'Unnamed' in group:
        return name.lower()
    return group.lower() + '_' + name.lower()

In [3]:
# Input and output folders; both end with '/' so they can be used as plain string prefixes.
RAW_DIR = 'data/raw/'
PROCESSED_DIR = 'data/processed/'

# os.listdir returns entries in arbitrary order, so sort the listing to keep
# ff_year_files (and therefore ff_data_cleaned) in the same order on every run.
ff_year_files = [
    RAW_DIR + file_name
    for file_name in sorted(os.listdir(RAW_DIR))
    if file_name.endswith('.csv')
]
# One cleaned DataFrame per raw CSV, in the same order as ff_year_files.
ff_data_cleaned = [
    clean_data(file_path=raw_path, directory=PROCESSED_DIR)
    for raw_path in ff_year_files
]