In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import glob
import os

# Set global parameters
# Relative path to the folder that holds the input files (season CSVs and age tables);
# the processed CSVs produced by the cells below are written to the same folder.
data_dir = '../data'
# Gender labels; each label is substituted into the input filename patterns and
# used as the prefix of the output filenames.
genders = ['Men', 'Women']

# Wide Format

Each row corresponds to one rider; year-dependent columns (e.g. `Price 2010`, `Score 2010`) are repeated for each season from 2010 to 2020.

In [2]:
# Build the wide-format tables: one row per rider, with a separate set of
# year-suffixed columns (team, price, score, age, ...) for every season on disk.

# Instantiate a dict of empty DataFrames to collect all our data
dfs = {gender: pd.DataFrame() for gender in genders}

csv_lists = {}

for gender in genders:
    # Give the master DataFrame index a meaningful name
    dfs[gender].index.name = 'Rider ID'

    # Generate a list of pathnames to relevant CSVs. glob returns paths in an
    # arbitrary, OS-dependent order, so sort by the year embedded in the filename:
    # this makes the output column order reproducible, and fixes which season
    # supplies the non-year-suffixed columns (the first season a rider appears in).
    csv_lists[gender] = sorted(
        glob.glob(os.path.join(data_dir, 'FSA_DS_{}*'.format(gender))),
        key=lambda path: (path[-8:-4], path),
    )

    for csv_path in csv_lists[gender]:  # Iterate through CSVs
        # Read in CSV
        df = pd.read_csv(csv_path)
        # Calculate two columns based on filename; the path must end with a
        # four-digit year followed by a four-character extension (e.g. '2015.csv')
        df['Gender'] = gender
        year = int(csv_path[-8:-4])
        df['Year'] = year
        # Clean up identified issues in Rider Name column. regex=False makes both
        # replacements literal regardless of the installed pandas version's default
        # (a lone backslash is not a valid regular expression).
        df['Rider Name'] = (
            df['Rider Name']
            .str.strip()
            .str.replace('  ', ' ', regex=False)
            .str.replace('\\', '', regex=False)
        )

        # Identify correct age table (birth year and UCI category aren't included in the downloadable CSVs)
        age_paths = sorted(glob.glob(os.path.join(data_dir, 'age_{}_{}*'.format(gender, year))))
        if not age_paths:
            # Fail with an explicit message instead of a bare IndexError
            raise FileNotFoundError(
                'No age table matching age_{}_{}* found in {}'.format(gender, year, data_dir)
            )
        age_path = age_paths[0]
        age = pd.read_csv(age_path, sep='\t', encoding='utf-8')
        # Remove trailing spaces from data and column names
        for col in age.columns:
            if age[col].dtype == 'object':
                age[col] = age[col].str.strip()
        age.columns = age.columns.str.strip()

        merge_keys = ['Country', 'UCI Team', 'Rider Name']
        if gender == 'Men':
            # Harmonize this table's column names with the CSV's
            age = age.rename(columns={'Rider': 'Rider Name'})
            # Birth year and UCI category get joined onto the main table
            age_cols = merge_keys + ['UCI Cat', 'Born']
            # Columns that change from year to year get the current year appended
            old_cols = ['UCI Team', 'Price', 'UCI Cat']
        else:  # The women's table has fewer and differently named columns
            age = age.rename(columns={'Rider': 'Rider Name', 'Trade Team': 'UCI Team'})
            age_cols = merge_keys + ['Born']
            old_cols = ['UCI Team', 'Price']

        # Keep one lookup row per key so the left merge cannot duplicate rider rows
        # (duplicated riders would break the Rider ID index used below).
        # NOTE(review): assumes repeated keys in an age table are unintended - confirm.
        age_lookup = age[age_cols].drop_duplicates(subset=merge_keys)
        df = df.merge(age_lookup, how='left', on=merge_keys)

        new_cols = {col: '{} {}'.format(col, year) for col in old_cols}
        df = df.rename(columns=new_cols)
        # Calculate the riders' age in this year
        df['Age {}'.format(year)] = df['Year'] - df['Born']

        # Drop unnecessary columns
        df = df.drop(columns=['Rank', 'Score {}'.format(year - 1), 'Year'])
        # Set a useful index
        df = df.set_index('Rider ID')

        # Prepare to join new columns to old rows...
        known_rider = df.index.isin(dfs[gender].index)
        new_column = ~df.columns.isin(dfs[gender].columns)
        to_join = df.loc[known_rider, new_column]
        # ...and to append new rows to old columns...
        to_append = df.loc[~known_rider, :]
        # ...then do both! (DataFrame.append was removed in pandas 2.0; pd.concat
        # is the equivalent row-wise append.)
        dfs[gender] = pd.concat([dfs[gender].join(to_join), to_append])

    # Export the processed data to CSV
    dfs[gender].to_csv(os.path.join(data_dir, gender + '_Wide.csv'))

# Lookback Record Format

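Each row corresponds to a retrospective view for one rider, looking back from one observation year. Column suffixes give the number of years before that observation year, from `0ya` (the observation year itself) to `10ya` (ten years earlier).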

In [3]:
# Build the lookback records from the wide-format tables in `dfs`: for every
# observation year, one row per rider priced in that year, with that year's
# values ("0ya") followed by the values from each earlier season ("1ya", "2ya", ...).
# NOTE: this cell needs `dfs` to hold the wide-format tables, so it must run after
# the Wide Format cell and before `dfs` is rebound by the Long Format cell.

# Instantiate a dict to collect all our records
records = {gender: None for gender in genders}

for gender in genders:
    wide_df = dfs[gender]

    # Extract all years for which we have data (sorted so the output is deterministic)
    years = sorted(int(col[-4:]) for col in wide_df.columns if 'Price ' in col)

    # Define which columns we're interested in from each year
    if gender == 'Men':
        year_vars = ['Age', 'UCI Team', 'UCI Cat', 'Price', 'Score']
    else:
        year_vars = ['Age', 'UCI Team', 'Price', 'Score']

    # Create a list to house yearly observations for later concatenation
    yearly_records = []

    # Earliest season in the data; computed once rather than on every loop pass
    first_year = years[0] if years else None

    for records_year in years:  # Iterate through observation years
        year_df = None

        # Step back in time from the observation year to the beginning of our data.
        # A separate loop variable is used so the outer loop's year is never mutated.
        for years_ago, year in enumerate(range(records_year, first_year - 1, -1)):
            # Identify the columns of interest
            cols = ['{} {}'.format(col, year) for col in year_vars]
            # Select the columns of interest for the rows with non-null Price in that year.
            # Copy so later column assignment never writes into a slice of the wide table.
            df = wide_df.loc[wide_df['Price {}'.format(year)].notna(), cols].copy()
            # Rename columns in terms of years ago (from records_year)
            df.columns = ['{} {}ya'.format(col, years_ago) for col in year_vars]

            # Either create or add to the collection of records
            if year_df is None:
                year_df = df
            else:
                # Left join: keep only riders already present for the observation year
                year_df = year_df.join(df)

        # Add non-year-dependent fields to this observation year's records
        year_df['Observation Year'] = records_year
        year_df = year_df.join(wide_df[['Rider Name', 'Country']])
        # Append this set of observations to a list for later concatenation
        yearly_records.append(year_df)

    # Concatenate records from all observation years into one DataFrame
    # (pd.concat raises on an empty list, so fall back to an empty frame)
    records[gender] = pd.concat(yearly_records) if yearly_records else pd.DataFrame()

    # Export all records to CSV
    records[gender].to_csv(os.path.join(data_dir, gender + '_Lookback.csv'))

# Long Format

Each row corresponds to one rider in one year. This is the format we need to feed into an RNN.

In [4]:
# Build the long-format tables: one row per rider per season, with gap seasons
# between a rider's first and last observed years filled in.
# NOTE: this rebinds `dfs`, replacing the wide-format tables built earlier; re-run
# the Wide Format cell before re-running anything that expects wide tables in `dfs`.

# Instantiate a dict of empty DataFrames to collect all our data
dfs = {gender: pd.DataFrame() for gender in genders}

csv_lists = {}

for gender in genders:
    # Instantiate a list to collect annual DataFrames
    yearly_dfs = []

    # Generate a list of pathnames to relevant CSVs (sorted by the year embedded in
    # the filename, because glob's own ordering is arbitrary)
    csv_lists[gender] = sorted(
        glob.glob(os.path.join(data_dir, 'FSA_DS_{}*'.format(gender))),
        key=lambda path: (path[-8:-4], path),
    )
    if not csv_lists[gender]:
        # pd.concat below would otherwise fail with an unhelpful "No objects to concatenate"
        raise FileNotFoundError(
            'No CSVs matching FSA_DS_{}* found in {}'.format(gender, data_dir)
        )

    # Only the men's age tables carry a UCI category column
    has_uci_cat = gender == 'Men'

    for csv_path in csv_lists[gender]:  # Iterate through CSVs
        # Read in CSV
        df = pd.read_csv(csv_path)
        # Calculate year column based on filename; the path must end with a
        # four-digit year followed by a four-character extension (e.g. '2015.csv')
        year = int(csv_path[-8:-4])
        df['Year'] = year
        # Clean up identified issues in Rider Name column. regex=False makes both
        # replacements literal regardless of the installed pandas version's default
        # (a lone backslash is not a valid regular expression).
        df['Rider Name'] = (
            df['Rider Name']
            .str.strip()
            .str.replace('  ', ' ', regex=False)
            .str.replace('\\', '', regex=False)
        )

        # Identify correct age table (birth year and UCI category aren't included in the downloadable CSVs)
        age_paths = sorted(glob.glob(os.path.join(data_dir, 'age_{}_{}*'.format(gender, year))))
        if not age_paths:
            # Fail with an explicit message instead of a bare IndexError
            raise FileNotFoundError(
                'No age table matching age_{}_{}* found in {}'.format(gender, year, data_dir)
            )
        age_path = age_paths[0]
        age = pd.read_csv(age_path, sep='\t', encoding='utf-8')
        # Remove trailing spaces from data and column names
        for col in age.columns:
            if age[col].dtype == 'object':
                age[col] = age[col].str.strip()
        age.columns = age.columns.str.strip()

        merge_keys = ['Country', 'UCI Team', 'Rider Name']
        if has_uci_cat:
            # Harmonize this table's column names with the CSV's
            age = age.rename(columns={'Rider': 'Rider Name'})
            # Birth year and UCI category get joined onto the main table
            age_cols = merge_keys + ['UCI Cat', 'Born']
        else:  # The women's table has fewer and differently named columns
            age = age.rename(columns={'Rider': 'Rider Name', 'Trade Team': 'UCI Team'})
            age_cols = merge_keys + ['Born']

        # Keep one lookup row per key so the left merge cannot duplicate rider rows
        # (a duplicated rider/year row would make the per-rider reindex below fail).
        # NOTE(review): assumes repeated keys in an age table are unintended - confirm.
        age_lookup = age[age_cols].drop_duplicates(subset=merge_keys)
        df = df.merge(age_lookup, how='left', on=merge_keys)

        # Calculate the riders' age in this year
        df['Age'] = df['Year'] - df['Born']

        # Drop unnecessary columns
        df = df.drop(columns=['Rank', 'Born'])

        # Rename the Score columns
        new_cols = {'Score {}'.format(year - 1): 'Score 1ya',
                    'Score {}'.format(year): 'Score'}
        df = df.rename(columns=new_cols)

        # Reorder columns
        col_order = ['Year', 'Rider ID', 'Rider Name', 'Country', 'UCI Team', 'UCI Cat',
                     'Age', 'Price', 'Score 1ya', 'Score']
        if not has_uci_cat:
            col_order.remove('UCI Cat')
        df = df.reindex(columns=col_order)

        # Append this DataFrame to the yearly list
        yearly_dfs.append(df)

    # Concatenate all DataFrames
    dfs[gender] = pd.concat(yearly_dfs)

    ### Fill in missing rows as necessary:
    # We're just going to fill any missing rows between the rider's first and last years of data
    # Identify riders with missing years (last_year - (first_year-1) > num_rows).
    # String aliases are used because passing the builtins max/min to agg is deprecated.
    missing = dfs[gender].groupby('Rider ID')['Year'].agg(['max', 'min', 'count'])
    missing = missing[missing['max'] - (missing['min'] - 1) > missing['count']]

    # Set aside data for riders who aren't missing rows
    complete = dfs[gender][~dfs[gender]['Rider ID'].isin(missing.index)]

    # Create missing rows for each rider who's missing data
    filled = []
    for rider_id in missing.index:  # iterate over riders with missing data
        df = dfs[gender][dfs[gender]['Rider ID'] == rider_id]  # select all that rider's data
        df = df.set_index('Year')  # set Year as the index
        # expand the table using a new index including missing years
        df = df.reindex(range(df.index.min(), df.index.max() + 1))
        first_year = df.index.min()
        for col in ['Rider ID', 'Rider Name', 'Country']:
            # Set three unchanging columns to equal the first observed value. Plain
            # column assignment rebuilds the column with the value's natural dtype;
            # writing through .loc[:, col] can keep the float dtype that reindexing
            # introduced, which would turn integer IDs into e.g. 123.0 in the CSV.
            df[col] = df.loc[first_year, col]
        # Calculate rider age for each year from the birth year implied by the earliest
        # row that has an Age. Using only the first row would wipe out every known age
        # whenever that first season failed to match the age table.
        implied_birth_years = (df.index.to_series() - df['Age']).dropna()
        if not implied_birth_years.empty:
            df['Age'] = df.index - implied_birth_years.iloc[0]
        df = df.reset_index()  # pull Year back out of the index
        filled.append(df)  # append to list for later concatenation

    # Concatenate and fill remaining missingness in these expanded DataFrames.
    # UCI Team is explicitly left as NaN for the synthesized seasons.
    # NOTE(review): 'Score 1ya' is zero-filled even when the previous season has a
    # real Score - confirm that is the intended encoding for gap seasons.
    fill_values = {'UCI Team': np.nan, 'Price': 0, 'Score 1ya': 0, 'Score': 0}
    if filled:
        filled = pd.concat(filled).fillna(fill_values)
        # Now combine the complete and filled portions of the dataset
        combined = pd.concat([complete, filled])
    else:
        # No rider has gaps; pd.concat would raise on the empty list
        combined = complete

    # Sort by rider ID and year
    dfs[gender] = combined.sort_values(['Rider ID', 'Year'])

    dfs[gender].to_csv(os.path.join(data_dir, gender + '_Long.csv'), index=False)