In [1]:
import pandas as pd
import glob
import os

In [2]:
data_dir = '../data'
genders = ['Men', 'Women']
# Map each gender to the pathnames of its relevant CSVs (files named FSA_DS_<gender>*).
# glob.glob() returns matches in an arbitrary, filesystem-dependent order, so the
# matches are sorted to give every run (and every machine) the same file order.
csv_lists = {
    gender: sorted(glob.glob(os.path.join(data_dir, 'FSA_DS_{}*'.format(gender))))
    for gender in genders
}

In [3]:
# Build one master table per gender, indexed by Rider ID, from that gender's
# yearly CSVs, then export each master table to <data_dir>/<gender>.csv.
# Year-specific columns (team, price, UCI category, age) get the year appended
# to their name; all other columns keep the values from the first file in
# which a rider appears.

# Instantiate a dict of empty DataFrames to collect all our data
dfs = {gender: pd.DataFrame() for gender in genders}

# Columns used as the key when joining an age table onto a CSV
merge_keys = ['Country', 'UCI Team', 'Rider Name']

for gender in genders:
    # Give the master DataFrame index a meaningful name
    dfs[gender].index.name = 'Rider ID'

    # Iterate in sorted order: the result depends on which file is seen first
    # (row order, column order, source of the shared columns), so a fixed
    # order keeps the exported CSV reproducible.
    for csv_path in sorted(csv_lists[gender]):
        # Read in CSV
        df = pd.read_csv(csv_path)
        # Calculate two columns based on filename
        df['Gender'] = gender
        # The year is taken from the four characters that precede a
        # four-character extension such as '.csv'
        year = int(csv_path[-8:-4])
        df['Year'] = year
        # Clean up identified issues in Rider Name column.
        # regex=False makes both replacements literal on every pandas version
        # (a lone backslash is not a valid regular expression).
        rider_names = df['Rider Name'].str.strip()
        rider_names = rider_names.str.replace('  ', ' ', regex=False)
        df['Rider Name'] = rider_names.str.replace('\\', '', regex=False)

        # Identify correct age table (birth year and UCI category aren't included in the downloadable CSVs)
        age_pattern = os.path.join(data_dir, 'age_{}_{}*'.format(gender, year))
        age_matches = sorted(glob.glob(age_pattern))
        if not age_matches:
            # Fail with the offending pattern instead of a bare IndexError
            raise FileNotFoundError(
                'No age table matching {!r} (needed for {!r})'.format(age_pattern, csv_path))
        age = pd.read_csv(age_matches[0], sep='\t', encoding='utf-8')
        # Remove trailing spaces from data and column names
        for col in age.columns:
            if age[col].dtype == 'object':
                age[col] = age[col].str.strip()
        age.columns = age.columns.str.strip()

        if gender == 'Men':
            # Harmonize this table's column names with the CSV's
            age = age.rename(columns={'Rider': 'Rider Name'})
            # Columns to bring over from the age table
            age_cols = ['UCI Cat', 'Born']
            # Columns that will change from year to year
            old_cols = ['UCI Team', 'Price', 'UCI Cat']
        else:  # The women's table has fewer and differently named columns
            age = age.rename(columns={'Rider': 'Rider Name', 'Trade Team': 'UCI Team'})
            age_cols = ['Born']
            old_cols = ['UCI Team', 'Price']

        # Join birth year (and, for men, UCI category) onto main table
        df = df.merge(age[merge_keys + age_cols], how='left', on=merge_keys)

        # Append the current year to columns that will change from year to year
        new_cols = {col: '{} {}'.format(col, year) for col in old_cols}
        df = df.rename(columns=new_cols)
        # Calculate the riders' age in this year
        df['Age {}'.format(year)] = df['Year'] - df['Born']

        # Drop unnecessary columns
        df = df.drop(columns=['Rank', 'Score {}'.format(year - 1), 'Year'])
        # Set a useful index
        df = df.set_index('Rider ID')

        master = dfs[gender]
        if master.shape == (0, 0):
            # Nothing collected yet: this file's table becomes the master
            # (its index is already named 'Rider ID' by set_index above)
            dfs[gender] = df
        else:
            already_known = df.index.isin(master.index)
            # Prepare to join new columns to old rows...
            to_join = df.loc[already_known, ~df.columns.isin(master.columns)]
            # ...and to append new rows to old columns...
            to_append = df.loc[~already_known, :]
            # ...then do both! (pd.concat replaces DataFrame.append, which
            # was removed in pandas 2.0)
            dfs[gender] = pd.concat([master.join(to_join), to_append])

    # Export the processed data to CSV
    dfs[gender].to_csv(os.path.join(data_dir, gender + '.csv'))