In [86]:
import pandas as pd
import os
import json
import glob

fields = pd.read_json('raw/fields-2012_2025.json')
standards = pd.read_json('raw/standards-2020_2025.json')
downhill_marathons = pd.read_csv('raw/downhill-marathons.csv')

In [87]:
athlinks_slugs = ['tucson-marathon', 'sundance-to-spearfish', 'pocatello', 'leading-ladies', 'jack-and-kills', 'hawaii-bird-conservation']
raceentry_slugs = ['utah-valley-marathon']
ultrasignup_slugs = ['tunnel-vision', 'tunnel-light', 'light-at-the-end-of-the-tunnel']
chronokeep_slugs = ['super', 'cascade-express']
webscorer_slugs = ['sun', 'east-canyon']
runsignup_slugs = ['runtastic-nebo']
brooksee_slugs = ['revel-white-mountains', 'revel-mt-charleston', 'revel-big-bear', 'revel-big-cottonwood', 'deseret-news', 'bears-ears']

race_slugs = {
    'athlinks': athlinks_slugs,
    'raceentry': raceentry_slugs,
    'ultrasignup': ultrasignup_slugs,
    'chronokeep': chronokeep_slugs,
    'webscorer': webscorer_slugs,
    'runsignup': runsignup_slugs,
    'brooksee': brooksee_slugs
}

In [88]:
def conv_formatted_time_to_millis(formatted_time: str):
    """Convert a formatted time string to milliseconds."""
    if formatted_time == 'DNF' or formatted_time == '---' or pd.isna(formatted_time):
        return None
    parts = formatted_time.split(':')
    if len(parts) == 3:  # HH:MM:SS
        hours, minutes, seconds = map(float, parts)
        return (hours * 3600 + minutes * 60 + seconds) * 1000
    elif len(parts) == 2:  # MM:SS
        minutes, seconds = map(float, parts)
        return (minutes * 60 + seconds) * 1000
    else:
        raise ValueError(f'Invalid time format: {formatted_time}')

In [None]:
results_df = pd.DataFrame(columns=['slug', 'date', 'time', 'gender', 'age'])

for provider in race_slugs:
    for race_slug in race_slugs[provider]:
        file_paths = glob.glob(f'raw/marathons/{race_slug}-*.{"json" if provider == "athlinks" else "csv"}')

        for file_path in file_paths:
            print(file_path)
            date = '-'.join(os.path.basename(file_path).split('-')[-3:]).split('.')[0]
            if provider == 'athlinks':
                with open(file_path, 'r') as file:
                    data = json.load(file)
                results = data['intervals'][0]['results']
                temp_df = pd.DataFrame(results)
                temp_df = temp_df[['age', 'gender', 'chipTimeInMillis']]
            elif provider == 'raceentry' or provider == 'runsignup' or provider == 'brooksee':
                temp_df = pd.read_csv(file_path)
                temp_df = temp_df[['Age', 'Gender', 'Chip Time']]
                temp_df['Chip Time'] = temp_df['Chip Time'].apply(conv_formatted_time_to_millis)
            elif provider == 'ultrasignup' or (race_slug == 'super' and date.startswith('2022')) or (race_slug == 'cascade-express' and date.startswith('2024')):
                temp_df = pd.read_csv(file_path)
                temp_df = temp_df[['age', 'gender', 'time_millis']]
            elif provider == 'chronokeep':
                temp_df = pd.read_csv(file_path)
                temp_df = temp_df[['Age', 'Gender', 'Chip Time*']]
                temp_df['Chip Time*'] = temp_df['Chip Time*'].apply(conv_formatted_time_to_millis)
            elif provider == 'webscorer':
                temp_df = pd.read_csv(file_path)
                temp_df = temp_df[['Age', 'Gender', 'Finish time 			Chip time']]
                temp_df['Finish time 			Chip time'] = temp_df['Finish time 			Chip time'].apply(conv_formatted_time_to_millis)
            else:
                continue

            temp_df.columns = ['age', 'gender', 'time']
            temp_df['slug'] = race_slug
            temp_df['date'] = '-'.join(os.path.basename(file_path).split('-')[-3:]).split('.')[0]
            results_df = pd.concat([results_df, temp_df])

results_df

raw/marathons/tucson-marathon-2023-12-10.json
raw/marathons/tucson-marathon-2021-12-04.json
raw/marathons/tucson-marathon-2022-12-10.json
raw/marathons/sundance-to-spearfish-2022-09-11.json
raw/marathons/sundance-to-spearfish-2024-09-08.json
raw/marathons/sundance-to-spearfish-2023-09-10.json
raw/marathons/pocatello-2024-08-31.json
raw/marathons/pocatello-2022-09-03.json
raw/marathons/pocatello-2021-09-04.json
raw/marathons/pocatello-2023-09-02.json
raw/marathons/leading-ladies-2023-08-20.json
raw/marathons/leading-ladies-2024-08-18.json
raw/marathons/leading-ladies-2022-08-21.json
raw/marathons/hawaii-bird-conservation-2021-12-19.json
raw/marathons/hawaii-bird-conservation-2023-12-17.json
raw/marathons/hawaii-bird-conservation-2022-12-18.json
raw/marathons/utah-valley-marathon-2023-06-03.csv
raw/marathons/utah-valley-marathon-2024-06-01.csv
raw/marathons/utah-valley-marathon-2022-06-04.csv
raw/marathons/tunnel-vision-2022-08-14.csv
raw/marathons/tunnel-vision-2024-08-11.csv
raw/marath

KeyError: "None of [Index(['Age', 'Gender', 'Chip Time*'], dtype='object')] are in the [columns]"

In [82]:
fields['"CUT-OFF TIME"*'] = pd.to_timedelta('00:0' + fields['"CUT-OFF TIME"*'])
fields[['QUALIFIERS NOT ACCEPTED', 'FIELD SIZE']] = fields[['QUALIFIERS NOT ACCEPTED', 'FIELD SIZE']].apply(lambda col: col.str.replace(',', '').astype(int))

In [83]:
standards[['MEN', 'WOMEN']] = standards[['MEN', 'WOMEN']].apply(lambda col: pd.to_timedelta(col.str.replace('hrs', 'hr')))

In [84]:
downhill_marathons['Net Downhill'] = downhill_marathons['Net Downhill'].str.replace(',', '').astype(int)
downhill_marathons['Category'] = downhill_marathons['Net Downhill'].apply(lambda x: '3,000-5,999' if 3000 <= x < 6000 else '1,500-2,999')

In [77]:
fields_to_save = fields.copy()
fields_to_save['"CUT-OFF TIME"*'] = fields_to_save['"CUT-OFF TIME"*'].apply(lambda x: x.seconds)
fields_to_save.columns = ['Year', 'Field Size', 'Cutoff Time (s)', 'Qualifiers Not Accepted']
fields_to_save.to_csv('data/fields.csv', index=False)

standards_to_save = standards.copy()
standards_to_save[['MEN', 'WOMEN']] = standards_to_save[['MEN', 'WOMEN']].apply(lambda col: col.astype('str').apply(lambda x: x.split(' ')[2][1:]))
standards_to_save.columns = ['Age Group', 'Men', 'Women']
standards_to_save.to_csv('data/standards.csv', index=False)

downhill_marathons_to_save = downhill_marathons.copy()
downhill_marathons_to_save.sort_values(by='Net Downhill', ascending=False, inplace=True)
downhill_marathons_to_save.columns = ['Name', 'Net Downhill (ft)', 'Year Established', 'Location', 'Category']
downhill_marathons_to_save.to_csv('data/downhill-marathons.csv', index=False)