In [5]:
import pandas as pd
import os
import json
import glob
import re

fields = pd.read_json('raw/fields-2012_2025.json')
standards = pd.read_json('raw/standards-2020_2025.json')
downhill_marathons = pd.read_csv('raw/downhill-marathons.csv')

In [6]:
athlinks_slugs = ['tucson-marathon', 'sundance-to-spearfish', 'pocatello', 'leading-ladies', 'jack-and-kills', 'hawaii-bird-conservation']
raceentry_slugs = ['utah-valley-marathon']
ultrasignup_slugs = ['tunnel-vision', 'tunnel-light', 'light-at-the-end-of-the-tunnel']
chronokeep_slugs = ['super', 'cascade-express']
webscorer_slugs = ['sun', 'east-canyon']
runsignup_slugs = ['runtastic-nebo']
brooksee_slugs = ['revel-white-mountains', 'revel-mt-charleston', 'revel-big-bear', 'revel-big-cottonwood', 'deseret-news', 'bears-ears']

race_slugs = {
    'athlinks': athlinks_slugs,
    'raceentry': raceentry_slugs,
    'ultrasignup': ultrasignup_slugs,
    'chronokeep': chronokeep_slugs,
    'webscorer': webscorer_slugs,
    'runsignup': runsignup_slugs,
    'brooksee': brooksee_slugs
}

In [7]:
def conv_formatted_time_to_millis(formatted_time: str):
    """Convert a formatted time string to milliseconds."""
    if formatted_time == 'DNF' or formatted_time == '---' or pd.isna(formatted_time):
        return None
    parts = formatted_time.split(':')
    if len(parts) == 3:  # HH:MM:SS
        hours, minutes, seconds = map(float, parts)
        return (hours * 3600 + minutes * 60 + seconds) * 1000
    elif len(parts) == 2:  # MM:SS
        minutes, seconds = map(float, parts)
        return (minutes * 60 + seconds) * 1000
    else:
        raise ValueError(f'Invalid time format: {formatted_time}')

In [None]:
def split_name_affiliation(name_affiliation: str):
    print(name_affiliation)
    parts = re.split(r'([a-z0-9][A-Z])', name_affiliation)
    middle_part = parts[1]
    return [parts[0].strip() + middle_part[0], middle_part[1] + parts[2].strip()]

In [18]:
results_df = pd.DataFrame(columns=['slug', 'date', 'name', 'time', 'gender', 'age', 'city', 'state'])

for provider in race_slugs:
    for race_slug in race_slugs[provider]:
        file_paths = glob.glob(f'raw/marathons/{race_slug}-*.{"json" if provider == "athlinks" else "csv"}')

        for file_path in file_paths:
            date = '-'.join(os.path.basename(file_path).split('-')[-3:]).split('.')[0]
            if provider == 'athlinks':
                with open(file_path, 'r') as file:
                    data = json.load(file)
                results = data['intervals'][0]['results']
                temp_df = pd.DataFrame(results)
                temp_df['locality'] = temp_df['location'].apply(lambda x: x.get('locality', None) if x is not None else None)
                temp_df['region'] = temp_df['location'].apply(lambda x: x.get('region', None) if x is not None else None)
                temp_df = temp_df[['age', 'gender', 'chipTimeInMillis', 'displayName', 'locality', 'region']]
            elif provider == 'raceentry' or provider == 'runsignup' or provider == 'brooksee':
                temp_df = pd.read_csv(file_path)
                if provider == 'raceentry':
                    temp_df['name'] = temp_df['First Name'] + ' ' + temp_df['Last Name']
                    temp_df['city'] = temp_df['City']
                    temp_df['state'] = temp_df['State']
                elif provider == 'runsignup':
                    temp_df['name'] = ' '.join(temp_df['Name'].split()[1:])
                    temp_df['city'] = temp_df['City']
                    temp_df['state'] = temp_df['State']
                elif provider == 'brooksee':
                    temp_df['name'] = temp_df['First Name'] + ' ' + temp_df['Last Name']
                    temp_df['city'] = None
                    temp_df['state'] = None
                temp_df = temp_df[['Age', 'Gender', 'Chip Time', 'name', 'city', 'state']]
                temp_df['Chip Time'] = temp_df['Chip Time'].apply(conv_formatted_time_to_millis)
            elif provider == 'ultrasignup' or (race_slug == 'super' and date.startswith('2022')) or (race_slug == 'cascade-express' and date.startswith('2023')):
                temp_df = pd.read_csv(file_path)
                temp_df['name'] = temp_df['first_name'] + ' ' + temp_df['last_name']
                temp_df = temp_df[['age', 'gender', 'time_millis', 'name', 'city', 'state']]
            elif provider == 'chronokeep':
                temp_df = pd.read_csv(file_path)
                temp_df['city'] = None
                temp_df['state'] = None
                temp_df = temp_df[['Age', 'Gender', 'Chip Time*', 'Name', 'city', 'state']]
                temp_df['Chip Time*'] = temp_df['Chip Time*'].apply(conv_formatted_time_to_millis)
            elif provider == 'webscorer':
                temp_df = pd.read_csv(file_path)
                name_affiliations = temp_df['Name Affiliation'].apply(split_name_affiliation)
                print(name_affiliations)
                temp_df = temp_df[['Age', 'Gender', 'Finish time 			Chip time']]
                temp_df['Finish time 			Chip time'] = temp_df['Finish time 			Chip time'].apply(conv_formatted_time_to_millis)
            else:
                continue

            temp_df.columns = ['age', 'gender', 'time', 'name', 'city', 'state']
            temp_df['slug'] = race_slug
            temp_df['date'] = date
            results_df = pd.concat([results_df, temp_df])

results_df

Drew GalahanHarlowton, MT
Dan ShellySalt Lake City, UT
Ashley PaulsonSaint GeorgE, UT
Josh HowardOgden, UT
Klark LarsenProvo, UT
Jared BlackClearfield, UT
Yiqiao ""Joey"" ZhaoCedar City, UT
Samuel BishopHolladay, UT
John BlackClearfield, UT
Maxwell HillMoab, UT
Logan LefevreFillmore, UT
Brett WhiteExport, PA
Jonathan SalinasSaint GeorgE, UT
Mark FloresSan Dimas, CA
Krista CaseyKaysville, UT
Isaiah LangeEvanston, WY
Dave HallWashington, UT
Jarom YoungWashington, UT
Marc NosalEden Prairie, MN
Andreas LehmannOrem, UT
Eliza JorgensenFarmington, UT
Anna NelsonDraper, UT
Terrika TurnerPayson, UT
Dustin PoulsenProvidence, UT
Jim BengtsonEden Prairie, MN
Brent RoweSaint GeorgE, UT
Kayla PerryDraper, UT
Peter WallaceSalt Lake City, UT
Jacob ThomasProvo, UT
Jacob RossProvo, UT
Sarah HeapsSpanish Fork, UT
Devin KentCarmel, IN
Karolina MoleWest Jordan, UT
Jordan PoulsenOgden, UT
Jo Whitney-Swensen2Talkeetna, AK


IndexError: list index out of range

In [82]:
fields['"CUT-OFF TIME"*'] = pd.to_timedelta('00:0' + fields['"CUT-OFF TIME"*'])
fields[['QUALIFIERS NOT ACCEPTED', 'FIELD SIZE']] = fields[['QUALIFIERS NOT ACCEPTED', 'FIELD SIZE']].apply(lambda col: col.str.replace(',', '').astype(int))

In [83]:
standards[['MEN', 'WOMEN']] = standards[['MEN', 'WOMEN']].apply(lambda col: pd.to_timedelta(col.str.replace('hrs', 'hr')))

In [84]:
downhill_marathons['Net Downhill'] = downhill_marathons['Net Downhill'].str.replace(',', '').astype(int)
downhill_marathons['Category'] = downhill_marathons['Net Downhill'].apply(lambda x: '3,000-5,999' if 3000 <= x < 6000 else '1,500-2,999')

In [77]:
fields_to_save = fields.copy()
fields_to_save['"CUT-OFF TIME"*'] = fields_to_save['"CUT-OFF TIME"*'].apply(lambda x: x.seconds)
fields_to_save.columns = ['Year', 'Field Size', 'Cutoff Time (s)', 'Qualifiers Not Accepted']
fields_to_save.to_csv('data/fields.csv', index=False)

standards_to_save = standards.copy()
standards_to_save[['MEN', 'WOMEN']] = standards_to_save[['MEN', 'WOMEN']].apply(lambda col: col.astype('str').apply(lambda x: x.split(' ')[2][1:]))
standards_to_save.columns = ['Age Group', 'Men', 'Women']
standards_to_save.to_csv('data/standards.csv', index=False)

downhill_marathons_to_save = downhill_marathons.copy()
downhill_marathons_to_save.sort_values(by='Net Downhill', ascending=False, inplace=True)
downhill_marathons_to_save.columns = ['Name', 'Net Downhill (ft)', 'Year Established', 'Location', 'Category']
downhill_marathons_to_save.to_csv('data/downhill-marathons.csv', index=False)