In [124]:
import numpy as np
import pandas as pd
import re

# Source data (Kaggle, "The big dataset of ultra-marathon running"):
# https://www.kaggle.com/datasets/aiaiaidavid/the-big-dataset-of-ultra-marathon-running/

# low_memory=False has pandas parse the file in one pass instead of in
# internal chunks, so each column gets a single, consistently inferred dtype.
races = pd.read_csv(
    'TWO_CENTURIES_OF_UM_RACES.csv',
    low_memory=False,
)
races.head()

Unnamed: 0,Year of event,Event dates,Event name,Event distance/length,Event number of finishers,Athlete performance,Athlete club,Athlete country,Athlete year of birth,Athlete gender,Athlete age category,Athlete average speed,Athlete ID
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2
3,2018,06.01.2018,Selva Costera (CHI),50km,22,5:34:13 h,Columbia,ARG,1976.0,M,M40,8.976,3
4,2018,06.01.2018,Selva Costera (CHI),50km,22,5:54:14 h,Baguales Trail,CHI,1992.0,M,M23,8.469,4


In [127]:
# Original CSV header -> snake_case column name used in the rest of the notebook.
COLUMN_RENAMES = {
    'Year of event': 'event_year',
    'Event dates': 'event_dates',
    'Event name': 'event_name',
    'Event distance/length': 'event_distance_or_length',
    'Event number of finishers': 'event_num_finishers',
    'Athlete performance': 'athlete_performance',
    'Athlete club': 'athlete_club',
    'Athlete country': 'athlete_country',
    'Athlete year of birth': 'athlete_birth_year',
    'Athlete gender': 'athlete_gender',
    'Athlete age category': 'athlete_age_category',
    'Athlete average speed': 'athlete_avg_speed',
    'Athlete ID': 'athlete_id',
}

# rename() returns a new frame; headers missing from the mapping are left as-is.
races = races.rename(columns=COLUMN_RENAMES)
races.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7461195 entries, 0 to 7461194
Data columns (total 13 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   event_year                int64  
 1   event_dates               object 
 2   event_name                object 
 3   event_distance_or_length  object 
 4   event_num_finishers       int64  
 5   athlete_performance       object 
 6   athlete_club              object 
 7   athlete_country           object 
 8   athlete_birth_year        float64
 9   athlete_gender            object 
 10  athlete_age_category      object 
 11  athlete_avg_speed         object 
 12  athlete_id                int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 740.0+ MB


In [128]:
# Flag rows whose distance text contains 'Etappen' (German for "stages").
# regex=False: the needle is a plain substring, not a pattern.
# na=False: rows with missing distance text become False instead of NaN, so the
# column stays strictly boolean and can be used directly as a mask.
races['stage_race'] = races['event_distance_or_length'].str.contains(
    'Etappen', regex=False, na=False
)

In [129]:
def extract_dates(datestring):
    """Normalise a raw ``event_dates`` value to ``'start-end'`` form.

    Every run of digits in ``datestring`` is collected in order.  The last
    three groups always make up the end date.  The start date takes its first
    component from the first group and borrows any component it does not spell
    out itself from the end date.  Components keep the order they have in the
    input.  Examples::

        '06.01.2018'            -> '06/01/2018-06/01/2018'
        '06.-07.01.2018'        -> '06/01/2018-07/01/2018'
        '28.02.-01.03.2018'     -> '28/02/2018-01/03/2018'
        '31.12.2017-01.01.2018' -> '31/12/2017-01/01/2018'

    Parameters
    ----------
    datestring : str
        Raw date text for a single event.

    Returns
    -------
    str or None
        ``'<start>-<end>'`` with ``/`` between the components of each date, or
        ``None`` when ``datestring`` is not a string (e.g. a NaN cell) or holds
        fewer than three numeric groups.  Returning ``None`` lets a
        ``Series.apply`` over a whole column finish instead of aborting on the
        first unparseable row.
    """
    if not isinstance(datestring, str):
        return None

    numbers = re.findall(r'\d+', datestring)
    if len(numbers) < 3:
        # Not enough components to form even a single complete date.
        return None

    end_first, end_second, end_third = numbers[-3:]

    # Five or more groups: the start date carries its own second component.
    start_second = numbers[1] if len(numbers) >= 5 else end_second
    # Exactly six groups: two complete dates, so the start has its own third
    # component as well.
    start_third = numbers[2] if len(numbers) == 6 else end_third

    start_date = '/'.join((numbers[0], start_second, start_third))
    end_date = '/'.join((end_first, end_second, end_third))
    return start_date + '-' + end_date

# Build one 'start-end' string per row, then split it on the dash into the
# two new columns.
event_dates = races['event_dates'].apply(extract_dates)
races[['start_date', 'end_date']] = event_dates.str.split(pat='-', expand=True)

races.head()

Unnamed: 0,event_year,event_dates,event_name,event_distance_or_length,event_num_finishers,athlete_performance,athlete_club,athlete_country,athlete_birth_year,athlete_gender,athlete_age_category,athlete_avg_speed,athlete_id,stage_race,start_date,end_date
0,2018,06.01.2018,Selva Costera (CHI),50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0,False,06/01/2018,06/01/2018
1,2018,06.01.2018,Selva Costera (CHI),50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1,False,06/01/2018,06/01/2018
2,2018,06.01.2018,Selva Costera (CHI),50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2,False,06/01/2018,06/01/2018
3,2018,06.01.2018,Selva Costera (CHI),50km,22,5:34:13 h,Columbia,ARG,1976.0,M,M40,8.976,3,False,06/01/2018,06/01/2018
4,2018,06.01.2018,Selva Costera (CHI),50km,22,5:54:14 h,Baguales Trail,CHI,1992.0,M,M23,8.469,4,False,06/01/2018,06/01/2018


In [130]:
# Event names end with a three-letter upper-case country code in parentheses,
# e.g. 'Selva Costera (CHI)'.  One end-anchored pattern is used both to read the
# code and to strip it, so the two columns always agree and an earlier
# three-character parenthetical such as '(50K)' is never mistaken for a country.
country_suffix = r'\s*\(([A-Z]{3})\)\s*$'

country_codes = races['event_name'].str.extract(country_suffix, expand=False)
if 'event_country' in races.columns:
    # Cell is being re-run: the suffix was already stripped from event_name, so
    # keep the codes captured on the earlier run instead of overwriting with NaN.
    country_codes = country_codes.fillna(races['event_country'])
races['event_country'] = country_codes

# Remove only the suffix (and surrounding whitespace).  Names without a country
# suffix are kept unchanged rather than being replaced by NaN.
races['event_name'] = (
    races['event_name']
    .str.replace(country_suffix, '', regex=True)
    .str.strip()
)
races.head()

Unnamed: 0,event_year,event_dates,event_name,event_distance_or_length,event_num_finishers,athlete_performance,athlete_club,athlete_country,athlete_birth_year,athlete_gender,athlete_age_category,athlete_avg_speed,athlete_id,stage_race,start_date,end_date,event_country
0,2018,06.01.2018,Selva Costera,50km,22,4:51:39 h,Tnfrc,CHI,1978.0,M,M35,10.286,0,False,06/01/2018,06/01/2018,CHI
1,2018,06.01.2018,Selva Costera,50km,22,5:15:45 h,Roberto Echeverría,CHI,1981.0,M,M35,9.501,1,False,06/01/2018,06/01/2018,CHI
2,2018,06.01.2018,Selva Costera,50km,22,5:16:44 h,Puro Trail Osorno,CHI,1987.0,M,M23,9.472,2,False,06/01/2018,06/01/2018,CHI
3,2018,06.01.2018,Selva Costera,50km,22,5:34:13 h,Columbia,ARG,1976.0,M,M40,8.976,3,False,06/01/2018,06/01/2018,CHI
4,2018,06.01.2018,Selva Costera,50km,22,5:54:14 h,Baguales Trail,CHI,1992.0,M,M23,8.469,4,False,06/01/2018,06/01/2018,CHI


In [134]:
# Keep only events held in the USA, from 1990 onward.
us_races = races.loc[
    races['event_country'].eq('USA') & races['event_year'].ge(1990)
]
us_races.head()


Unnamed: 0,event_year,event_dates,event_name,event_distance_or_length,event_num_finishers,athlete_performance,athlete_club,athlete_country,athlete_birth_year,athlete_gender,athlete_age_category,athlete_avg_speed,athlete_id,stage_race,start_date,end_date,event_country
55,2018,06.01.2018,Yankee Springs 50 Mile Winter Challenge,50mi,9,9:53:05 h,"*Middleville, MI",USA,1983.0,M,M23,8.141,55,False,06/01/2018,06/01/2018,USA
56,2018,06.01.2018,Yankee Springs 50 Mile Winter Challenge,50mi,9,11:09:35 h,"*Waterloo, ON",CAN,1977.0,F,W40,7.211,56,False,06/01/2018,06/01/2018,USA
57,2018,06.01.2018,Yankee Springs 50 Mile Winter Challenge,50mi,9,11:33:00 h,"*Kitchener, ON",CAN,1976.0,M,M40,6.967,57,False,06/01/2018,06/01/2018,USA
58,2018,06.01.2018,Yankee Springs 50 Mile Winter Challenge,50mi,9,11:38:17 h,"*Utica, MI",USA,1986.0,M,M23,6.914,58,False,06/01/2018,06/01/2018,USA
59,2018,06.01.2018,Yankee Springs 50 Mile Winter Challenge,50mi,9,11:56:35 h,"*Grass Lake, MI",USA,1988.0,M,M23,6.738,59,False,06/01/2018,06/01/2018,USA
