# Goal: Answer the following questions
### Which countries offer the longest races?
### Which races take the longest to finish on average?

Imports

In [146]:
import pandas as pd
import numpy as np
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from pathlib import Path

In [147]:
file_path = kagglehub.dataset_download("aiaiaidavid/the-big-dataset-of-ultra-marathon-running")

print("Path to dataset files:", file_path)

Path to dataset files: /Users/ds/.cache/kagglehub/datasets/aiaiaidavid/the-big-dataset-of-ultra-marathon-running/versions/2


In [155]:
df = pd.read_csv(Path(file_path, "TWO_CENTURIES_OF_UM_RACES.csv"))
df=df.dropna()
df=df.drop_duplicates()
df.sample(10)

  df = pd.read_csv(Path(file_path, "TWO_CENTURIES_OF_UM_RACES.csv"))


Unnamed: 0,Year of event,Event dates,Event name,Event distance/length,Event number of finishers,Athlete performance,Athlete club,Athlete country,Athlete year of birth,Athlete gender,Athlete age category,Athlete average speed,Athlete ID
6948694,1981,12.-13.06.1981,100 km Lauf Biel (SUI),100km,2998,7:19:00 h,*Killwangen,SUI,1946.0,M,M23,13667.0,1566140
850583,2016,24.-25.06.2016,Gran Trail de Peñalara 110 km (ESP),110km,345,13:36:22 h,Club Tierra Tragame C....,ESP,1975.0,M,M40,8.085,124165
6965701,1982,30.-31.10.1982,Sri Chinmoy 24 Hour Race (USA),24h,18,198.438 km,"*Jamaica, NY",CAN,1957.0,M,M23,8268.0,82988
2449869,2019,05.-06.10.2019,Arkansas Traveller 100 (USA),100mi,100,28:22:28 h,*AR,USA,1963.0,M,M55,5.672,109051
4847760,2010,17.01.2010,P.F. Changs Rock'n'Roll 50K Marathon (USA),50km,148,4:52:00 h,"*Grosse-Île, QC",CAN,1961.0,M,M45,10.274,1067
625035,2018,17.11.2018,Meat Grinder Trail Race (USA),50km,13,4:48:35 h,"*Santa Clara, CA",USA,1980.0,M,M35,10.396,134982
2986269,2021,21.08.2021,Matterhorn Ultraks (SUI),49km,512,8:14:50 h,"*Cheyres, FR",SUI,1994.0,F,W23,5.941,911468
1422268,2017,29.04.2017,Harzquerung Wernigerode-Nordhausen (GER),51km,563,4:28:56 h,LAG Wesertal,GER,1975.0,M,M40,11.378,119937
1517005,2017,17.-18.06.2017,Ultra-trail du Puy Mary Aurillac (UTPMA) (FRA),105km,327,18:27:50 h,Spiridon Aurillac,FRA,1952.0,M,M60,5.687,141230
219175,2018,10.06.2018,Comrades Marathon - Down Run (RSA),90km,16484,10:47:22 h,Benoni Northerns Athle...,RSA,1966.0,M,M50,8.341,122026


In [149]:
df.dtypes

Year of event                  int64
Event dates                   object
Event name                    object
Event distance/length         object
Event number of finishers      int64
Athlete performance           object
Athlete club                  object
Athlete country               object
Athlete year of birth        float64
Athlete gender                object
Athlete age category          object
Athlete average speed         object
Athlete ID                     int64
dtype: object

## Data Cleaning

#### Standardizing 'Event distance/length' column

In [150]:
# Find unique valies for 'Event distance/length'
unique_distances = df['Event distance/length'].unique()
pd.Series(unique_distances)

0                     50km
1                       6h
2                   63.9km
3                     50mi
4                     28mi
               ...        
1542        303km/4Etappen
1543    2935.8mi/64Etappen
1544        344km/5Etappen
1545        330km/5Etappen
1546        355km/5Etappen
Length: 1547, dtype: object

In [151]:
suffixes = df['Event distance/length'].str.split('/').str.get(0).str[-2:]
suffixes.unique().shape

(29,)

In [152]:
## Get event location country from 'Event name' column
df = df[df['Event distance/length'].str.endswith(("mi", "km"))].reset_index()
df['Event distance/length'].unique()

array(['50km', '63.9km', '50mi', ..., '1011km', '50.4mi', '1006km'],
      shape=(1046,), dtype=object)

In [None]:
# Filtering out distances that can't be converted to a number
df = df[~pd.to_numeric(df['Event distance/length'].str[:-2], errors='coerce').isna()]

condition = df['Event distance/length'].str.endswith("km")

# new column that standardizes event distances to miles
df['Event distance (miles)'] = np.where(
        condition,
        pd.to_numeric(df['Event distance/length'].str[:-2]) * 0.621371, 
        pd.to_numeric(df['Event distance/length'].str[:-2])
    ).round(2)

df = df[['Event name', 'Year of event', 'Event distance/length', 'Event distance (miles)']]
df.sample(25)

Unnamed: 0,Event name,Year of event,Event distance/length,Event distance (miles)
149299,La Maxi Race du Lac d'Annecy (FRA),2018,85km,52.82
2158189,Midnight Mountain 50K (USA),2004,50km,31.07
607527,Le 100 km du Spiridon Catalan (FRA),2016,100km,62.14
790658,Course de l'Arc en Ciel (FRA),2017,63km,39.15
1699742,Beaverhead 55K Endurance Run (USA),2022,55km,34.18
2464611,Pistoia-Abetone Ultramarathon (ITA),2008,50km,31.07
2414776,Balaton Szupermaraton Etappe 2 Fonyód - Szigll...,2008,52.9km,32.87
3020070,Trail des Hospitaliers 75 km (FRA),2012,75km,46.6
2880546,Pemberton Trail 50 km (USA),2012,50km,31.07
3804433,100 km Lauf Biel (SUI),1981,100km,62.14
