In [1]:
'''
I want to create a model that takes inputs such as age and gender and predicts
what kind of ultra runner you would be.

1. I want to display two figures, each showing a distribution line where y is the probability that you
will enter a race of length x, one for each type of race: races run for the fastest time and races run for the most distance.
2. I also want to show how many races this person would likely complete, again with a distribution line.
3. I also want to show a distribution showing the likelihood that a person would get 1st, 2nd... 14th, 15th place in a race.

I think that I want to make it so people can move a slider for distance and it will show probability distributions for the above, based on that race length

independent variables: age, gender
dependent variables: victoriousness, average race length, number of races completed
'''

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm

# Columns that are not needed for the modelling below.
UNUSED_COLUMNS = [
    'age_category',
    'average_speed',
    'no_of_finishers',
    'athlete_country',
    'athlete_club',
    'year_of_event',
    'start_date',
    'end_date',
]

# Read the cleaned results (first CSV column becomes the index) and discard the unused columns.
df = pd.read_csv('../data/ultra_marathons_data_cleaned.csv', index_col = 0, low_memory = False)
df = df.drop(columns = UNUSED_COLUMNS)
df.head()

Unnamed: 0,event_name,distance_or_length,total_distance,birth_year,gender,average_speed,athlete_id,race_metric,fastest_time
0,Selva Costera (CHI),50.0,0.0,1978,M,10.286,0,km,4:51:39
1,Selva Costera (CHI),50.0,0.0,1981,M,9.501,1,km,5:15:45
2,Selva Costera (CHI),50.0,0.0,1987,M,9.472,2,km,5:16:44
3,Selva Costera (CHI),50.0,0.0,1976,M,8.976,3,km,5:34:13
4,Selva Costera (CHI),50.0,0.0,1992,M,8.469,4,km,5:54:14


In [3]:
# Normalise race units: kilometre races are expressed in miles and
# multi-day races are expressed in hours.

KM_PER_MILE = 1.60934
HOURS_PER_DAY = 24

# km -> mi
km_mask = df['race_metric'].eq('km')
df.loc[km_mask, 'distance_or_length'] = df.loc[km_mask, 'distance_or_length'].div(KM_PER_MILE)
df.loc[km_mask, 'race_metric'] = 'mi'

# days -> hours
day_mask = df['race_metric'].eq('days')
df.loc[day_mask, 'distance_or_length'] = df.loc[day_mask, 'distance_or_length'].mul(HOURS_PER_DAY)
df.loc[day_mask, 'race_metric'] = 'hours'

df.head(20)

Unnamed: 0,event_name,distance_or_length,total_distance,birth_year,gender,average_speed,athlete_id,race_metric,fastest_time
0,Selva Costera (CHI),31.068637,0.0,1978,M,10.286,0,mi,4:51:39
1,Selva Costera (CHI),31.068637,0.0,1981,M,9.501,1,mi,5:15:45
2,Selva Costera (CHI),31.068637,0.0,1987,M,9.472,2,mi,5:16:44
3,Selva Costera (CHI),31.068637,0.0,1976,M,8.976,3,mi,5:34:13
4,Selva Costera (CHI),31.068637,0.0,1992,M,8.469,4,mi,5:54:14
5,Selva Costera (CHI),31.068637,0.0,1974,M,7.792,5,mi,6:25:01
6,Selva Costera (CHI),31.068637,0.0,1979,F,7.732,6,mi,6:28:00
7,Selva Costera (CHI),31.068637,0.0,1967,F,7.645,7,mi,6:32:24
8,Selva Costera (CHI),31.068637,0.0,1985,M,7.516,8,mi,6:39:08
9,Selva Costera (CHI),31.068637,0.0,1976,M,7.404,9,mi,6:45:11


In [4]:
# Create the 'place' column up front; every row is NaN until a rank is assigned to it.
df['place'] = np.nan

def sort_and_place(group, metric):
    """Rank the rows of one event and return them with a 1-based 'place' column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to rank. Must contain 'fastest_time' when ``metric`` is 'mi'
        and 'total_distance' when ``metric`` is 'hours'.
    metric : str
        'mi'    -> fixed-distance race, shortest finishing time is place 1.
        'hours' -> fixed-time race, largest covered distance is place 1.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``group`` (original index labels preserved) with
        'place' set to 1..len(group). The input frame is not modified.

    Raises
    ------
    ValueError
        If ``metric`` is neither 'mi' nor 'hours'.
    """
    if metric == 'mi':
        # 'fastest_time' holds strings such as '4:51:39'; sorting the raw text
        # would put '10:00:00' ahead of '4:51:39', so order by the parsed
        # duration instead. Unparseable values become NaT and are ranked last.
        ranked = group.sort_values(
            by = 'fastest_time',
            key = lambda times: pd.to_timedelta(times, errors = 'coerce'),
            ascending = True,
            kind = 'stable',
        )
    elif metric == 'hours':
        # For timed races 'distance_or_length' is the race duration, so it
        # cannot separate finishers; rank by the distance covered instead.
        # NOTE(review): assumes 'total_distance' is the distance covered in a
        # timed race — confirm against the cleaning step.
        ranked = group.sort_values(by = 'total_distance', ascending = False, kind = 'stable')
    else:
        raise ValueError(f"unsupported metric {metric!r}; expected 'mi' or 'hours'")

    ranked['place'] = np.arange(1, len(ranked) + 1)
    return ranked

# Rank finishers within each event, one race type per pass.
# Only rows whose race_metric matches the current metric are ranked, so the
# 'hours' pass cannot overwrite places assigned during the 'mi' pass. Places
# are written back by index label into the 'place' column only, so no other
# column of df is modified and rows are never re-ordered or mixed.
for metric in tqdm(['mi', 'hours']):
    metric_rows = df.loc[df['race_metric'] == metric]
    placed_groups = [
        sort_and_place(group, metric)['place']
        for _, group in metric_rows.groupby('event_name')
    ]
    if not placed_groups:
        # No rows of this race type (or none with an event name): nothing to rank.
        continue
    placed = pd.concat(placed_groups)
    # NOTE(review): assumes df's index labels are unique — confirm.
    df.loc[placed.index, 'place'] = placed

df.sample(10)

  0%|          | 0/2 [00:00<?, ?it/s]