In [105]:
'''
I want to create a model where you can input data like age, gender, etc., and I will then be
able to give a prediction of what kind of ultra runner that you would be.

1. I want to display two figures that show a distribution line where y is the probability that you
will enter a race of x length, for both types of races, run for the fastest time and most distance.
2. I also want to show how many races this person would likely complete, again with a distribution line.
3. I also want to show a distribution showing the likelihood that a person would get 1st, 2nd... 14th, 15th place in a race.

I think that I want to make it so people can move a slider for distance and it will show probability distributions for the above, based on that race length

independent variables: age, gender
dependent variables: victoriousness, average race length, amount of races completed, 
'''

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from tqdm.notebook import tqdm

pd.set_option('display.float_format', '{:.2f}'.format)

df = pd.read_csv('../data/ultra_marathons_data_cleaned.csv', low_memory = False)
df.drop(['age_category', 'average_speed', 'no_of_finishers', 'athlete_country', 'athlete_club','start_date', 'end_date'], axis = 1, inplace = True)
df.head()

Unnamed: 0,year_of_event,event_name,distance_or_length,total_distance,birth_year,gender,athlete_id,race_metric,fastest_time
0,2018,Selva Costera (CHI),31.07,0.0,1978,M,0,mi,4:51:39
1,2018,Selva Costera (CHI),31.07,0.0,1981,M,1,mi,5:15:45
2,2018,Selva Costera (CHI),31.07,0.0,1987,M,2,mi,5:16:44
3,2018,Selva Costera (CHI),31.07,0.0,1976,M,3,mi,5:34:13
4,2018,Selva Costera (CHI),31.07,0.0,1992,M,4,mi,5:54:14


In [94]:
df['place'] = np.nan

def sort_and_place(group, metric):
    if metric == 'mi':
        group = group.sort_values(by = 'fastest_time', ascending = True)
    elif metric == 'hours':
        group = group.sort_values(by = 'distance_or_length', ascending = False)
    group['place'] = np.arange(1, len(group) + 1, dtype = int)
    return group

for metric in tqdm(['mi', 'hours']):
    grouped_df = df.groupby('event_name', as_index = False).apply(lambda group: sort_and_place(group, metric))
    grouped_df.reset_index(drop = True, inplace = True)
    df.update(grouped_df)

df.sample(10)

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,year_of_event,event_name,distance_or_length,total_distance,birth_year,gender,athlete_id,race_metric,fastest_time,place
2955537,2018,Janda Baik Ultra 50 Km (MAS),31.07,0.0,1988,M,131705,mi,11:23:02,392.0
6821275,2014,Ultra Trail Tour du Mont Blanc (UTMB) (FRA),104.39,0.0,1965,M,369380,mi,45:19:48,10746.0
1736156,2019,Dalat Ultra Trail (VIE),43.5,0.0,1983,M,119702,mi,16:25:26,163.0
3649779,2010,Les 100 km de Vendée - Champ. Nationaux (FRA),62.14,0.0,1963,F,501427,mi,10:42:28,104.0
7374377,2012,Yukihashi to Beppu 100 km (JPN),62.14,0.0,1957,M,1197233,mi,21:57:50,28269.0
5186809,2012,Soaring Eagle Park 50K (USA),31.07,0.0,1986,M,1348870,mi,4:47:32,26.0
2233862,2018,Glacial Esker 40 Miles (USA),40.0,0.0,1994,F,85494,mi,11:14:35,95.0
2446709,2013,Green Lakes Endurance Run 50 km (USA),31.07,0.0,1955,M,1218241,mi,8:09:26,289.0
6249269,2017,Transvulcania 2017 (ESP),45.36,0.0,1968,M,64638,mi,12:25:22,1481.0
2521724,2019,GutsMuths-Rennsteiglauf (GER),5.59,0.0,1978,M,188118,mi,10:01:19,43942.0


In [95]:
df['age'] = df['year_of_event'] - df['birth_year']
# df.drop(['year_of_event', 'birth_year'], axis = 1, inplace = True)
df.head()

Unnamed: 0,year_of_event,event_name,distance_or_length,total_distance,birth_year,gender,athlete_id,race_metric,fastest_time,place,age
0,2019,#RC87 Sportparkchallenge (BEL),6.0,75.85,1978,M,64633,hours,0,1.0,41
1,2019,#RC87 Sportparkchallenge (BEL),6.0,56.38,1975,M,64706,hours,0,2.0,44
2,2019,#RC87 Sportparkchallenge (BEL),6.0,53.3,1966,M,64705,hours,0,3.0,53
3,2019,#RC87 Sportparkchallenge (BEL),6.0,69.7,1966,M,28518,hours,0,4.0,53
4,2019,#RC87 Sportparkchallenge (BEL),6.0,65.6,1978,M,64647,hours,0,5.0,41


In [96]:
df_hours = df[(df['race_metric'] == 'hours') & (df['age'] > 12) & (df['age'] < 100)].copy().drop('race_metric', axis = 1)
df_miles = df[(df['race_metric'] == 'mi') & (df['age'] > 12) & (df['age'] < 100)].copy().drop('race_metric', axis = 1)

In [97]:
df_miles = df_miles[pd.to_timedelta(df_miles['fastest_time']) > pd.to_timedelta(1, unit = 'h')]
df_miles['fastest_time'] = df_miles.apply(lambda row: int(pd.to_timedelta(row['fastest_time']).total_seconds()), axis = 1)
df_miles.describe()

Unnamed: 0,year_of_event,distance_or_length,total_distance,birth_year,athlete_id,fastest_time,place,age
count,6383624.0,6383624.0,6383624.0,6383624.0,6383624.0,6383624.0,6383624.0,6383624.0
mean,2012.08,45.51,0.0,1969.92,527485.56,43519.46,16729.32,42.16
std,9.77,35.9,0.0,13.07,467877.46,40054.66,43568.09,9.86
min,1837.0,0.0,0.0,1791.0,0.0,3615.0,1.0,13.0
25%,2010.0,31.07,0.0,1962.0,129856.0,24464.0,181.0,35.0
50%,2015.0,38.53,0.0,1971.0,370783.5,34464.0,818.0,42.0
75%,2018.0,55.92,0.0,1979.0,942810.0,47741.0,5130.0,49.0
max,2022.0,3554.0,0.0,2009.0,1641167.0,3020399.0,263574.0,99.0


In [98]:
df_miles.drop(['year_of_event', 'total_distance', 'birth_year', 'athlete_id', 'event_name', ], axis = 1, inplace = True)
df_hours.drop(['year_of_event', 'total_distance', 'birth_year', 'athlete_id', 'event_name'], axis = 1, inplace = True)

In [102]:
df_miles

Unnamed: 0,distance_or_length,gender,fastest_time,place,age
19,100.00,M,82385,1.00,39
20,100.00,M,82385,2.00,43
21,100.00,M,97920,3.00,36
22,100.00,M,97923,4.00,46
23,100.00,M,100380,5.00,57
...,...,...,...,...,...
7461189,50.00,M,30754,7.00,36
7461190,40.00,M,16604,1.00,37
7461191,40.00,M,16729,2.00,28
7461193,40.00,M,28650,4.00,41


In [108]:
df_miles[['age', 'gender', 'place', 'distance_or_length']].isnull().sum()

age                   0
gender                0
place                 0
distance_or_length    0
dtype: int64

In [154]:
# Most miles races

df_hours = df_hours[df_hours['gender'].isin(['M', 'F'])]
x_h = df_hours[['age', 'gender']]
x_h['gender'] = x_h['gender'].map({'M': 0, 'F': 1})
y_h = df_hours[['distance_or_length', 'place']]

x_train_h, x_test_h, y_train_h, y_test_h = train_test_split(x_h, y_h, train_size = 0.1)
hour_model = LinearRegression()

# Fastest time races
df_miles = df_miles[df_miles['gender'].isin(['M', 'F'])]
x_t = df_miles[['age', 'gender']]

x_t['gender'] = x_t['gender'].map({'M': 0, 'F': 1})
y_t = df_miles[['distance_or_length', 'place']]

x_train_t, x_test_t, y_train_t, y_test_t = train_test_split(x_t, y_t, train_size = 0.1)
mile_model = LinearRegression()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_h['gender'] = x_h['gender'].map({'M': 0, 'F': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_t['gender'] = x_t['gender'].map({'M': 0, 'F': 1})


In [155]:
hour_model.fit(x_train_h, y_train_h)
mile_model.fit(x_train_t, y_train_t)

In [158]:
yt_pred = mile_model.predict(x_test_t)
(yt_pred == y_test_t).sum()

distance_or_length    0
place                 0
dtype: int64

In [160]:
yh_pred = hour_model.predict(x_test_h)
(yh_pred == y_test_h).sum()

distance_or_length    0
place                 0
dtype: int64

In [161]:
mse_h = mean_squared_error(y_test_h, yh_pred)
r2_h = r2_score(y_test_h, yh_pred)

print(f"Mean Squared Error: {mse_h}")
print(f"R^2 Score: {r2_h}")

Mean Squared Error: 101236.75345186668
R^2 Score: 0.0017126431047578827


In [162]:
mse_t = mean_squared_error(y_test_t, yt_pred)
r2_t = r2_score(y_test_t, yt_pred)

print(f"Mean Squared Error: {mse_t}")
print(f"R^2 Score: {r2_t}")

Mean Squared Error: 943032293.5917346
R^2 Score: 0.004265618762760159


In [164]:
hour_model.predict([[28], [0]])



ValueError: X has 1 features, but LinearRegression is expecting 2 features as input.