In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [92]:
# get data
df = pd.read_csv('../data/external/MarathonData.csv')
# set id as index
df.set_index('id', inplace=True)
df

Unnamed: 0_level_0,Marathon,Name,Category,km4week,sp4week,CrossTraining,Wall21,MarathonTime,CATEGORY
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Prague17,Blair MORGAN,MAM,132.8,14.434783,,1.16,2.37,A
2,Prague17,Robert Heczko,MAM,68.6,13.674419,,1.23,2.59,A
3,Prague17,Michon Jerome,MAM,82.7,13.520436,,1.30,2.66,A
4,Prague17,Daniel Or lek,M45,137.5,12.258544,,1.32,2.68,A
5,Prague17,Luk ? Mr zek,MAM,84.6,13.945055,,1.36,2.74,A
...,...,...,...,...,...,...,...,...,...
83,Prague17,Stefano Vegliani,M55,50.0,10.830325,,2.02,3.93,D
84,Prague17,Andrej Madliak,M40,33.6,10.130653,ciclista 3h,1.94,3.93,D
85,Prague17,Yoi Ohsako,M40,55.4,11.043189,,1.94,3.94,D
86,Prague17,Simon Dunn,M45,33.2,11.066667,,2.05,3.95,D


In [93]:
# convert to float all values
df = df.apply(pd.to_numeric, errors='coerce')

In [94]:
# if it does cross training set it to 1 else 0
df['cross_training'] = np.where(df['CrossTraining'].notnull(), 1, 0)
df

Unnamed: 0_level_0,Marathon,Name,Category,km4week,sp4week,CrossTraining,Wall21,MarathonTime,CATEGORY,cross_training
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,132.8,14.434783,,1.16,2.37,,0
2,,,,68.6,13.674419,,1.23,2.59,,0
3,,,,82.7,13.520436,,1.30,2.66,,0
4,,,,137.5,12.258544,,1.32,2.68,,0
5,,,,84.6,13.945055,,1.36,2.74,,0
...,...,...,...,...,...,...,...,...,...,...
83,,,,50.0,10.830325,,2.02,3.93,,0
84,,,,33.6,10.130653,,1.94,3.93,,0
85,,,,55.4,11.043189,,1.94,3.94,,0
86,,,,33.2,11.066667,,2.05,3.95,,0


In [95]:
#  drop rows with wall21 null
df = df.dropna(subset=['Wall21'])

In [96]:
y = df.MarathonTime
X = df[['cross_training', 'km4week', 'sp4week']]

In [97]:
from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
train_X, val_X, train_y, val_y = train_test_split(X, y)

In [98]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
print(val_X)
pred = forest_model.predict(val_X)
print(mean_absolute_error(val_y, pred))

    cross_training  km4week    sp4week
id                                    
56               0     26.9  13.121951
40               0     69.2  10.053269
58               0     36.3  11.647059
65               0     54.2  11.782609
83               0     50.0  10.830325
63               0     48.8  11.665339
68               0     34.3  11.307692
18               0     49.7  14.336538
3                0     82.7  13.520436
73               0     52.3  11.708955
59               0     22.7  12.728972
76               0     23.9  12.050420
42               0     58.8  12.829091
32               0     79.4  13.344538
80               0     53.9  11.802920
75               0     66.7  11.566474
85               0     55.4  11.043189
50               0     78.2  12.000000
37               0     50.1  12.170040
1                0    132.8  14.434783
15               0     76.8  12.943820
0.2140714285714283


In [99]:
print(pred)

[3.3472 3.4836 3.5375 3.4226 3.5825 3.4424 3.8716 2.9161 2.8264 3.4627
 3.4441 3.5374 3.1219 2.8617 3.3419 3.4811 3.572  3.14   3.3007 2.8214
 2.9257]


In [100]:
# save the model
from joblib import dump
dump(forest_model, '../models/marathon_model.joblib')

['../models/marathon_model.joblib']