In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [42]:
# get data
path = '../data/external/MarathonData.csv'
pathNew = './MarathonData.csv'
df = pd.read_csv(path)
# set id as index
df.set_index('id', inplace=True)
df

Unnamed: 0_level_0,Marathon,Name,Category,km4week,sp4week,CrossTraining,Wall21,MarathonTime,CATEGORY
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Prague17,Blair MORGAN,MAM,132.8,14.434783,,1.16,2.37,A
2,Prague17,Robert Heczko,MAM,68.6,13.674419,,1.23,2.59,A
3,Prague17,Michon Jerome,MAM,82.7,13.520436,,1.30,2.66,A
4,Prague17,Daniel Or lek,M45,137.5,12.258544,,1.32,2.68,A
5,Prague17,Luk ? Mr zek,MAM,84.6,13.945055,,1.36,2.74,A
...,...,...,...,...,...,...,...,...,...
83,Prague17,Stefano Vegliani,M55,50.0,10.830325,,2.02,3.93,D
84,Prague17,Andrej Madliak,M40,33.6,10.130653,ciclista 3h,1.94,3.93,D
85,Prague17,Yoi Ohsako,M40,55.4,11.043189,,1.94,3.94,D
86,Prague17,Simon Dunn,M45,33.2,11.066667,,2.05,3.95,D


In [43]:
# convert to float all values
df = df.apply(pd.to_numeric, errors='coerce')

In [44]:
# if it does cross training set it to 1 else 0
df['cross_training'] = np.where(df['CrossTraining'].notnull(), 1, 0)
df

Unnamed: 0_level_0,Marathon,Name,Category,km4week,sp4week,CrossTraining,Wall21,MarathonTime,CATEGORY,cross_training
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,132.8,14.434783,,1.16,2.37,,0
2,,,,68.6,13.674419,,1.23,2.59,,0
3,,,,82.7,13.520436,,1.30,2.66,,0
4,,,,137.5,12.258544,,1.32,2.68,,0
5,,,,84.6,13.945055,,1.36,2.74,,0
...,...,...,...,...,...,...,...,...,...,...
83,,,,50.0,10.830325,,2.02,3.93,,0
84,,,,33.6,10.130653,,1.94,3.93,,0
85,,,,55.4,11.043189,,1.94,3.94,,0
86,,,,33.2,11.066667,,2.05,3.95,,0


In [45]:
#  drop rows with wall21 null
df = df.dropna(subset=['Wall21'])

In [46]:
y = df.MarathonTime
X = df[['cross_training', 'km4week', 'sp4week']]

In [47]:
from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
train_X, val_X, train_y, val_y = train_test_split(X, y)

In [48]:
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # number of trees
    'max_depth': [10, 20, None],      # maximum depth of trees
    'min_samples_split': [2, 5, 10],  # minimum samples required to split
    'min_samples_leaf': [1, 2, 4],    # minimum samples required at leaf node
    'max_features': ['auto', 'sqrt']  # number of features to consider at each split
}


In [49]:
# Create base model
rf = RandomForestRegressor(random_state=1)

# Create GridSearchCV object
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,                          # 5-fold cross-validation
    scoring='neg_mean_absolute_error',
    n_jobs=-1,                     # use all available cores
    verbose=2                      # print progress
)


In [50]:

# Fit the grid search
grid_search.fit(train_X, train_y)

# Print best parameters
print("Best parameters:", grid_search.best_params_)


Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


405 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
299 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Python312\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Invali

In [51]:

# Make predictions using the best model
best_model = grid_search.best_estimator_
pred = best_model.predict(val_X)
print("Mean Absolute Error:", mean_absolute_error(val_y, pred))

Mean Absolute Error: 0.20644761904761938


In [52]:
print(pred)

[2.9466 3.6863 2.7577 2.7555 2.93   3.1648 3.2561 3.5918 3.4697 3.3037
 3.1534 3.5494 3.553  2.8613 3.6276 3.4968 2.92   3.2313 3.5224 3.463
 3.453 ]


In [53]:
# save the model
from joblib import dump
dump(best_model, 'marathon_model2.joblib')

['marathon_model2.joblib']