In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
np.set_printoptions(suppress=True)
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Any, Union

In [2]:
IN_CSV_DATA = Path().cwd().parent.parent / "data/4_data_split"
OUT_MODEL_DATA = Path().cwd().parent.parent / "data/5_models"

# 1. Load in Training and Test Datasets

In [3]:
df_train = pd.read_csv(IN_CSV_DATA/'prepared_train.csv')
df_test = pd.read_csv(IN_CSV_DATA/'prepared_test.csv')

In [4]:
numerical_feature_cols = ['total_distance_mi','total_weight_lbs','avg_cruising_speed', 'log_hours_since_last_ride',
                            'active_time_ratio', 'avg_climb_rate', 'distance_training_intensity','prior_training_load']
categorical_feature_cols = []#['year']
feature_cols = numerical_feature_cols + categorical_feature_cols

target_cols = ['best_power_4s', 'best_power_5s',
                'best_power_10s', 'best_power_20s', 'best_power_30s', 'best_power_1m',
                'best_power_2m', 'best_power_3m', 'best_power_4m', 'best_power_5m',
                'best_power_6m', 'best_power_10m', 'best_power_20m', 'best_power_30m',
                'best_power_40m', 'best_power_1h', 'best_power_2h']

In [5]:
X_train, y_train = df_train[feature_cols].values, df_train[target_cols].values
X_test, y_test = df_test[feature_cols].values, df_test[target_cols].values

In [6]:
from sklearn.model_selection import cross_val_score, cross_val_predict, TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_log_error
from sklearn.multioutput import MultiOutputRegressor, RegressorChain

In [7]:
SCORING = 'r2'

In [8]:
from sklearn.kernel_ridge import KernelRidge

In [9]:
tss_cv = TimeSeriesSplit(n_splits=5)

# 1a. Full Targets Vector Regressor

In [10]:
kridge_reg = KernelRidge()

In [11]:
param_grid = [{'kernel':['rbf','polynomial','sigmoid', 'linear'],
               'gamma':[0.001, 0.01, 0.1, 0.5,1,5,10], 
               }]
kridge_reg_gridcv = GridSearchCV(kridge_reg, param_grid, cv=tss_cv, n_jobs=-1, 
                             refit=True, return_train_score=True,
                             scoring=SCORING, verbose=1
                            )

In [12]:
# fitting the model for grid search
kridge_reg_gridcv.fit(X_train, y_train)

# print the best parameter set after tuning
print(kridge_reg_gridcv.best_params_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits




{'gamma': 0.01, 'kernel': 'polynomial'}


In [13]:
cv_results = kridge_reg_gridcv.cv_results_

for train_r2, test_r2, params in zip(cv_results['mean_train_score'], cv_results['mean_test_score'], cv_results['params']):
    print(f'For params:{params}')
    print(f'\t- Train R^2={train_r2} --> Test R^2={test_r2}')
    print('-'*50)

For params:{'gamma': 0.001, 'kernel': 'rbf'}
	- Train R^2=0.21764856218696402 --> Test R^2=0.10342227694171058
--------------------------------------------------
For params:{'gamma': 0.001, 'kernel': 'polynomial'}
	- Train R^2=0.2697208193130315 --> Test R^2=0.14626651744840585
--------------------------------------------------
For params:{'gamma': 0.001, 'kernel': 'sigmoid'}
	- Train R^2=0.06469012696228682 --> Test R^2=-0.01083588613658076
--------------------------------------------------
For params:{'gamma': 0.001, 'kernel': 'linear'}
	- Train R^2=-8.187249476129384 --> Test R^2=-19.63712035462934
--------------------------------------------------
For params:{'gamma': 0.01, 'kernel': 'rbf'}
	- Train R^2=0.48994427434294785 --> Test R^2=0.25884715679840087
--------------------------------------------------
For params:{'gamma': 0.01, 'kernel': 'polynomial'}
	- Train R^2=0.5408061481512128 --> Test R^2=0.2913925854257754
--------------------------------------------------
For params:{'

In [14]:
model_kridge_reg = kridge_reg_gridcv.best_estimator_

# 1b. Individual Target Regressors `MultiOutputRegressor`

In [15]:
param_grid = [{'estimator__kernel':['rbf','polynomial','sigmoid','linear'],
               'estimator__gamma':[0.001, 0.01, 0.1, 0.5,1,5,10], 
               }]

MO_kridge_reg = MultiOutputRegressor(KernelRidge())

kridge_reg_gridcv = GridSearchCV(MO_kridge_reg, param_grid, cv=tss_cv, n_jobs=-1, 
                             refit=True, return_train_score=True,
                             scoring=SCORING, verbose=1
                            )

In [16]:
# fitting the model for grid search
kridge_reg_gridcv.fit(X_train, y_train)

# print the best parameter set after tuning
print(kridge_reg_gridcv.best_params_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits




{'estimator__gamma': 0.01, 'estimator__kernel': 'polynomial'}


In [17]:
cv_results = kridge_reg_gridcv.cv_results_

for train_r2, test_r2, params in zip(cv_results['mean_train_score'], cv_results['mean_test_score'], cv_results['params']):
    print(f'For params:{params}')
    print(f'\t- Train R^2={train_r2} --> Test R^2={test_r2}')
    print('-'*50)

For params:{'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'}
	- Train R^2=0.217648562186964 --> Test R^2=0.10342227694171069
--------------------------------------------------
For params:{'estimator__gamma': 0.001, 'estimator__kernel': 'polynomial'}
	- Train R^2=0.2697208193130315 --> Test R^2=0.1462665174484057
--------------------------------------------------
For params:{'estimator__gamma': 0.001, 'estimator__kernel': 'sigmoid'}
	- Train R^2=0.06469012696228683 --> Test R^2=-0.010835886136580675
--------------------------------------------------
For params:{'estimator__gamma': 0.001, 'estimator__kernel': 'linear'}
	- Train R^2=-8.187249476129395 --> Test R^2=-19.63712035462936
--------------------------------------------------
For params:{'estimator__gamma': 0.01, 'estimator__kernel': 'rbf'}
	- Train R^2=0.48994427434294785 --> Test R^2=0.2588471567984009
--------------------------------------------------
For params:{'estimator__gamma': 0.01, 'estimator__kernel': 'polynomial'}

In [18]:
model_MO_kridge_reg = kridge_reg_gridcv.best_estimator_

# 1c. Chain Regressor

In [19]:
param_grid = [{'base_estimator__kernel':['rbf','polynomial','sigmoid','linear'],
               'base_estimator__gamma':[0.001, 0.01, 0.1, 0.5,1,5,10], 
               }]
RC_kridge_reg = RegressorChain(KernelRidge(), order='random', random_state=42)

kridge_reg_gridcv = GridSearchCV(RC_kridge_reg, param_grid, cv=tss_cv, n_jobs=-1, 
                             refit=True, return_train_score=True,
                             scoring=SCORING, verbose=1
                            )

In [20]:
# fitting the model for grid search
kridge_reg_gridcv.fit(X_train, y_train)

# print the best parameter set after tuning
print(kridge_reg_gridcv.best_params_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


  K **= degree
  K **= degree
Traceback (most recent call last):
  File "/home/ebauer/Documents/Code_Repos/bike-fitness-tracking/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ebauer/Documents/Code_Repos/bike-fitness-tracking/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ebauer/Documents/Code_Repos/bike-fitness-tracking/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/home/ebauer/Documents/Code_Repos/bike-fitness-tracking/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", l

{'base_estimator__gamma': 0.01, 'base_estimator__kernel': 'sigmoid'}


          nan  -0.04912192 -19.63712035 -10.58850931          nan
 -13.80177283 -19.63712035 -10.77523097          nan -22.02970304
 -19.63712035 -10.85657143          nan -46.62526832 -19.63712035
 -10.94506251          nan -83.05731981 -19.63712035 -10.94648769
          nan -49.24299823 -19.63712035]
          nan   0.01029568  -8.18724948 -11.89444161          nan
 -24.25508786  -8.18724948 -12.00332487          nan  -8.41543668
  -8.18724948 -12.03416731          nan -13.52087903  -8.18724948
 -12.07812133          nan -14.65970539  -8.18724948 -12.0827493
          nan -12.64334373  -8.18724948]


In [21]:
cv_results = kridge_reg_gridcv.cv_results_

for train_r2, test_r2, params in zip(cv_results['mean_train_score'], cv_results['mean_test_score'], cv_results['params']):
    print(f'For params:{params}')
    print(f'\t- Train R^2={train_r2} --> Test R^2={test_r2}')
    print('-'*50)

For params:{'base_estimator__gamma': 0.001, 'base_estimator__kernel': 'rbf'}
	- Train R^2=-10.722252834660424 --> Test R^2=-9.741796320970234
--------------------------------------------------
For params:{'base_estimator__gamma': 0.001, 'base_estimator__kernel': 'polynomial'}
	- Train R^2=nan --> Test R^2=nan
--------------------------------------------------
For params:{'base_estimator__gamma': 0.001, 'base_estimator__kernel': 'sigmoid'}
	- Train R^2=0.001116631267365482 --> Test R^2=-0.055398990676212366
--------------------------------------------------
For params:{'base_estimator__gamma': 0.001, 'base_estimator__kernel': 'linear'}
	- Train R^2=-8.187249476119487 --> Test R^2=-19.637120353009205
--------------------------------------------------
For params:{'base_estimator__gamma': 0.01, 'base_estimator__kernel': 'rbf'}
	- Train R^2=-11.689765654332376 --> Test R^2=-10.376210530181895
--------------------------------------------------
For params:{'base_estimator__gamma': 0.01, 'base

In [22]:
model_RC_kridge_reg = kridge_reg_gridcv.best_estimator_

# 2. Save Models

In [23]:
import pickle

pickle.dump(model_kridge_reg, open(OUT_MODEL_DATA / 'kridge_reg_AllAtOnce.pkl','wb'))
pickle.dump(model_MO_kridge_reg, open(OUT_MODEL_DATA / 'kridge_reg_MultiOutput.pkl','wb'))
pickle.dump(model_RC_kridge_reg, open(OUT_MODEL_DATA / 'kridge_reg_RegChain.pkl','wb'))