In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Any, Union

In [2]:
IN_CSV_DATA = Path().cwd().parent.parent / "data/4_data_split"
OUT_MODEL_DATA = Path().cwd().parent.parent / "data/5_models"

# 1. Load in Training and Test Datasets

In [3]:
df_train = pd.read_csv(IN_CSV_DATA/'prepared_train.csv')
df_test = pd.read_csv(IN_CSV_DATA/'prepared_test.csv')

In [4]:
numerical_feature_cols = ['total_distance_mi','total_weight_lbs','avg_cruising_speed', 'log_hours_since_last_ride',
                            'active_time_ratio', 'avg_climb_rate', 'distance_training_intensity','prior_training_load']
categorical_feature_cols = ['year']
feature_cols = numerical_feature_cols + categorical_feature_cols

target_cols = ['best_power_4s', 'best_power_5s',
                'best_power_10s', 'best_power_20s', 'best_power_30s', 'best_power_1m',
                'best_power_2m', 'best_power_3m', 'best_power_4m', 'best_power_5m',
                'best_power_6m', 'best_power_10m', 'best_power_20m', 'best_power_30m',
                'best_power_40m', 'best_power_1h', 'best_power_2h']

In [5]:
X_train, y_train = df_train[feature_cols].values, df_train[target_cols].values
X_test, y_test = df_test[feature_cols].values, df_test[target_cols].values

In [6]:
from sklearn.model_selection import cross_val_score, cross_val_predict, TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_log_error
from sklearn.linear_model import LinearRegression

In [10]:
def timeseries_cross_validated_regression(X, y, regressor, k_folds=5):
    tss = TimeSeriesSplit(n_splits=k_folds)
    
    r2_scores = []
    rmsle_scores = []
    for train_index, val_index in tss.split(X, y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        regressor.fit(X_train, y_train)
        
        y_pred = regressor.predict(X_val)
        r2 = r2_score(y_val, y_pred, multioutput='uniform_average')
        # NOTE: RMSLE is chosen because it represents the average ratio error between the predicted and the true values.
        #       |--> This is useful because the RMSE error in the 5second effort duration is at a different power scale (kW) than a 20minute effort (Watts)
        # ref: https://medium.com/analytics-vidhya/root-mean-square-log-error-rmse-vs-rmlse-935c6cc1802a 
        # NOTE: RMSLE is biased in how it penalizes errors. It penalizes UNDERestimation more than OVERestimation
        # This means if we use y_true=y_val as it truly should match, we're okay with overestimating our power curves...
        # So we swap the ordering of these so that we're okay underestimating our power curves. We'd rather be conservative on our estimates of fitness
        print(y_pred)
        rmsle = root_mean_squared_log_error(y_true=y_pred, y_pred=y_val, multioutput='uniform_average')
        r2_scores.append(r2)
        rmsle_scores.append(rmsle)
    
    print(f'For metric "R^2", the mean = {np.mean(r2_scores)}, std.dev. = {np.std(r2_scores)}')
    print(f'For metric "RMSLE", the mean = {np.mean(rmsle_scores)}, std.dev. = {np.std(rmsle_scores)}')

In [8]:
linreg = LinearRegression()

In [11]:
timeseries_cross_validated_regression(X_train, y_train, linreg, k_folds=5)

[[ 4.56024135e+02  4.53716730e+02  4.48314849e+02  4.02220308e+02
   3.41270398e+02  2.72472721e+02  2.25632400e+02  2.09215304e+02
   1.98765462e+02  1.84621583e+02  1.74989138e+02  1.56122169e+02
   1.33449022e+02  1.19138614e+02  9.38856599e+01  1.42818891e+01
  -5.86814904e+00]
 [ 9.49283547e+02  9.38972810e+02  8.65520019e+02  7.09696456e+02
   6.17929160e+02  3.85766706e+02  2.92744311e+02  2.49409988e+02
   2.23090592e+02  2.16117133e+02  2.10454412e+02  1.84822409e+02
   1.74217286e+02  2.47260957e+02  2.34016452e+02  1.97161023e+02
   3.96508447e+01]
 [ 8.19669334e+02  8.11397805e+02  7.54038012e+02  6.16172193e+02
   5.21989749e+02  3.35991348e+02  2.57941465e+02  2.24030484e+02
   1.98872029e+02  1.91483354e+02  1.85297901e+02  1.62010321e+02
   1.51013573e+02  1.95427371e+02  2.01642791e+02  1.41045073e+02
   3.03198754e+01]
 [ 5.12718905e+02  5.09949483e+02  5.00014923e+02  4.36056557e+02
   3.56794171e+02  2.80033039e+02  2.32699576e+02  2.16419040e+02
   2.01580656e+02  

ValueError: Root Mean Squared Logarithmic Error cannot be used when targets contain values less than or equal to -1.