In [19]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
np.set_printoptions(suppress=True)
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Any, Union

In [20]:
# Data directories, both resolved relative to the directory two levels above
# the current working directory.
IN_CSV_DATA = Path.cwd().parent.parent.joinpath("data", "4_data_split")   # CSV inputs read by this notebook
OUT_MODEL_DATA = Path.cwd().parent.parent.joinpath("data", "5_models")    # destination for the pickled models

# 1. Load in Training and Test Datasets

In [21]:
# Read the prepared train / test splits from the input data directory.
df_train = pd.read_csv(IN_CSV_DATA.joinpath('prepared_train.csv'))
df_test = pd.read_csv(IN_CSV_DATA.joinpath('prepared_test.csv'))

In [22]:
# Model inputs. No categorical feature is active at the moment; the
# commented-out ['year'] is left in place for reference.
numerical_feature_cols = [
    'total_distance_mi',
    'total_weight_lbs',
    'avg_cruising_speed',
    'log_hours_since_last_ride',
    'active_time_ratio',
    'avg_climb_rate',
    'distance_training_intensity',
    'prior_training_load',
]
categorical_feature_cols = []  # ['year']
feature_cols = [*numerical_feature_cols, *categorical_feature_cols]

# Regression targets: one 'best_power_<duration>' column per effort duration,
# listed from the shortest duration to the longest.
target_cols = [
    f'best_power_{duration}'
    for duration in ('4s', '5s', '10s', '20s', '30s', '1m', '2m', '3m', '4m',
                     '5m', '6m', '10m', '20m', '30m', '40m', '1h', '2h')
]

In [23]:
# Pull the selected feature / target columns out of each split as NumPy arrays.
X_train = df_train[feature_cols].values
y_train = df_train[target_cols].values
X_test = df_test[feature_cols].values
y_test = df_test[target_cols].values

In [24]:
from sklearn.model_selection import cross_val_score, cross_val_predict, TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_log_error
from sklearn.multioutput import MultiOutputRegressor, RegressorChain

In [25]:
# Name of the scoring metric handed to GridSearchCV through its `scoring` argument.
SCORING = 'r2'

In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
# 5-split time-series cross-validator, passed as `cv` to the grid search.
# NOTE(review): assumes the rows of the training data are in chronological order — confirm.
tss_cv = TimeSeriesSplit(n_splits=5)

# 1a. Full Targets Vector Regressor

In [28]:
# Base random forest that the grid search clones for every candidate.
# The seed is fixed so that re-running the notebook grows the same forests and
# therefore reproduces the same cross-validation scores and best parameters.
rf_reg = RandomForestRegressor(random_state=42)

In [29]:
# Hyper-parameter grid: 1 criterion x 2 depths x 3 forest sizes x 3 max_features
# values = 18 candidates, each scored on the time-series CV splits.
param_grid = [
    {
        'criterion': ['squared_error'],
        'max_depth': [5, 15],
        'n_estimators': [500, 1000, 2000],
        'max_features': [3, 4, 5],
    }
]
rf_reg_gridcv = GridSearchCV(
    estimator=rf_reg,
    param_grid=param_grid,
    scoring=SCORING,
    cv=tss_cv,
    refit=True,                # refit the best candidate on the full training data
    return_train_score=True,   # needed for the train-vs-validation report
    n_jobs=-1,
    verbose=1,
)

In [30]:
# Run the grid search over every candidate in param_grid. Because the search was
# built with refit=True, the best candidate is refit on all of X_train / y_train.
rf_reg_gridcv.fit(X_train, y_train)

# Print the best parameter set found by the search.
print(rf_reg_gridcv.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
{'criterion': 'squared_error', 'max_depth': 15, 'max_features': 5, 'n_estimators': 2000}


In [31]:
# Report, for every grid candidate, the mean training score next to the mean
# score on the held-out CV folds (labelled "Test" in the printout; these are
# cross-validation folds of the training data, not df_test).
cv_results = rf_reg_gridcv.cv_results_

for train_r2, test_r2, params in zip(
    cv_results['mean_train_score'],
    cv_results['mean_test_score'],
    cv_results['params'],
):
    print(
        f'For params:{params}',
        f'\t- Train R^2={train_r2} --> Test R^2={test_r2}',
        '-'*50,
        sep='\n',
    )

For params:{'criterion': 'squared_error', 'max_depth': 5, 'max_features': 3, 'n_estimators': 500}
	- Train R^2=0.6838551258255501 --> Test R^2=0.2896774115031643
--------------------------------------------------
For params:{'criterion': 'squared_error', 'max_depth': 5, 'max_features': 3, 'n_estimators': 1000}
	- Train R^2=0.681719287710683 --> Test R^2=0.28164300710706336
--------------------------------------------------
For params:{'criterion': 'squared_error', 'max_depth': 5, 'max_features': 3, 'n_estimators': 2000}
	- Train R^2=0.6840550950144408 --> Test R^2=0.28827755052249177
--------------------------------------------------
For params:{'criterion': 'squared_error', 'max_depth': 5, 'max_features': 4, 'n_estimators': 500}
	- Train R^2=0.6962163903977976 --> Test R^2=0.2970567981526727
--------------------------------------------------
For params:{'criterion': 'squared_error', 'max_depth': 5, 'max_features': 4, 'n_estimators': 1000}
	- Train R^2=0.6956675571632556 --> Test R^2=0

In [32]:
# Keep the refit best estimator of the grid search: a single forest that
# predicts the whole target vector at once.
model_rf_reg = rf_reg_gridcv.best_estimator_
#df_coeficients = pd.DataFrame(model_rf_reg.coef_, columns=feature_cols, index=target_cols)

In [None]:
# Echo the forest's feature importances; the values follow the column order of
# feature_cols, the columns X_train was built from.
model_rf_reg.feature_importances_

array([0.25811043, 0.04317425, 0.14583754, 0.07389609, 0.06856802,
       0.29895016, 0.05588599, 0.05557752])

# 1b. Individual Target Regressors `MultiOutputRegressor`

In [37]:
def timeseries_cross_validated_regression(X, y, regressor, n_subseries_splits=5, verbose:bool=True):
    """Score a regressor with time-ordered cross-validation.

    The rows are partitioned with ``TimeSeriesSplit``. For every split the
    regressor is fitted on the training indices, predictions are made for the
    validation indices, negative predictions are clamped to zero, and R^2 and
    RMSLE are computed per target and averaged over the targets.

    Parameters
    ----------
    X : numpy array
        Feature matrix, indexed by row with integer index arrays.
        Rows are assumed to be in chronological order (TODO confirm with the
        data-preparation step).
    y : numpy array
        Targets aligned row-by-row with ``X``. RMSLE is only defined for
        non-negative values, so ``y`` is expected to be non-negative.
    regressor : estimator exposing ``fit`` and ``predict``
        Fitted IN PLACE on every split; when the function returns it holds the
        fit of the final split, not a fit on all of ``X``.
    n_subseries_splits : int, default 5
        Number of splits passed to ``TimeSeriesSplit``.
    verbose : bool, default True
        When true, print the target-averaged R^2 and RMSLE of the final split.

    Returns
    -------
    dict
        'actuals'         : list with the validation targets of each split.
        'predictions'     : list with the clamped predictions of each split.
        'r2_scores'       : list with the target-averaged R^2 of each split.
        'rmsle_scores'    : list with the target-averaged RMSLE of each split.
        'raw_final_r2'    : per-target R^2 of the final split.
        'raw_final_rmsle' : per-target RMSLE of the final split.
    """
    tss = TimeSeriesSplit(n_splits=n_subseries_splits)

    r2_scores = []
    rmsle_scores = []
    raw_r2_per_split = []
    raw_rmsle_per_split = []
    predictions = []
    actuals = []
    for train_index, val_index in tss.split(X, y):
        X_tr, X_val = X[train_index], X[val_index]
        y_tr, y_val = y[train_index], y[val_index]

        regressor.fit(X_tr, y_tr)

        y_pred = regressor.predict(X_val)
        # Ensure all predicted powers are non-negative (also required by RMSLE).
        y_pred[y_pred<0] = 0.0

        raw_r2 = r2_score(y_val, y_pred, multioutput='raw_values')
        raw_r2_per_split.append(raw_r2)
        r2_scores.append(np.mean(raw_r2))

        # NOTE: RMSLE is chosen because it measures the error as a ratio between predicted and true values.
        #       |--> Useful because short efforts (e.g. 5 seconds) live on a much larger power scale than long efforts (e.g. 20 minutes),
        #            so a plain RMSE would be dominated by the short durations.
        # ref: https://medium.com/analytics-vidhya/root-mean-square-log-error-rmse-vs-rmlse-935c6cc1802a
        # NOTE: For the same absolute error, RMSLE is larger for an UNDERestimate than for an OVERestimate.
        #       The metric itself is symmetric in its two arguments, sqrt(mean((log1p(a) - log1p(b))**2)),
        #       so the order in which y_val and y_pred are passed does not change the value; swapping them
        #       cannot be used to flip which direction of error is penalized more.
        raw_rmsle = root_mean_squared_log_error(y_true=y_val, y_pred=y_pred, multioutput='raw_values')
        raw_rmsle_per_split.append(raw_rmsle)
        rmsle_scores.append(np.mean(raw_rmsle))

        predictions.append(y_pred)
        actuals.append(y_val)

    if verbose:
        print(f'For metric "R^2", the latest value (final subseries) = {r2_scores[-1]}')
        print(f'For metric "RMSLE", the latest value (final subseries) = {rmsle_scores[-1]}')

    return {'actuals':actuals, 'predictions':predictions,
            'r2_scores':r2_scores, 'rmsle_scores':rmsle_scores,
            # Per-target metrics of the final split, reused from the loop rather than recomputed.
            'raw_final_r2':raw_r2_per_split[-1],
            'raw_final_rmsle':raw_rmsle_per_split[-1]}

In [38]:
# Disabled alternative: tune the wrapped forest directly through the
# `estimator__` parameter prefix instead of reusing the earlier search result.
# param_grid =[{'estimator__criterion':['squared_error',],
#               'estimator__max_depth':[5, 15],
#               'estimator__n_estimators':[500, 1000, 2000],
#               'estimator__max_features':[3,4,5]}]

# One independent random forest per target column, built with the best
# hyper-parameters found by the grid search above. The forest is seeded so the
# cross-validation scores and the saved model are reproducible between runs.
MO_rf_reg = MultiOutputRegressor(RandomForestRegressor(random_state=42, **rf_reg_gridcv.best_params_))

# NOTE: if re-enabled, the lines below rebind `rf_reg_gridcv`, which later cells
# read `best_params_` from; give the new search its own name.
# rf_reg_gridcv = GridSearchCV(MO_rf_reg, param_grid, cv=tss_cv, n_jobs=-1, 
#                              refit=True, return_train_score=True,
#                              scoring=SCORING, verbose=1
#                             )

In [None]:
# Time-series cross-validate the per-target forests on the training data.
# The returned dict is bound to `_` so it is not echoed as cell output.
_ = timeseries_cross_validated_regression(X_train, y_train, MO_rf_reg)

For metric "R^2", the latest value (final subseries) = 0.37407679218182416
For metric "RMSLE", the latest value (final subseries) = 0.6396840573149658


{'actuals': [array([[ 419.7025091 ,  416.77208208,  395.18476897,  347.21786184,
           300.4889444 ,  272.00138096,  222.94949666,  213.44732298,
           186.99704358,  179.82750612,  177.37268453,  151.19053082,
           126.9321167 ,  117.75325399,  111.62957975,    0.        ,
             0.        ],
         [ 499.21515478,  493.94241757,  458.0518593 ,  382.85341297,
           308.37468752,  278.57024175,  190.9884157 ,  183.08112837,
           175.22007923,  165.83429859,  166.20343591,  152.45590454,
           141.71905849,  140.9509944 ,  138.25562251,  128.57846612,
           121.64660131],
         [ 477.82858089,  473.266836  ,  445.7233043 ,  366.74701894,
           296.22728679,  223.64397971,  199.66645954,  178.80563935,
           161.2267407 ,  149.13856239,  145.46460203,  140.70284171,
           137.78690007,  133.44073357,  125.9926441 ,  111.42558491,
            99.83164111],
         [ 437.71339068,  435.85506915,  421.03791295,  363.05962342,
 

In [40]:
# Final fit of the per-target model on the full training set (no grid search
# happens here); the cross-validation helper left it fitted on its last split only.
MO_rf_reg.fit(X_train, y_train)


# 1c. Chain Regressor

In [41]:
# Disabled alternative: tune the chained forest directly through the
# `base_estimator__` parameter prefix instead of reusing the earlier search result.
# param_grid =[{'base_estimator__criterion':['squared_error',],
#               'base_estimator__max_depth':[5, 15],
#               'base_estimator__n_estimators':[500, 1000, 2000],
#               'base_estimator__max_features':[3,4,5]}]

# Chain of random forests: each target's model also receives the predictions of
# the targets earlier in the (randomly drawn, seeded) chain order. The base
# forest is seeded as well, so both the chain order and the forests themselves
# are reproducible between runs.
RC_rf_reg = RegressorChain(RandomForestRegressor(random_state=42, **rf_reg_gridcv.best_params_), order='random', random_state=42)

# NOTE: if re-enabled, the lines below rebind `rf_reg_gridcv`, which this cell
# reads `best_params_` from; give the new search its own name.
# rf_reg_gridcv = GridSearchCV(RC_rf_reg, param_grid, cv=tss_cv, n_jobs=-1, 
#                              refit=True, return_train_score=True,
#                              scoring=SCORING, verbose=1
#                             )

In [43]:
# Echo the estimator's repr as the cell output.
# NOTE(review): this cell sits in the chain-regressor section but shows the
# MultiOutput model — was RC_rf_reg intended? Confirm.
MO_rf_reg

In [44]:
# Time-series cross-validate the regressor chain on the training data.
# The returned dict is bound to `_` so it is not echoed as cell output.
_ = timeseries_cross_validated_regression(X_train, y_train, RC_rf_reg)

For metric "R^2", the latest value (final subseries) = 0.3401799120042982
For metric "RMSLE", the latest value (final subseries) = 0.7265875606469425


In [45]:
# Final fit of the regressor chain on the full training set (no grid search
# happens here); the cross-validation helper left it fitted on its last split only.
RC_rf_reg.fit(X_train, y_train)



In [1]:
# Echo the chain regressor's repr as the cell output.
# NOTE(review): the recorded NameError comes from executing this cell as In [1],
# i.e. in a kernel session where the cell defining RC_rf_reg had not been run yet.
RC_rf_reg

NameError: name 'RC_rf_reg' is not defined

# 2. Save Models

In [None]:
import pickle

# Make sure the output directory exists so saving does not fail on a fresh checkout.
OUT_MODEL_DATA.mkdir(parents=True, exist_ok=True)

# Serialise each fitted model to its own pickle file. The `with` block ensures
# every file handle is flushed and closed, even if pickling raises part-way.
for fitted_model, pickle_name in (
    (model_rf_reg, 'randomforest_reg_AllAtOnce.pkl'),
    (MO_rf_reg, 'randomforest_reg_MultiOutput.pkl'),
    (RC_rf_reg, 'randomforest_reg_RegChain.pkl'),
):
    with open(OUT_MODEL_DATA / pickle_name, 'wb') as pickle_file:
        pickle.dump(fitted_model, pickle_file)