In [42]:
# Standard library
from pathlib import Path
from typing import Optional, Any, Union

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Display configuration (kept after the imports so the cell reads top-down):
# show up to 100 DataFrame columns and print NumPy floats in fixed-point
# notation instead of scientific notation.
pd.set_option("display.max_columns", 100)
np.set_printoptions(suppress=True)

In [43]:
# Data folders are resolved two directory levels above the kernel's current
# working directory.
_data_root = Path.cwd().parent.parent / "data"
IN_CSV_DATA = _data_root / "4_data_split"    # input: prepared train/test CSVs
OUT_MODEL_DATA = _data_root / "5_models"     # output: serialized models

# 1. Load in Training and Test Datasets

In [44]:
# Read the prepared train and test tables from IN_CSV_DATA.
train_csv_path = IN_CSV_DATA / 'prepared_train.csv'
test_csv_path = IN_CSV_DATA / 'prepared_test.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [45]:
# Model inputs: numeric ride features.
numerical_feature_cols = [
    'total_distance_mi',
    'total_weight_lbs',
    'avg_cruising_speed',
    'log_hours_since_last_ride',
    'active_time_ratio',
    'avg_climb_rate',
    'distance_training_intensity',
    'prior_training_load',
]
categorical_feature_cols = []  # ['year'] currently disabled
feature_cols = [*numerical_feature_cols, *categorical_feature_cols]

# Model outputs: one 'best_power_<window>' column per duration window,
# listed from the shortest window (4 s) to the longest (2 h).
_power_windows = ('4s', '5s', '10s', '20s', '30s',
                  '1m', '2m', '3m', '4m', '5m', '6m',
                  '10m', '20m', '30m', '40m',
                  '1h', '2h')
target_cols = [f'best_power_{window}' for window in _power_windows]

In [46]:
# Pull plain NumPy arrays out of the frames: X holds the feature columns,
# y holds one column per entry of target_cols.
X_train = df_train[feature_cols].values
y_train = df_train[target_cols].values

X_test = df_test[feature_cols].values
y_test = df_test[target_cols].values

In [47]:
from sklearn.model_selection import cross_val_score, cross_val_predict, TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_log_error
from sklearn.multioutput import MultiOutputRegressor, RegressorChain

In [48]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [49]:
# Cross-validation settings.
# 5-fold time-series splitter.
tss_cv = TimeSeriesSplit(n_splits=5)
# Scoring metric name (R^2).
SCORING = 'r2'

# 1b. Individual Target Regressors `MultiOutputRegressor`
Note: `HistGradientBoostingRegressor` does not natively support multi-output targets, so we wrap it in `MultiOutputRegressor`, which trains one independent estimator per target.

In [50]:
# Hyper-parameters shared by every per-target HistGradientBoostingRegressor.
histgb_params = {'max_features':0.6, 'learning_rate':0.1, 'loss':'squared_error', 'max_iter':100}

# max_features < 1 makes every split consider a random subset of the features,
# so the base estimator gets a fixed random_state; without it the fitted models
# (and every score below) change on each Restart & Run All.
MO_histgb_reg = MultiOutputRegressor(
    HistGradientBoostingRegressor(early_stopping=False, random_state=42, **histgb_params)
)


In [51]:
# Fit the multi-output wrapper on the training arrays; the fitted estimator is
# the cell's displayed value.
MO_histgb_reg.fit(X_train, y_train)

In [52]:
# In-sample R^2, one value per target column. These scores are computed on the
# same data the model was fitted on, so they are an optimistic upper bound.
train_pred_MO = MO_histgb_reg.predict(X_train)
r2_score(y_train, train_pred_MO, multioutput='raw_values')

array([0.87333825, 0.87186643, 0.88398271, 0.89726974, 0.8970777 ,
       0.89001056, 0.8916017 , 0.88940329, 0.89673885, 0.90387253,
       0.90620689, 0.91504506, 0.97182799, 0.97388382, 0.98577739,
       0.9825167 , 0.90465033])

In [53]:
## NOTE: The plot below is useful when fitting with early_stopping=True, though the documentation warns against using early stopping on time-series data
# import cycler

# colors = ['#FF0000', "#FF5100", "#CF9516","#918F36","#95BD28","#61B402",
#           "#009C08", "#009B6C", "#00FFD5","#0092CC","#0043A8","#000F50",
#           '#5112E6', "#6000B9", "#8700C5","#FF00EA","#96003E","#0A0404",
#           "#3B3A3D"]
# _ = plt.rcParams['axes.prop_cycle'] = cycler.cycler('color', colors)

# _, ax = plt.subplots()
# for i, estimator in enumerate(MO_histgb_reg.estimators_):
#     plt.plot(-estimator.validation_score_, label=f'{target_cols[i]} (n_iter={estimator.n_iter_})')
#     _ = ax.set(
#         xlabel="number of iterations",
#         ylabel="root mean squared error",
#         title=f"Loss of hgbt with early stopping ",
#     )
# _ = plt.grid()
# _ = ax.set_axisbelow(True)
# _ = plt.legend(loc='best')
# _ = plt.legend(bbox_to_anchor=(1.025, 1), loc="upper left", borderaxespad=0)

In [54]:
# Inspect the first fitted sub-estimator held by the multi-output wrapper.
MO_histgb_reg.estimators_[0]

# 1c. Regression Chain

In [55]:
# Same base hyper-parameters as the independent per-target models.
histgb_params = {'max_features':0.6, 'learning_rate':0.1, 'loss':'squared_error', 'max_iter':100}

# The chain's own random_state only fixes the randomly drawn target order
# (order='random'). The base estimator needs its own seed as well, because
# max_features < 1 subsamples features at random for every split.
RC_histgb_reg = RegressorChain(
    HistGradientBoostingRegressor(early_stopping=False, random_state=42, **histgb_params),
    order='random',
    random_state=42,
)

In [56]:
# Fit the regressor chain on the training arrays; the fitted estimator is the
# cell's displayed value.
RC_histgb_reg.fit(X_train, y_train)

In [57]:
# In-sample R^2 of the regressor chain, one value per target column. As with any
# training-set score, this is an optimistic estimate.
train_pred_RC = RC_histgb_reg.predict(X_train)
r2_score(y_train, train_pred_RC, multioutput='raw_values')

array([0.87109259, 0.89490581, 0.88607667, 0.83517206, 0.8223535 ,
       0.8259712 , 0.75300746, 0.75011162, 0.79159783, 0.76305083,
       0.74636875, 0.8463939 , 0.94137474, 0.95203584, 0.97957161,
       0.97291305, 0.73804727])

# 2. Saving the Models

In [58]:
import pickle

# Persist both fitted models under OUT_MODEL_DATA.
# NOTE: pickle files must only be loaded from trusted sources, since unpickling
# can execute arbitrary code.
OUT_MODEL_DATA.mkdir(parents=True, exist_ok=True)  # create the output folder on a fresh checkout

# `with` guarantees each file handle is flushed and closed, even if dump() raises.
with open(OUT_MODEL_DATA / 'histgb_reg_MultiOutput.pkl', 'wb') as model_file:
    pickle.dump(MO_histgb_reg, model_file)

with open(OUT_MODEL_DATA / 'histgb_reg_RegChain.pkl', 'wb') as model_file:
    pickle.dump(RC_histgb_reg, model_file)