In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
np.set_printoptions(suppress=True)
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Any, Union

In [2]:
# Both data folders live under the directory two levels above the current working directory.
_BASE_DIR = Path.cwd().parent.parent

# Folder the prepared train/test CSVs are read from.
IN_CSV_DATA = _BASE_DIR / "data/4_data_split"
# Folder the fitted models are written to.
OUT_MODEL_DATA = _BASE_DIR / "data/5_models"

# 1. Load Training and Test Datasets

In [3]:
# Locate the prepared splits inside the input folder, then read each into a DataFrame.
train_csv_path = IN_CSV_DATA / 'prepared_train.csv'
test_csv_path = IN_CSV_DATA / 'prepared_test.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [4]:
# Numerical input columns selected from the prepared CSVs.
numerical_feature_cols = [
    'total_distance_mi',
    'total_weight_lbs',
    'avg_cruising_speed',
    'log_hours_since_last_ride',
    'active_time_ratio',
    'avg_climb_rate',
    'distance_training_intensity',
    'prior_training_load',
]

# No categorical inputs are used at the moment ('year' was a candidate and is disabled).
categorical_feature_cols = []

# Full, ordered list of model inputs: numerical columns first, then categorical ones.
feature_cols = [*numerical_feature_cols, *categorical_feature_cols]

# Target columns: one `best_power_*` column per duration suffix, all predicted together.
target_cols = [
    'best_power_4s',
    'best_power_5s',
    'best_power_10s',
    'best_power_20s',
    'best_power_30s',
    'best_power_1m',
    'best_power_2m',
    'best_power_3m',
    'best_power_4m',
    'best_power_5m',
    'best_power_6m',
    'best_power_10m',
    'best_power_20m',
    'best_power_30m',
    'best_power_40m',
    'best_power_1h',
    'best_power_2h',
]

In [5]:
# Convert the selected columns to plain NumPy arrays for scikit-learn.
# Training split: feature matrix and multi-column target matrix.
X_train = df_train[feature_cols].values
y_train = df_train[target_cols].values

# Test split, built with the same column selections.
X_test = df_test[feature_cols].values
y_test = df_test[target_cols].values

In [6]:
from sklearn.model_selection import cross_val_score, cross_val_predict, TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_log_error
from sklearn.multioutput import MultiOutputRegressor, RegressorChain

In [7]:
# Metric name passed as the `scoring` argument of every GridSearchCV in this notebook.
SCORING = 'r2'

In [8]:
from sklearn.neighbors import KNeighborsRegressor

In [9]:
# Cross-validation splitter (5 splits) shared by every grid search in this notebook.
# NOTE(review): TimeSeriesSplit splits by row position, so this presumably relies on the
# training rows already being in chronological order — confirm against the data-split step.
tss_cv = TimeSeriesSplit(n_splits=5)

# 1a. Full Targets Vector Regressor

In [10]:
# Base KNN regressor with default settings; the grid search in section 1a tunes its
# hyperparameters and fits it on the full multi-column target matrix.
knn_reg = KNeighborsRegressor()

In [17]:
# Hyperparameter candidates for the plain KNN regressor (4 x 2 x 2 x 1 = 16 combinations).
knn_search_space = {
    'n_neighbors': [5, 10, 15, 50],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],  # Minkowski power: 1 = L1 norm, 2 = L2 norm
    'metric': ['minkowski'],
}
param_grid = [knn_search_space]

# Exhaustive search over the grid, scored with SCORING on the shared CV splitter.
# Train scores are kept so they can be compared with validation scores afterwards;
# refit=True retrains the best candidate on all of the data passed to `fit`.
knn_reg_gridcv = GridSearchCV(
    estimator=knn_reg,
    param_grid=param_grid,
    scoring=SCORING,
    cv=tss_cv,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [18]:
# Run the grid search for the plain KNN regressor on the training split.
knn_reg_gridcv.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = knn_reg_gridcv.best_params_
print(best_params)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'metric': 'minkowski', 'n_neighbors': 15, 'p': 1, 'weights': 'distance'}


In [19]:
cv_results = knn_reg_gridcv.cv_results_

# One entry per candidate: mean train score vs. mean validation score across the CV splits.
# "Test" in the printed text refers to the CV validation folds, not to the held-out test set.
separator = '-'*50
score_rows = zip(
    cv_results['params'],
    cv_results['mean_train_score'],
    cv_results['mean_test_score'],
)
for candidate, mean_train, mean_val in score_rows:
    print(f'For params:{candidate}')
    print(f'\t- Train R^2={mean_train} --> Test R^2={mean_val}')
    print(separator)

For params:{'metric': 'minkowski', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
	- Train R^2=0.5579434406645578 --> Test R^2=0.22031369010499455
--------------------------------------------------
For params:{'metric': 'minkowski', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
	- Train R^2=1.0 --> Test R^2=0.23028277148440787
--------------------------------------------------
For params:{'metric': 'minkowski', 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
	- Train R^2=0.5684287095165489 --> Test R^2=0.21448099716518537
--------------------------------------------------
For params:{'metric': 'minkowski', 'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
	- Train R^2=1.0 --> Test R^2=0.22572812492188293
--------------------------------------------------
For params:{'metric': 'minkowski', 'n_neighbors': 10, 'p': 1, 'weights': 'uniform'}
	- Train R^2=0.43801042275941626 --> Test R^2=0.21676743124058212
--------------------------------------------------
For params:{'metric': 'minkowsk

In [20]:
# Keep the refit best model from the section 1a search. This must run before the next
# search is created, because the name `knn_reg_gridcv` is rebound there.
model_knn_reg = knn_reg_gridcv.best_estimator_

# 1b. Individual Target Regressors (`MultiOutputRegressor`)

In [21]:
# Hyperparameter candidates for the wrapped KNN (3 x 2 x 2 x 1 = 12 combinations).
# The `estimator__` prefix routes each setting to the regressor inside MultiOutputRegressor.
mo_knn_search_space = {
    'estimator__n_neighbors': [5, 15, 50],
    'estimator__weights': ['uniform', 'distance'],
    'estimator__p': [1, 2],  # Minkowski power: 1 = L1 norm, 2 = L2 norm
    'estimator__metric': ['minkowski'],
}
param_grid = [mo_knn_search_space]

# MultiOutputRegressor wraps a base regressor to handle the targets one column at a time.
# NOTE(review): in the recorded outputs of this notebook, the CV scores of this search are
# identical to those of the plain KNN search for the shared settings — confirm whether the
# wrapper adds anything for KNN.
MO_knn_reg = MultiOutputRegressor(KNeighborsRegressor())

# Same search configuration as the other sections; note that this rebinds `knn_reg_gridcv`.
knn_reg_gridcv = GridSearchCV(
    estimator=MO_knn_reg,
    param_grid=param_grid,
    scoring=SCORING,
    cv=tss_cv,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [22]:
# Run the grid search for the MultiOutputRegressor-wrapped KNN on the training split.
knn_reg_gridcv.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = knn_reg_gridcv.best_params_
print(best_params)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'estimator__metric': 'minkowski', 'estimator__n_neighbors': 15, 'estimator__p': 1, 'estimator__weights': 'distance'}


In [23]:
cv_results = knn_reg_gridcv.cv_results_

# One entry per candidate: mean train score vs. mean validation score across the CV splits.
# "Test" in the printed text refers to the CV validation folds, not to the held-out test set.
separator = '-'*50
score_rows = zip(
    cv_results['params'],
    cv_results['mean_train_score'],
    cv_results['mean_test_score'],
)
for candidate, mean_train, mean_val in score_rows:
    print(f'For params:{candidate}')
    print(f'\t- Train R^2={mean_train} --> Test R^2={mean_val}')
    print(separator)

For params:{'estimator__metric': 'minkowski', 'estimator__n_neighbors': 5, 'estimator__p': 1, 'estimator__weights': 'uniform'}
	- Train R^2=0.5579434406645578 --> Test R^2=0.22031369010499455
--------------------------------------------------
For params:{'estimator__metric': 'minkowski', 'estimator__n_neighbors': 5, 'estimator__p': 1, 'estimator__weights': 'distance'}
	- Train R^2=1.0 --> Test R^2=0.23028277148440787
--------------------------------------------------
For params:{'estimator__metric': 'minkowski', 'estimator__n_neighbors': 5, 'estimator__p': 2, 'estimator__weights': 'uniform'}
	- Train R^2=0.5684287095165489 --> Test R^2=0.21448099716518537
--------------------------------------------------
For params:{'estimator__metric': 'minkowski', 'estimator__n_neighbors': 5, 'estimator__p': 2, 'estimator__weights': 'distance'}
	- Train R^2=1.0 --> Test R^2=0.22572812492188293
--------------------------------------------------
For params:{'estimator__metric': 'minkowski', 'estimator

In [24]:
# Keep the refit best MultiOutputRegressor from the section 1b search. This must run before
# the next search is created, because the name `knn_reg_gridcv` is rebound there.
model_MO_knn_reg = knn_reg_gridcv.best_estimator_

# 1c. Chain Regressor

In [27]:
# Hyperparameter candidates for the chained KNN (3 x 2 x 2 x 1 = 12 combinations).
# The `base_estimator__` prefix routes each setting to the regressor inside RegressorChain.
# NOTE(review): newer scikit-learn releases appear to rename this parameter to `estimator`
# — confirm against the installed version before upgrading.
rc_knn_search_space = {
    'base_estimator__n_neighbors': [5, 15, 50],
    'base_estimator__weights': ['uniform', 'distance'],
    'base_estimator__p': [1, 2],  # Minkowski power: 1 = L1 norm, 2 = L2 norm
    'base_estimator__metric': ['minkowski'],
}
param_grid = [rc_knn_search_space]

# Chain of KNN regressors over the targets; the chain order is random but pinned by the seed.
RC_knn_reg = RegressorChain(KNeighborsRegressor(), order='random', random_state=42)

# Same search configuration as the other sections; note that this rebinds `knn_reg_gridcv`.
knn_reg_gridcv = GridSearchCV(
    estimator=RC_knn_reg,
    param_grid=param_grid,
    scoring=SCORING,
    cv=tss_cv,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [28]:
# Run the grid search for the RegressorChain-wrapped KNN on the training split.
knn_reg_gridcv.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = knn_reg_gridcv.best_params_
print(best_params)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'base_estimator__metric': 'minkowski', 'base_estimator__n_neighbors': 15, 'base_estimator__p': 1, 'base_estimator__weights': 'distance'}


In [29]:
cv_results = knn_reg_gridcv.cv_results_

# One entry per candidate: mean train score vs. mean validation score across the CV splits.
# "Test" in the printed text refers to the CV validation folds, not to the held-out test set.
separator = '-'*50
score_rows = zip(
    cv_results['params'],
    cv_results['mean_train_score'],
    cv_results['mean_test_score'],
)
for candidate, mean_train, mean_val in score_rows:
    print(f'For params:{candidate}')
    print(f'\t- Train R^2={mean_train} --> Test R^2={mean_val}')
    print(separator)

For params:{'base_estimator__metric': 'minkowski', 'base_estimator__n_neighbors': 5, 'base_estimator__p': 1, 'base_estimator__weights': 'uniform'}
	- Train R^2=0.17733219961465957 --> Test R^2=-0.02423129352103304
--------------------------------------------------
For params:{'base_estimator__metric': 'minkowski', 'base_estimator__n_neighbors': 5, 'base_estimator__p': 1, 'base_estimator__weights': 'distance'}
	- Train R^2=1.0 --> Test R^2=-0.028113883968086832
--------------------------------------------------
For params:{'base_estimator__metric': 'minkowski', 'base_estimator__n_neighbors': 5, 'base_estimator__p': 2, 'base_estimator__weights': 'uniform'}
	- Train R^2=0.19347398718239267 --> Test R^2=-0.0200677901018754
--------------------------------------------------
For params:{'base_estimator__metric': 'minkowski', 'base_estimator__n_neighbors': 5, 'base_estimator__p': 2, 'base_estimator__weights': 'distance'}
	- Train R^2=0.9999999999986724 --> Test R^2=-0.04024384054612341
------

In [30]:
# Keep the refit best RegressorChain from the section 1c search (the search most recently
# bound to `knn_reg_gridcv`).
model_RC_knn_reg = knn_reg_gridcv.best_estimator_

# 2. Save Models

In [31]:
import pickle

# Create the output folder if it is missing so a fresh checkout can run this cell.
OUT_MODEL_DATA.mkdir(parents=True, exist_ok=True)

# Output file name -> fitted best estimator from each section (1a, 1b, 1c).
models_to_save = {
    'knn_reg_AllAtOnce.pkl': model_knn_reg,
    'knn_reg_MultiOutput.pkl': model_MO_knn_reg,
    'knn_reg_RegChain.pkl': model_RC_knn_reg,
}

for file_name, fitted_model in models_to_save.items():
    # `with` guarantees the handle is flushed and closed even if pickling fails;
    # the previous bare `open(...)` calls left the file handles unclosed.
    with open(OUT_MODEL_DATA / file_name, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)