In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GroupKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

# Import helper functions
from ML_functions import fun_load_data, fun_preprocessing, fun_fit_tuning, fun_load_best_params
from ML_functions import fun_convert_time
from ML_functions import fun_scaled_neg_MAPE, fun_scaled_neg_RMSE, fun_predict_with_scaling, fun_tuning_results, fun_scores

# Problem selector: set this variable to "TSP" or "CVRP" to choose the optimization problem
optimization_problem = "TSP"

# Read the data for the selected problem (the helper's second return value is not used here)
data, _ = fun_load_data(optimization_problem)

# Preprocess the data; the train/test split is done inside the helper (train_size=0.8)
X_train, X_test, y_train, y_test, train_data = fun_preprocessing(
    data,
    train_size=0.8,
)

### **Scaling the train and test scores**

In [6]:
# Baseline estimator: linear regression with default settings
lr = LinearRegression()

# Report cross-validation scores on the train set and scores on the test set (MAPE and RMSE)
# twice through the fun_scores helper: first with apply_scaling=False, then with apply_scaling=True.
# A banner line is printed before the second report to separate the two in the output.
for apply_scaling in (False, True):
    if apply_scaling:
        print("############### APPLY SCALING ###############")
    model_results_dict = fun_scores(
        model=lr,
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        apply_scaling=apply_scaling,
        compute_test_scores=True,
    )

CV MAPE (original) train data:  21.0621 %
CV RMSE (original) train data: 4.5107
CV computation time: 0s

MAPE (original) test data:  20.8629 %
RMSE (original) test data: 4.523
Model fit time: 0s

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,20.4724,18.3819,17.967,18.6769,17.8236,15.9414,19.8284,21.1639,31.9761,20.8629
RMSE,8.2956,5.8915,4.6093,3.9358,3.7244,3.4681,3.5189,3.7069,4.4538,4.523


############### APPLY SCALING ###############
CV MAPE (scaled) train data:  20.753 %
CV RMSE (scaled) train data: 4.3009
CV computation time: 0s

MAPE (scaled) test data:  20.6584 %
RMSE (scaled) test data: 4.3248

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,20.6135,18.6215,17.5257,17.4757,16.3983,14.8043,19.4288,21.3928,33.547,20.6584
RMSE,6.1006,5.2035,4.4235,3.8412,3.5101,3.3063,3.5365,3.9908,5.2138,4.3248


### **Scaling the cross-validation scores**

**1. Scaling during Cross-Validation**

In [2]:
# Cross-validation with GroupKFold so that all rows sharing an "Instance ID" stay in the same fold.
# The splitter and the group labels are passed separately (cv=..., groups=...) instead of handing
# over a single-use split generator; the resulting folds are the same.
# n_jobs is left at its default (original note: don't use n_jobs=-1, so that the Data Frame of
# each fold is displayed).
cv_scores = cross_validate(estimator=LinearRegression(), X=X_train, y=y_train,
                           groups=X_train["Instance ID"],
                           cv=GroupKFold(n_splits=3),
                           scoring={"scaled_neg_mape": fun_scaled_neg_MAPE,
                                    "scaled_neg_rmse": fun_scaled_neg_RMSE,
                                    "original_neg_mape": "neg_mean_absolute_percentage_error",
                                    "original_neg_rmse": "neg_root_mean_squared_error"})

# Average the per-fold scores. All four scores are "neg_*" values, so the sign is flipped.
# MAPE is converted to percent BEFORE rounding: rounding the fraction first and multiplying by 100
# afterwards can reintroduce floating-point noise in the printed value
# (e.g. 0.07 * 100 == 7.000000000000001).
MAPE1 = np.round(-cv_scores["test_scaled_neg_mape"].mean() * 100, 4)
RMSE1 = - np.round(cv_scores["test_scaled_neg_rmse"].mean(), 4)
MAPE2 = np.round(-cv_scores["test_original_neg_mape"].mean() * 100, 4)
RMSE2 = - np.round(cv_scores["test_original_neg_rmse"].mean(), 4)

# Print the cross-validation scores obtained on the train data
print("CV MAPE (scaled) train data:  {} %".format(MAPE1))
print("CV RMSE (scaled) train data: ", RMSE1)
print("\nCV MAPE (original) train data:  {} %".format(MAPE2))
print("CV RMSE (original) train data: ", RMSE2)

CV MAPE (scaled) train data:  20.6429 %
CV RMSE (scaled) train data:  4.2858

CV MAPE (original) train data:  20.97 %
CV RMSE (original) train data:  4.4974


**2. Scaling during Hyperparameter Tuning with Cross-Validation**

In [4]:
# Pipeline for the grid search: the "scaler" step starts as a placeholder (None) and is
# filled in by the search space below; the second step is the KNN regressor
pipe = Pipeline(steps=[
    ("scaler", None),
    ("knn", KNeighborsRegressor()),
])

# Search space: two candidate scalers combined with n_neighbors = 7, ..., 11
param_grid = {
    "scaler": [StandardScaler(), MinMaxScaler()],
    "knn__n_neighbors": list(np.arange(start=7, stop=12)),
}

# Grid search with 3 group-aware folds (rows sharing an "Instance ID" stay in one fold),
# scored with the custom scaled negative MAPE scorer
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=3).split(X_train, y_train, groups=X_train["Instance ID"]),
    scoring=fun_scaled_neg_MAPE,
    verbose=True,
    n_jobs=-1,
)

# Run the search through the project helper, which receives a file name prefix for this problem's KNN model
fit_time = fun_fit_tuning(
    search_method=grid_search,
    X_train=X_train,
    y_train=y_train,
    file_name=optimization_problem + "_KNN",
)

# Show the scores of the fitted search (including the scores on the test set)
model_results_dict = fun_scores(
    model=grid_search,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    compute_test_scores=True,
)

# Show the cross-validation scores of all parameter combinations
results_df = fun_tuning_results(search_method=grid_search, search_space=param_grid)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Tuning fit time: 55s
CV MAPE (scaled) train data:  24.1158 %

Best model / parameter combination:


{'knn__n_neighbors': 8, 'scaler': StandardScaler()}


MAPE (scaled) test data:  22.9962 %
RMSE (scaled) test data: 4.5799

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,26.6366,23.3209,22.3591,26.0315,23.0882,21.1485,23.0841,21.7686,22.137,22.9962
RMSE,6.9956,5.7227,5.1465,4.8663,4.4717,4.1565,3.7555,3.8364,3.6771,4.5799


Cross validation scores of different parameter combinations:


Unnamed: 0,param_scaler,param_knn__n_neighbors,mean_test_score,converted_mean_fit_time
0,StandardScaler(),8,-0.241158,0s
1,StandardScaler(),9,-0.241195,0s
2,StandardScaler(),10,-0.24172,0s
3,StandardScaler(),11,-0.242185,0s
4,StandardScaler(),7,-0.24229,0s
5,MinMaxScaler(),7,-0.255876,0s
6,MinMaxScaler(),8,-0.255947,0s
7,MinMaxScaler(),9,-0.256556,0s
8,MinMaxScaler(),10,-0.256972,0s
9,MinMaxScaler(),11,-0.258623,0s


In [7]:
# Read the stored best parameters for this problem's KNN model
# NOTE(review): the file is a pickle (.pkl), presumably written during the tuning step — only load trusted files
best_params = fun_load_best_params(optimization_problem + "_KNN_best_params.pkl")

# Rebuild the pipeline (scaler placeholder + KNN regressor) and apply the loaded parameters
pipe = Pipeline(steps=[
    ("scaler", None),
    ("knn", KNeighborsRegressor()),
])
pipe.set_params(**best_params)

# Report cross-validation scores on the train set and scores on the test set (MAPE and RMSE)
# twice through the fun_scores helper: first with apply_scaling=False, then with apply_scaling=True.
# A banner line is printed before the second report to separate the two in the output.
for apply_scaling in (False, True):
    if apply_scaling:
        print("############### APPLY SCALING ###############")
    model_results_dict = fun_scores(
        model=pipe,
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        apply_scaling=apply_scaling,
        compute_test_scores=True,
    )

{'knn__n_neighbors': 8, 'scaler': StandardScaler()}

CV MAPE (original) train data:  26.1511 %
CV RMSE (original) train data: 5.6965
CV computation time: 19s

MAPE (original) test data:  24.8426 %
RMSE (original) test data: 5.5442
Model fit time: 4s

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,27.6196,25.0406,24.0148,27.6169,24.7961,22.507,24.8345,23.4157,25.4432,24.8426
RMSE,9.4513,7.0264,6.2572,5.8272,5.3116,4.7138,4.4556,4.2984,4.3183,5.5442


############### APPLY SCALING ###############
CV MAPE (scaled) train data:  24.1158 %
CV RMSE (scaled) train data: 4.7369
CV computation time: 20s

MAPE (scaled) test data:  22.9962 %
RMSE (scaled) test data: 4.5799

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,26.6366,23.3209,22.3591,26.0315,23.0882,21.1485,23.0841,21.7686,22.137,22.9962
RMSE,6.9956,5.7227,5.1465,4.8663,4.4717,4.1565,3.7555,3.8364,3.6771,4.5799
