In [1]:
import os
import time
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor

# Import helper functions
from ML_functions import fun_load_settings, fun_load_data, fun_preprocessing, fun_load_best_params
from ML_functions import fun_convert_time
from ML_functions import fun_scores

# Fallback optimization problem used when this notebook is executed manually (choose either "TSP" or "CVRP")
default_optimization_problem = "TSP"

# Determine optimization_problem depending on how the notebook is executed:
# when it is run by "main.ipynb", the value is loaded from "settings.json"; otherwise the default from above is used
# NOTE(review): this behavior is taken from the original comment; fun_load_settings lives in ML_functions and is not shown here
optimization_problem = fun_load_settings(default_optimization_problem)

# Load the data; fun_load_data also returns start_script, which the final cell uses to report the total run time
data, start_script = fun_load_data(optimization_problem)

# Preprocess the data and do the train/test split (train_size=0.8)
X_train, X_test, y_train, y_test, train_data = fun_preprocessing(data, train_size=0.8)

The notebook was executed by another notebook. :)
Optimization problem: 'TSP'


# **A. Instance-based models**
### **1. K-nearest Neighbor - KNN**

In [2]:
# Fetch the tuned hyperparameters for the KNN model
best_params = fun_load_best_params(optimization_problem, model_abbreviation="KNN")

# Build the pipeline and apply the tuned parameters in one expression
# (set_params returns the pipeline itself, so the result can be bound directly)
pipe = Pipeline(
    steps=[
        ("scaler", None),
        ("knn", KNeighborsRegressor()),
    ]
).set_params(**best_params)

# Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(
    model=pipe,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    compute_test_scores=True,
)

# Start the results collection; the following model cells add their entries to it
results_dict = dict(KNN=model_results_dict)

{'knn__n_neighbors': 8, 'scaler': StandardScaler()}

CV MAPE (scaled) train data: 22.66 %
CV RMSE (scaled) train data: 4.53
CV computation time: 16s

MAPE (scaled) test data: 21.32 %
RMSE (scaled) test data: 4.36
Model fit time: 0s
Model prediction time: 1s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,24.58,21.93,22.43,21.5,22.49,19.87,20.53,19.06,21.95,21.32
RMSE,6.45,5.46,5.0,4.55,4.38,3.98,3.7,3.55,3.49,4.36


# **B. Linear Models**
### **1. Linear Regression**

In [3]:
if optimization_problem != "TSP":
    print("This cell is only executed for the TSP!")
else:
    # Plain linear regression model (no hyperparameters to load)
    lr = LinearRegression()

    # Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE);
    # scaling is requested from fun_scores via apply_scaling=True
    model_results_dict = fun_scores(
        lr, X_train, y_train, X_test, y_test,
        apply_scaling=True,
        compute_test_scores=True,
    )
    results_dict["Linear Regression"] = model_results_dict

CV MAPE (scaled) train data: 20.64 %
CV RMSE (scaled) train data: 4.29
CV computation time: 2s

MAPE (scaled) test data: 21.09 %
RMSE (scaled) test data: 4.4
Model fit time: 0s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,22.21,18.51,18.6,16.47,16.6,14.35,19.04,21.86,35.82,21.09
RMSE,6.36,5.29,4.47,3.89,3.61,3.18,3.53,4.09,5.35,4.4


### **2. Ridge Regression (L2-Regularization)**

In [4]:
if optimization_problem != "TSP":
    print("This cell is only executed for the TSP!")
else:
    # Fetch the tuned hyperparameters for the Ridge model
    best_params = fun_load_best_params(optimization_problem, model_abbreviation="Ridge")

    # Create the model with the SVD solver and the tuned parameters
    ridge = Ridge(solver="svd", **best_params)

    # Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE);
    # scaling is requested from fun_scores via apply_scaling=True
    model_results_dict = fun_scores(
        ridge, X_train, y_train, X_test, y_test,
        apply_scaling=True,
        compute_test_scores=True,
    )
    results_dict["Ridge Regression"] = model_results_dict

{'alpha': 1}

CV MAPE (scaled) train data: 20.64 %
CV RMSE (scaled) train data: 4.29
CV computation time: 2s

MAPE (scaled) test data: 21.09 %
RMSE (scaled) test data: 4.4
Model fit time: 0s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,22.21,18.52,18.6,16.46,16.6,14.35,19.04,21.86,35.85,21.09
RMSE,6.35,5.29,4.47,3.89,3.61,3.18,3.53,4.09,5.35,4.4


### **3. Polynomial Regression**
**Preprocessor to create interactions and polynomial features**

In [5]:
# Load best parameters of the model
best_params = fun_load_best_params(optimization_problem, model_abbreviation="PR")

# Define the model pipeline and apply the tuned parameters
pipe = Pipeline(steps=[("preprocessor", None),
                       ("lr", LinearRegression())])
pipe.set_params(**best_params)

# Reduce number of cores (n_jobs) to avoid system overload for poly_features and degree=3.
# An explicit lookup makes sure cv_n_jobs is always freshly bound in this cell: an unknown
# problem name fails here with a clear message instead of a later NameError (or, worse,
# silently reusing a stale cv_n_jobs left over in the kernel).
cv_n_jobs_per_problem = {"TSP": 2, "CVRP": 1}
if optimization_problem not in cv_n_jobs_per_problem:
    raise ValueError(
        f"Unsupported optimization problem {optimization_problem!r}; "
        f"expected one of {sorted(cv_n_jobs_per_problem)}"
    )
cv_n_jobs = cv_n_jobs_per_problem[optimization_problem]
print("Used cores:", cv_n_jobs)

# Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(pipe, X_train, y_train, X_test, y_test, compute_test_scores=True, cv_n_jobs=cv_n_jobs)
results_dict["Polynomial Regression"] = model_results_dict

{'feature_set': 'all_features (35)',
 'preprocessor__onehot__binning__n_bins': 5,
 'preprocessor__poly__degree': 3}

Used cores: 2
CV MAPE (scaled) train data: 4.569999999999999 %
CV RMSE (scaled) train data: 1.16
CV computation time: 13m, 4s

MAPE (scaled) test data: 4.35 %
RMSE (scaled) test data: 1.09
Model fit time: 6m, 21s
Model prediction time: 1s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,3.3,3.12,3.5,3.55,3.88,4.1,4.82,5.26,5.7,4.35
RMSE,1.17,1.03,1.08,1.04,0.99,1.02,1.08,1.17,1.2,1.09


# **C. Decision Tree**

In [6]:
if optimization_problem != "TSP":
    print("This cell is only executed for the TSP!")
else:
    # Fetch the tuned hyperparameters for the decision tree
    best_params = fun_load_best_params(optimization_problem, model_abbreviation="DT")

    # Create the tree with the tuned parameters and a fixed seed
    tree = DecisionTreeRegressor(**best_params, random_state=42)

    # Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(
        tree, X_train, y_train, X_test, y_test,
        compute_test_scores=True,
    )
    results_dict["Decision Tree"] = model_results_dict

{'min_samples_leaf': 16,
 'min_impurity_decrease': 0.0001,
 'max_leaf_nodes': None,
 'max_depth': 25}

CV MAPE (scaled) train data: 11.57 %
CV RMSE (scaled) train data: 3.84
CV computation time: 4s

MAPE (scaled) test data: 11.16 %
RMSE (scaled) test data: 3.61
Model fit time: 2s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,9.98,11.18,11.05,11.39,11.21,10.83,11.11,11.31,11.68,11.16
RMSE,4.53,4.47,4.08,3.92,3.79,3.34,3.28,3.04,2.97,3.61


# **D. Ensembles of Decision Trees**
### **1. Random Forest**

In [7]:
# Fetch the tuned hyperparameters for the random forest
best_params = fun_load_best_params(optimization_problem, model_abbreviation="RF")

# Create the forest: 500 bootstrapped trees, all cores, fixed seed, plus the tuned parameters
forest = RandomForestRegressor(
    n_estimators=500,
    bootstrap=True,
    **best_params,
    n_jobs=-1,
    random_state=42,
)

# Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(
    forest, X_train, y_train, X_test, y_test,
    compute_test_scores=True,
)
results_dict["Random Forest"] = model_results_dict

{'min_samples_split': 11,
 'min_samples_leaf': 6,
 'min_impurity_decrease': 0.0001,
 'max_leaf_nodes': None,
 'max_features': 25,
 'max_depth': 13}

CV MAPE (scaled) train data: 7.93 %
CV RMSE (scaled) train data: 2.38
CV computation time: 2m, 59s

MAPE (scaled) test data: 7.76 %
RMSE (scaled) test data: 2.29
Model fit time: 1m, 49s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,6.72,7.05,7.33,7.16,7.71,7.62,7.76,8.27,8.88,7.76
RMSE,2.86,2.68,2.59,2.33,2.32,2.2,2.05,2.06,2.07,2.29


### **2. Gradient Boosting Regression Trees**

In [8]:
if optimization_problem != "TSP":
    print("This cell is only executed for the TSP!")
else:
    # Fetch the tuned hyperparameters for the gradient boosting model
    best_params = fun_load_best_params(optimization_problem, model_abbreviation="GBRT")

    # Create the model with the tuned parameters and a fixed seed
    gbrt = GradientBoostingRegressor(**best_params, random_state=42)

    # Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(
        gbrt, X_train, y_train, X_test, y_test,
        compute_test_scores=True,
    )
    results_dict["GBRT"] = model_results_dict

{'n_estimators': 120,
 'min_samples_split': 6,
 'min_samples_leaf': 23,
 'max_leaf_nodes': 1500,
 'max_features': 25,
 'max_depth': 10,
 'learning_rate': 0.1}

CV MAPE (scaled) train data: 6.09 %
CV RMSE (scaled) train data: 1.82
CV computation time: 2m, 0s

MAPE (scaled) test data: 5.75 %
RMSE (scaled) test data: 1.71
Model fit time: 2m, 28s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,4.72,5.08,5.57,5.35,5.69,5.45,5.88,6.17,6.63,5.75
RMSE,2.06,1.9,1.9,1.74,1.71,1.58,1.56,1.61,1.6,1.71


### **3. Extreme Gradient Boosting: XGBoost-Package**

In [9]:
# Fetch the tuned hyperparameters for the XGBoost model
best_params = fun_load_best_params(optimization_problem, model_abbreviation="XGBoost")

# Create the regressor: 750 boosting rounds, squared-error objective, fixed seed, plus the tuned parameters
xgboost = xgb.XGBRegressor(
    n_estimators=750,
    objective="reg:squarederror",
    **best_params,
    random_state=42,
)

# Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(
    xgboost, X_train, y_train, X_test, y_test,
    compute_test_scores=True,
)
results_dict["XGBoost"] = model_results_dict

{'colsample_bytree': 1.0,
 'learning_rate': 0.05,
 'max_depth': 7,
 'subsample': 0.6}

CV MAPE (scaled) train data: 5.37 %
CV RMSE (scaled) train data: 1.54
CV computation time: 17s

MAPE (scaled) test data: 5.12 %
RMSE (scaled) test data: 1.46
Model fit time: 6s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,4.36,4.37,4.83,4.67,5.15,4.82,5.15,5.49,6.1,5.12
RMSE,1.79,1.59,1.59,1.44,1.45,1.34,1.31,1.39,1.44,1.46


# **E. Linear Support Vector Machines (SVM) and Kernel Machines**
**Linear SVM**

In [10]:
if optimization_problem != "TSP":
    print("This cell is only executed for the TSP!")
else:
    # Fetch the tuned hyperparameters for the linear SVM
    best_params = fun_load_best_params(optimization_problem, model_abbreviation="SVM")

    # Build the pipeline and apply the tuned parameters in one expression
    # (set_params returns the pipeline itself, so the result can be bound directly)
    pipe = Pipeline(
        steps=[
            ("scaler", None),
            ("SVM", LinearSVR(max_iter=10000, random_state=42)),
        ]
    ).set_params(**best_params)

    # Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(
        pipe, X_train, y_train, X_test, y_test,
        compute_test_scores=True,
    )
    results_dict["SVM"] = model_results_dict

{'SVM__C': 0.1, 'SVM__epsilon': 0.01, 'scaler': MinMaxScaler()}

CV MAPE (scaled) train data: 19.18 %
CV RMSE (scaled) train data: 4.12
CV computation time: 2s

MAPE (scaled) test data: 19.53 %
RMSE (scaled) test data: 4.2
Model fit time: 0s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,25.11,21.41,20.51,17.34,16.77,13.96,17.02,18.15,26.84,19.53
RMSE,6.87,5.72,4.81,4.03,3.66,3.04,3.13,3.37,4.13,4.2


**Kernel Machine with Gaussian Kernel**

In [11]:
# Load best parameters of the model
best_params = fun_load_best_params(optimization_problem, model_abbreviation="KM")

# Define the model pipeline
pipe = Pipeline(steps=[("scaler", None), 
                       ("SVM", SVR(kernel="rbf", cache_size=2000, max_iter=500000))])
pipe.set_params(**best_params)

# Suppress ConvergenceWarning (model will probably not converge within 500,000 iterations).
# The filter is scoped to this evaluation with catch_warnings, so it is removed again afterwards
# and does not hide convergence warnings of models evaluated in later cells.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(pipe, X_train, y_train, X_test, y_test, compute_test_scores=True)

results_dict["Kernel Machine"] = model_results_dict

{'SVM__C': 100,
 'SVM__epsilon': 1,
 'SVM__gamma': 'scale',
 'scaler': MinMaxScaler()}

CV MAPE (scaled) train data: 4.25 %
CV RMSE (scaled) train data: 1.09
CV computation time: 13m, 5s

MAPE (scaled) test data: 4.05 %
RMSE (scaled) test data: 1.04
Model fit time: 4m, 3s
Model prediction time: 19s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,3.38,2.82,3.44,3.35,3.62,3.87,4.38,4.81,5.21,4.05
RMSE,1.13,0.95,1.0,0.96,0.95,0.97,1.02,1.12,1.14,1.04


# **F. Neural Network - Multi Layer Perceptron**

In [12]:
# Load best parameters of the model
best_params = fun_load_best_params(optimization_problem, model_abbreviation="NN")

# Define the parameter hidden_layer_sizes depending on the problem setting.
# An explicit lookup makes sure hls is always freshly bound in this cell: an unknown problem
# name fails here with a clear message instead of a later NameError (or, worse, silently
# reusing a stale hls left over in the kernel).
hidden_layer_sizes_per_problem = {"TSP": (256, 128, 64), "CVRP": (64, 32, 16)}
if optimization_problem not in hidden_layer_sizes_per_problem:
    raise ValueError(
        f"Unsupported optimization problem {optimization_problem!r}; "
        f"expected one of {sorted(hidden_layer_sizes_per_problem)}"
    )
hls = hidden_layer_sizes_per_problem[optimization_problem]
print("Hidden layer sizes:", hls)

# Create pipeline (standard scaling followed by the MLP) and apply the tuned parameters
pipe = make_pipeline(StandardScaler(), 
                     MLPRegressor(hidden_layer_sizes=hls, activation="relu", learning_rate="adaptive", 
                                  max_iter=1000, random_state=42))
pipe.set_params(**best_params)

# Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(pipe, X_train, y_train, X_test, y_test, compute_test_scores=True)
results_dict["Neural Network"] = model_results_dict

{'mlpregressor__alpha': 0.1,
 'mlpregressor__batch_size': 32,
 'mlpregressor__early_stopping': False,
 'mlpregressor__learning_rate_init': 0.001,
 'mlpregressor__solver': 'sgd'}

Hidden layer sizes: (256, 128, 64)
CV MAPE (scaled) train data: 3.52 %
CV RMSE (scaled) train data: 0.96
CV computation time: 12m, 8s

MAPE (scaled) test data: 3.3000000000000003 %
RMSE (scaled) test data: 0.9
Model fit time: 33m, 11s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.32,2.39,2.74,2.81,3.11,3.24,3.59,3.94,4.17,3.3
RMSE,0.8,0.85,0.85,0.85,0.86,0.89,0.92,0.98,0.96,0.9


# **G. Compare Results**

In [13]:
# Get model names, the scores for each model and the computation times
# (the dict views are materialized once so every table is built from the same ordered lists)
model_names = list(results_dict.keys())
model_results = list(results_dict.values())
cv_times = [result["CV computation time"] for result in model_results]
fit_times = [result["Model fit time"] for result in model_results]
prediction_times = [result["Model prediction time"] for result in model_results]
MAPE_train_scores = [result["MAPE"]["Train data"] for result in model_results]
MAPE_test_scores = [result["MAPE"]["Test data"] for result in model_results]
RMSE_train_scores = [result["RMSE"]["Train data"] for result in model_results]
RMSE_test_scores = [result["RMSE"]["Test data"] for result in model_results]

# Show train and test scores for each model and the computation times in a Data Frame
# (score tables are ordered by test score, best model first)
run_times_df = pd.DataFrame([cv_times, fit_times, prediction_times], index=["CV times", "Fit times", "Prediction times"], columns=model_names)
MAPE_df = pd.DataFrame(data=[MAPE_train_scores, MAPE_test_scores], columns=model_names, index=["Train set", "Test set"]).sort_values(by="Test set", axis=1)
MAPE_df.columns.name = "MAPE scores"
RMSE_df = pd.DataFrame(data=[RMSE_train_scores, RMSE_test_scores], columns=model_names, index=["Train set", "Test set"]).sort_values(by="Test set", axis=1)
RMSE_df.columns.name = "RMSE scores"
display(run_times_df, MAPE_df, RMSE_df)

# Show scores per instance size for each model (ordered by the mean score, best model first)
MAPE_cat_scores = [result["Scores per instance size"].loc["MAPE"] for result in model_results]
RMSE_cat_scores = [result["Scores per instance size"].loc["RMSE"] for result in model_results]
MAPE_cat_scores_df = pd.DataFrame(data=MAPE_cat_scores, index=model_names).sort_values(by="Mean")
MAPE_cat_scores_df.columns.name = "MAPE scores per instance size"
RMSE_cat_scores_df = pd.DataFrame(data=RMSE_cat_scores, index=model_names).sort_values(by="Mean")
RMSE_cat_scores_df.columns.name = "RMSE scores per instance size"
display(MAPE_cat_scores_df, RMSE_cat_scores_df)

# Save data frames with results into an excel file
file_path = f"04_test_results/{optimization_problem}_results.xlsx"

# Make sure the output folder exists, so the results of a long run are not lost to a missing directory
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Use ExcelWriter to write multiple DataFrames to the same file
with pd.ExcelWriter(file_path) as writer:
    run_times_df.to_excel(writer, sheet_name="run_times")
    MAPE_df.to_excel(writer, sheet_name="MAPE_scores")
    RMSE_df.to_excel(writer, sheet_name="RMSE_scores")
    MAPE_cat_scores_df.to_excel(writer, sheet_name="MAPE_cat_scores")
    RMSE_cat_scores_df.to_excel(writer, sheet_name="RMSE_cat_scores")

# Print total script run time
print("Total script computation time:", fun_convert_time(start=start_script, end=time.time()))

Unnamed: 0,KNN,Linear Regression,Ridge Regression,Polynomial Regression,Decision Tree,Random Forest,GBRT,XGBoost,SVM,Kernel Machine,Neural Network
CV times,16s,2s,2s,"13m, 4s",4s,"2m, 59s","2m, 0s",17s,2s,"13m, 5s","12m, 8s"
Fit times,0s,0s,0s,"6m, 21s",2s,"1m, 49s","2m, 28s",6s,0s,"4m, 3s","33m, 11s"
Prediction times,1s,0s,0s,1s,0s,0s,0s,0s,0s,19s,0s


MAPE scores,Neural Network,Kernel Machine,Polynomial Regression,XGBoost,GBRT,Random Forest,Decision Tree,SVM,Linear Regression,Ridge Regression,KNN
Train set,3.52,4.25,4.57,5.37,6.09,7.93,11.57,19.18,20.64,20.64,22.66
Test set,3.3,4.05,4.35,5.12,5.75,7.76,11.16,19.53,21.09,21.09,21.32


RMSE scores,Neural Network,Kernel Machine,Polynomial Regression,XGBoost,GBRT,Random Forest,Decision Tree,SVM,KNN,Linear Regression,Ridge Regression
Train set,0.96,1.09,1.16,1.54,1.82,2.38,3.84,4.12,4.53,4.29,4.29
Test set,0.9,1.04,1.09,1.46,1.71,2.29,3.61,4.2,4.36,4.4,4.4


MAPE scores per instance size,6,7,8,9,10,11,12,13,14,Mean
Neural Network,2.32,2.39,2.74,2.81,3.11,3.24,3.59,3.94,4.17,3.3
Kernel Machine,3.38,2.82,3.44,3.35,3.62,3.87,4.38,4.81,5.21,4.05
Polynomial Regression,3.3,3.12,3.5,3.55,3.88,4.1,4.82,5.26,5.7,4.35
XGBoost,4.36,4.37,4.83,4.67,5.15,4.82,5.15,5.49,6.1,5.12
GBRT,4.72,5.08,5.57,5.35,5.69,5.45,5.88,6.17,6.63,5.75
Random Forest,6.72,7.05,7.33,7.16,7.71,7.62,7.76,8.27,8.88,7.76
Decision Tree,9.98,11.18,11.05,11.39,11.21,10.83,11.11,11.31,11.68,11.16
SVM,25.11,21.41,20.51,17.34,16.77,13.96,17.02,18.15,26.84,19.53
Linear Regression,22.21,18.51,18.6,16.47,16.6,14.35,19.04,21.86,35.82,21.09
Ridge Regression,22.21,18.52,18.6,16.46,16.6,14.35,19.04,21.86,35.85,21.09


RMSE scores per instance size,6,7,8,9,10,11,12,13,14,Mean
Neural Network,0.8,0.85,0.85,0.85,0.86,0.89,0.92,0.98,0.96,0.9
Kernel Machine,1.13,0.95,1.0,0.96,0.95,0.97,1.02,1.12,1.14,1.04
Polynomial Regression,1.17,1.03,1.08,1.04,0.99,1.02,1.08,1.17,1.2,1.09
XGBoost,1.79,1.59,1.59,1.44,1.45,1.34,1.31,1.39,1.44,1.46
GBRT,2.06,1.9,1.9,1.74,1.71,1.58,1.56,1.61,1.6,1.71
Random Forest,2.86,2.68,2.59,2.33,2.32,2.2,2.05,2.06,2.07,2.29
Decision Tree,4.53,4.47,4.08,3.92,3.79,3.34,3.28,3.04,2.97,3.61
SVM,6.87,5.72,4.81,4.03,3.66,3.04,3.13,3.37,4.13,4.2
KNN,6.45,5.46,5.0,4.55,4.38,3.98,3.7,3.55,3.49,4.36
Linear Regression,6.36,5.29,4.47,3.89,3.61,3.18,3.53,4.09,5.35,4.4


Total script computation time: 1h, 33m
