In [1]:
import pandas as pd
import numpy as np
import multiprocessing

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.linear_model import LinearRegression

# Import helper functions
from ML_functions import fun_load_settings, fun_load_data, fun_preprocessing, fun_fit_tuning
from ML_functions import fun_scaled_neg_MAPE, fun_tuning_results, fun_scores

# Fallback problem type, used only when this notebook is executed by hand (valid choices: "TSP" or "CVRP")
default_optimization_problem = "TSP"

# Resolve the problem type actually used for this run:
# when the notebook is driven by "main.ipynb" the value comes from "settings.json",
# otherwise the fallback defined above is taken
optimization_problem = fun_load_settings(default_optimization_problem)

# Read the data set of the selected problem; fun_load_data also starts the script's time count
data, start_script = fun_load_data(optimization_problem)

# Preprocess the data; the train/test split (80 % train) is done inside fun_preprocessing
X_train, X_test, y_train, y_test, train_data = fun_preprocessing(data, train_size=0.8)

# The most important features were stored by "b1_feature_selection.ipynb"; the column labels of that file are the feature names
top20_features_file = f"02_best_features/{optimization_problem}_top20_features"
top20_features = list(pd.read_csv(top20_features_file))

# Complete feature list, taken from the columns of the training frame
all_features = list(X_train.columns)

The notebook is executed directly. :)
Optimization problem: 'TSP'


# **TSP**

In [2]:
# Branch 1 of the preprocessing: discretize each selected feature into 10 equal-width bins
# and one-hot encode the resulting bin index
binning_step = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="uniform")
onehot_step = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
onehot_transformer = Pipeline(steps=[("binning", binning_step), ("onehot", onehot_step)])

# Branch 2 of the preprocessing: polynomial expansion (degree 3, including the bias column)
poly_step = PolynomialFeatures(degree=3, interaction_only=False, include_bias=True)
poly_transformer = Pipeline(steps=[("poly", poly_step)])

# Apply both branches to the top-20 features; every other column is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", onehot_transformer, top20_features),
        ("poly", poly_transformer, top20_features),
    ],
    remainder="passthrough",
)

# Full model: preprocessing followed by an ordinary linear regression
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("lr", LinearRegression()),
    ]
)

# Cross-validated performance estimate on the train set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(model=pipe, X_train=X_train, y_train=y_train)

CV MAPE (scaled) train data: 6.3222000000000005 %
CV RMSE (scaled) train data: 1.5603
CV computation time: 31s


In [None]:
# Report the number of logical CPU cores (context for the n_jobs choice further down)
print(f"Total number of logical CPU cores on this machine: {multiprocessing.cpu_count()} cores")

# Binning branch: discretize into equal-width bins, then one-hot encode the bin index.
# n_bins is intentionally left at its default here because it is tuned by the grid search below.
onehot_transformer = Pipeline(steps=[("binning", KBinsDiscretizer(encode="ordinal", strategy="uniform")),
                                     ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))])

# Candidate preprocessor 1: binning and polynomial expansion on the top-20 features only,
# all remaining columns are passed through unchanged
preprocessor_top20 = ColumnTransformer(transformers=[("onehot", onehot_transformer, top20_features),
                                                     ("poly", PolynomialFeatures(interaction_only=False, include_bias=True), top20_features)],
                                       remainder="passthrough")

# Candidate preprocessor 2: binning on the top-20 features, polynomial expansion on ALL features
preprocessor_all = ColumnTransformer(transformers=[("onehot", onehot_transformer, top20_features),
                                                   ("poly", PolynomialFeatures(interaction_only=False, include_bias=True), all_features)],
                                     remainder="passthrough")

# Model pipeline; the "preprocessor" step is a placeholder that the grid search fills with one of the candidates
pipe = Pipeline(steps=[("preprocessor", None),
                       ("lr", LinearRegression())])

# Both preprocessor candidates are tuned over the same hyperparameter values,
# so the grid is generated from one template instead of two copy-pasted dictionaries
param_grid = [{"preprocessor": [candidate],
               "preprocessor__onehot__binning__n_bins": [5, 10, 15],
               "preprocessor__poly__degree": [2, 3]}
              for candidate in (preprocessor_top20, preprocessor_all)]

# Grouped 3-fold splits: rows sharing an "Instance ID" are never split between train and validation fold.
# The splits are materialized as a list because GroupKFold.split returns a one-shot generator:
# passed directly as cv it is exhausted after the first fit and makes the search object impossible to clone.
cv_groups = X_train.index.get_level_values(level="Instance ID")
cv_splits = list(GroupKFold(n_splits=3).split(X_train, y_train, groups=cv_groups))

# Set up GridSearchCV
cv_n_jobs = 2 # Reduce number of cores (n_jobs) to avoid system overload for poly_features and degree=3
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid,
                           cv=cv_splits,
                           scoring=fun_scaled_neg_MAPE, refit=False, verbose=True, n_jobs=cv_n_jobs) # n_jobs=6: 55m, 9s (screen freeze)
tuning_details = fun_fit_tuning(grid_search, X_train, y_train, file_name=optimization_problem + "_PR")

# Estimate model performance with cross validation on the train set and attach the tuning details
model_results_dict = fun_scores(grid_search, X_train, y_train)
model_results_dict.update(tuning_details)

# View grid search CV scores of all parameter combinations
results_df = fun_tuning_results(grid_search, param_grid)

Total number of logical CPU cores on this machine: 12 cores
Fitting 3 folds for each of 12 candidates, totalling 36 fits


{'Search type': 'GridSearchCV',
 'Parameter combinations': 12,
 'Total tuning time': '56m, 21s',
 'Total tuning fit time': '33m, 59s',
 'Total tuning prediction time': '16s'}

CV MAPE (scaled) train data:  4.714700000000001 %


**Best model / parameter combination:**

{'feature_set': 'all_features (37)',
 'preprocessor__onehot__binning__n_bins': 10,
 'preprocessor__poly__degree': 3}

**Cross validation scores of different parameter combinations:**

Unnamed: 0,n_bins,degree,preprocessor,mean_test_score,converted_mean_fit_time
0,10,3,all_features (37),-0.047147,"11m, 31s"
1,5,3,all_features (37),-0.047497,"10m, 15s"
2,15,3,all_features (37),-0.047699,"11m, 14s"
3,10,3,top20_features,-0.063338,12s
4,5,3,top20_features,-0.063451,11s
5,15,3,top20_features,-0.063832,13s
6,10,2,all_features (37),-0.074679,3s
7,15,2,all_features (37),-0.0747,7s
8,5,2,all_features (37),-0.075966,4s
9,15,2,top20_features,-0.095698,2s


# **CVRP**

In [3]:
# Fallback problem type, used only when this notebook is executed by hand (valid choices: "TSP" or "CVRP")
default_optimization_problem = "CVRP"

# Resolve the problem type actually used for this run:
# when the notebook is driven by "main.ipynb" the value comes from "settings.json",
# otherwise the fallback defined above is taken
optimization_problem = fun_load_settings(default_optimization_problem)

# Read the data set of the selected problem; fun_load_data also starts the script's time count
data, start_script = fun_load_data(optimization_problem)

# Preprocess the data; the train/test split (80 % train) is done inside fun_preprocessing
X_train, X_test, y_train, y_test, train_data = fun_preprocessing(data, train_size=0.8)

# The most important features were stored by "b1_feature_selection.ipynb"; the column labels of that file are the feature names
top20_features_file = f"02_best_features/{optimization_problem}_top20_features"
top20_features = list(pd.read_csv(top20_features_file))

# Complete feature list, taken from the columns of the training frame
all_features = list(X_train.columns)

The notebook is executed directly. :)
Optimization problem: 'CVRP'


In [None]:
# Binning branch: discretize into equal-width bins, then one-hot encode the bin index.
# n_bins is intentionally left at its default here because it is tuned by the grid search below.
onehot_transformer = Pipeline(steps=[("binning", KBinsDiscretizer(encode="ordinal", strategy="uniform")),
                                     ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))])

# Candidate preprocessor 1: binning and polynomial expansion on the top-20 features only,
# all remaining columns are passed through unchanged
preprocessor_top20 = ColumnTransformer(transformers=[("onehot", onehot_transformer, top20_features),
                                                     ("poly", PolynomialFeatures(interaction_only=False, include_bias=True), top20_features)],
                                       remainder="passthrough")

# Candidate preprocessor 2: binning on the top-20 features, polynomial expansion on ALL features
preprocessor_all = ColumnTransformer(transformers=[("onehot", onehot_transformer, top20_features),
                                                   ("poly", PolynomialFeatures(interaction_only=False, include_bias=True), all_features)],
                                     remainder="passthrough")

# Model pipeline; the "preprocessor" step is a placeholder that the grid search fills with one of the candidates
pipe = Pipeline(steps=[("preprocessor", None),
                       ("lr", LinearRegression())])

# Both preprocessor candidates are tuned over the same hyperparameter values,
# so the grid is generated from one template instead of two copy-pasted dictionaries
param_grid = [{"preprocessor": [candidate],
               "preprocessor__onehot__binning__n_bins": [5, 10, 15],
               "preprocessor__poly__degree": [2, 3]}
              for candidate in (preprocessor_top20, preprocessor_all)]

# Grouped 3-fold splits: rows sharing an "Instance ID" are never split between train and validation fold.
# The splits are materialized as a list because GroupKFold.split returns a one-shot generator:
# passed directly as cv it is exhausted after the first fit and makes the search object impossible to clone.
cv_groups = X_train.index.get_level_values(level="Instance ID")
cv_splits = list(GroupKFold(n_splits=3).split(X_train, y_train, groups=cv_groups))

# Set up GridSearchCV
cv_n_jobs = 2 # Reduce number of cores (n_jobs) to avoid system overload for poly_features and degree=3
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid,
                           cv=cv_splits,
                           scoring=fun_scaled_neg_MAPE, refit=False, verbose=True, n_jobs=cv_n_jobs)
tuning_details = fun_fit_tuning(grid_search, X_train, y_train, file_name=optimization_problem + "_PR")

# Estimate model performance with cross validation on the train set and attach the tuning details
model_results_dict = fun_scores(grid_search, X_train, y_train)
model_results_dict.update(tuning_details)

# View grid search CV scores of all parameter combinations
results_df = fun_tuning_results(grid_search, param_grid)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


{'Search type': 'GridSearchCV',
 'Parameter combinations': 12,
 'Total tuning time': '2h, 18m',
 'Total tuning fit time': '1h, 26m',
 'Total tuning prediction time': '26s'}

CV MAPE (scaled) train data:  5.7608 %


**Best model / parameter combination:**

{'features': 'Number of features: 42',
 'preprocessor__onehot__binning__n_bins': 15,
 'preprocessor__poly__degree': 3}

**Cross validation scores of different parameter combinations:**

Unnamed: 0,preprocessor,n_bins,degree,mean_test_score,converted_mean_fit_time
0,Number of features: 42,15,3,-0.057608,"25m, 56s"
1,Number of features: 42,5,3,-0.057794,"27m, 42s"
2,Number of features: 42,10,3,-0.057829,"31m, 30s"
3,top20_features,15,3,-0.061453,14s
4,top20_features,10,3,-0.061517,13s
5,top20_features,5,3,-0.06173,11s
6,Number of features: 42,15,2,-0.076957,12s
7,Number of features: 42,10,2,-0.077256,4s
8,Number of features: 42,5,2,-0.077449,5s
9,top20_features,15,2,-0.087148,2s
