In [2]:
import time
import pickle
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, PolynomialFeatures, StandardScaler, MinMaxScaler, RobustScaler 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Import helperfunctions
from ML_functions import fun_load_data, fun_save_file, fun_preprocessing, fun_load_best_params
from ML_functions import fun_convert_time
from ML_functions import fun_scores

# Start the wall-clock timer used for the total run time reported in the last cell
start_script = time.time()

# Read the optimization problem ('TSP' or 'CVRP') and the train set size from the settings file.
# NOTE(review): pickle.load can execute arbitrary code -- only open a settings.pkl you created yourself.
with open('settings.pkl', 'rb') as file:
    settings = pickle.load(file)
optimization_problem = settings['optimization_problem']
train_size = settings['train_size']

# Manual override: assign 'TSP' or 'CVRP' and the desired train share here.
# NOTE(review): these two assignments unconditionally replace the values loaded from settings.pkl above.
optimization_problem = 'TSP'
train_size = 0.7

# Load the data set of the chosen problem and split it into features and target
data = fun_load_data(optimization_problem)
X, y, train_data = fun_preprocessing(data)

# Hold out a test set (fixed seed, so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=train_size,
    random_state=42,
)

# From here on `train_size` is a display string with the realised train share in percent
train_size = f'{int(np.round(100 * len(X_train)/len(X)))} %'

# Container that collects the score dictionaries of all models
results_dict = {}

# Column names of the file written by 'feature_selection.ipynb' = most important continuous features
top_features = list(
    pd.read_csv('02_best_features/' + optimization_problem + '_top_features')
)

**Preprocessor to create interactions and polynomial features**

In [2]:
# Branch 1: cut each selected feature into 30 equal-width bins (ordinal codes) and one-hot encode the bin index
onehot_steps = [
    ('binning', KBinsDiscretizer(n_bins=30, encode='ordinal', strategy='uniform')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
onehot_transformer = Pipeline(steps=onehot_steps)

# Branch 2: all polynomial and interaction terms up to degree 3 (no constant column)
poly_steps = [
    ('poly', PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)),
]
poly_transformer = Pipeline(steps=poly_steps)

# Apply both branches to the top features; every other column is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot_transformer, top_features),
        ('poly', poly_transformer, top_features),
    ],
    remainder='passthrough',
)

# **A. Instance-based models**
### **1. K-nearest Neighbor - KNN**

In [None]:
# Fetch the stored best parameters for the KNN pipeline
best_params = fun_load_best_params(optimization_problem + '_KNN_best_params.pkl')

# Build the pipeline and apply the loaded parameters; the 'scaler' step starts as a
# placeholder (None) so that the parameter set can plug in a scaler
pipe = Pipeline(steps=[
    ('scaler', None),
    ('knn', KNeighborsRegressor()),
]).set_params(**best_params)

# Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(
    model=pipe,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    train_data=train_data,
    compute_test_scores=True,
)
results_dict['KNN'] = model_results_dict

{'knn__n_neighbors': 9, 'scaler': StandardScaler()}

CV MAPE (scaled) train data:  19.2995 %
CV RMSE (scaled) train data: 3.7713
Computation time: 26s
Fitting!

MAPE test data: 19.6788 %
RMSE test data: 3.628

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,18.4746,20.8207,20.7974,18.8547,21.4698,23.8151,17.7843,16.7983,19.2793,19.6788
RMSE,5.2615,4.7313,4.2218,3.8422,3.6965,3.1906,3.1287,2.866,2.8808,3.628


# **B. Linear Models**
### **1. Linear Regression**

In [None]:
# Pipeline: feature expansion (binning/one-hot + polynomial terms) followed by ordinary least squares
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr', LinearRegression()),
])

# Cross-validate (10 folds) on the train set and score on the test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(
    model=pipe,
    X_train=X_train, y_train=y_train,
    cv=10,
    X_test=X_test, y_test=y_test,
    train_data=train_data,
    compute_test_scores=True,
)
results_dict['Linear Regression'] = model_results_dict

CV MAPE train data:  4.7679 %
CV RMSE train data:  1.0483
CV computation time: 1m, 51s

MAPE test data: 4.3062000000000005 %
RMSE test data: 0.8491

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.2397,2.6673,3.6717,3.0031,3.9619,5.9924,4.5414,4.7117,5.498,4.3062
RMSE,0.6267,0.664,0.7254,0.6922,0.734,0.8677,0.9294,0.9581,1.0277,0.8491


### **2. Ridge Regression (L2-Regularization)**

In [16]:
if optimization_problem == 'TSP':

    # Fetch the stored best parameters for the Ridge pipeline
    best_params = fun_load_best_params(optimization_problem + '_Ridge_best_params.pkl')

    # Pipeline: feature expansion followed by Ridge (SVD solver), configured with the loaded parameters
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('ridge', Ridge(solver='svd')),
    ]).set_params(**best_params)

    # Cross-validate (5 folds) on the train set and score on the test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(
        model=pipe,
        X_train=X_train, y_train=y_train,
        cv=5,
        X_test=X_test, y_test=y_test,
        train_data=train_data,
        compute_test_scores=True,
    )
    results_dict['Ridge Regression'] = model_results_dict

else:
    print('This cell is only executed for the TSP!')

{'ridge__alpha': 0.1}

CV MAPE train data:  4.7758 %
CV RMSE train data:  1.0474
CV computation time: 2m, 10s

MAPE test data: 4.3061 %
RMSE test data: 0.8498

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,3.0919,3.6386,4.6379,3.9201,4.8465,6.8053,5.4254,5.5467,6.231,4.3061
RMSE,0.8341,0.8555,1.0576,0.9099,0.9549,1.068,1.1151,1.1548,1.2178,0.8498


# **C. Decision Tree**

In [24]:
if optimization_problem == 'TSP':

    # Fetch the stored best parameters for the decision tree
    best_params = fun_load_best_params(optimization_problem + '_DT_GS_best_params.pkl')

    # Reference values (disabled) -- uncomment to bypass the parameter file:
    #best_params = {'max_depth': 20, 'max_leaf_nodes': 1900, 'min_impurity_decrease': 0, 'min_samples_leaf': 20}

    # Instantiate the tree with the loaded parameters and a fixed seed
    tree = DecisionTreeRegressor(
        **best_params,
        random_state=0,
    )

    # Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(
        model=tree,
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        train_data=train_data,
        compute_test_scores=True,
    )
    results_dict['Decision Tree'] = model_results_dict

else:
    print('This cell is only executed for the TSP!')

{'max_depth': 20, 'min_impurity_decrease': 0.002}

CV MAPE train data:  10.7354 %
CV RMSE train data:  3.5223
CV computation time: 7s

MAPE test data: 7.8854999999999995 %
RMSE test data: 2.3304

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,5.5518,6.3491,6.9466,7.2805,7.5332,8.8069,8.2331,8.5186,9.1783,7.8855
RMSE,2.6363,2.5754,2.5586,2.4048,2.2587,2.3153,2.2222,2.2014,2.1437,2.3304


# **D. Ensembles of Decision Trees**
### **1. Random Forest**

In [6]:
# Fetch the stored best parameters for the random forest
best_params = fun_load_best_params(optimization_problem + '_RF_best_params.pkl')

# Reference values (disabled) -- uncomment to bypass the parameter file:
#best_params = {'max_depth': 23, 'max_features': 36, 'max_leaf_nodes': 1359, 
#               'min_impurity_decrease': 0.00006018111448827135, 'min_samples_leaf': 14}

# Instantiate a 200-tree forest with the loaded parameters, using all cores and a fixed seed
forest = RandomForestRegressor(
    n_estimators=200,
    **best_params,
    n_jobs=-1,
    random_state=0,
)

# Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(
    model=forest,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    train_data=train_data,
    compute_test_scores=True,
)
results_dict['Random Forest'] = model_results_dict

{'max_features': 35,
 'max_depth': 25,
 'max_leaf_nodes': 1400,
 'min_samples_leaf': 15,
 'min_impurity_decrease': 0}

CV MAPE train data:  7.0243 %
CV RMSE train data:  2.1279
CV computation time: 3m, 46s

MAPE test data: 5.2966 %
RMSE test data: 1.3481

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,3.0636,3.7954,4.352,4.6334,5.0799,5.9923,5.5888,6.0079,6.6267,5.2966
RMSE,1.3238,1.3058,1.3759,1.2842,1.2798,1.3544,1.3486,1.4133,1.3816,1.3481


### **2. Gradient Boosting Regression Trees**

In [7]:
if optimization_problem == 'TSP':

    # Fetch the stored best parameters for gradient boosting
    best_params = fun_load_best_params(optimization_problem + '_GBRT_RGS_best_params.pkl')

    # Reference values (disabled) -- uncomment to bypass the parameter file:
    #best_params = {'n_estimators': 186, 'max_depth': 10, 'max_leaf_nodes': 1207, 'learning_rate': 0.07782676930360648, 
    #               'max_features': 34, 'min_samples_leaf': 22, 'min_samples_split': 7}

    # Instantiate the booster with the loaded parameters and a fixed seed
    gbrt = GradientBoostingRegressor(
        **best_params,
        random_state=0,
    )

    # Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(
        model=gbrt,
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        train_data=train_data,
        compute_test_scores=True,
    )
    results_dict['Gradient Boosting Regression Trees'] = model_results_dict

else:
    print('This cell is only executed for the TSP!')

  CV MAPE train data:  4.6764 %
  CV RMSE train data:  1.3694
  CV computation time: 8 min, 23 sec

MAPE test data: 3.4035999999999995 %
RMSE test data: 0.9142
MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,3.3445,3.6536,3.7166,4.1281,4.1829,4.7276,4.7034,4.9871,5.1742,3.4036
RMSE,1.5565,1.3798,1.3411,1.3048,1.1878,1.2892,1.2513,1.2533,1.2828,0.9142


### **3. Extreme Gradient Boosting: XGBoost-Package**

In [2]:
# Fetch the stored best parameters for XGBoost
best_params = fun_load_best_params(optimization_problem + '_XGBOOST_GS_best_params.pkl')

# Reference values (disabled) -- uncomment to bypass the parameter file:
#best_params = {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.05, 'subsample': 0.6, 'colsample_bytree': 1}

# Instantiate the regressor (squared-error objective) with the loaded parameters and a fixed seed
xgboost = xgb.XGBRegressor(
    objective='reg:squarederror',
    **best_params,
    random_state=0,
)

# Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(
    model=xgboost,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    train_data=train_data,
    compute_test_scores=True,
)
results_dict['XGBoost'] = model_results_dict

{'n_estimators': 500,
 'max_depth': 9,
 'learning_rate': 0.05,
 'subsample': 0.6,
 'colsample_bytree': 1}

CV MAPE train data:  4.4872000000000005 %
CV RMSE train data:  1.308
CV computation time: 1m, 19s

MAPE test data: 3.3061 %
RMSE test data: 0.8544

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,1.8065,2.2579,2.415,2.736,3.0348,3.9209,3.6847,3.8459,4.1977,3.3061
RMSE,0.7816,0.7706,0.8363,0.7681,0.8069,0.8647,0.9087,0.9048,0.9104,0.8544


# **E. Linear Support Vector Machines (SVM) and Kernel Machines**
**Linear SVM**

In [4]:
if (optimization_problem == 'TSP'):

    # Define a smaller train set due to the long computation times of SVM
    # (15 % of the data for training, the remainder is used as test set)
    X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y, train_size=0.15, random_state=0)

    # Polynomial features only (no binning / one-hot branch) for the continuous top features.
    # SVM-specific names are used on purpose: rebinding the notebook-wide `poly_transformer` /
    # `preprocessor` here would silently change the Linear and Ridge Regression cells whenever
    # they are re-run after this cell.
    svm_poly_transformer = Pipeline(steps=[('poly', PolynomialFeatures(degree=3, interaction_only=False, include_bias=False))])

    # Apply the polynomial expansion to the top features; all other columns pass through unchanged
    svm_preprocessor = ColumnTransformer(transformers=[('poly', svm_poly_transformer, top_features)],
                                         remainder='passthrough')

    # Define the model pipeline: feature expansion -> standardisation -> linear-kernel SVR
    pipe = Pipeline(steps=[('preprocessor', svm_preprocessor),
                           ('scaler', StandardScaler()),
                           ('model', SVR(kernel='linear', C=1))])

    # Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(model=pipe, X_train=X_train_s, y_train=y_train_s, X_test=X_test_s, y_test=y_test_s, train_data=train_data, compute_test_scores=True)
    results_dict['Linear SVM'] = model_results_dict

else: print('This cell is only executed for the TSP!')

  CV MAPE train data:  5.2577 %
  CV RMSE train data:  1.1607
  CV computation time: 7 min, 1 sec

MAPE test data: 4.9546 %
RMSE test data: 1.0382
MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,3.4137,3.5725,4.344,4.5297,4.6431,5.6107,5.7985,5.9863,6.4417,4.9546
RMSE,0.9407,0.9435,0.9786,0.9871,1.0215,1.1607,1.1828,1.2445,1.303,1.0382


**Kernel Machine with Gaussian Kernel**

In [None]:
if (optimization_problem == 'TSP'):

    # Define a smaller train set due to the long computation times of SVM
    # (20 % of the data for training, the remainder is used as test set)
    X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y, train_size=0.20, random_state=0)

    # Load best parameters of the model
    best_params = fun_load_best_params(optimization_problem + '_SVM_GK_best_params.pkl')

    # Example parameter set noted by the author:
    #{'SVM__C': 750, 'SVM__gamma': 0.05, 'scaler': MinMaxScaler()}
    # NOTE(review): the keys above use the prefix 'SVM__', but the SVR step below is named 'model';
    # set_params would reject 'SVM__*' keys -- confirm the key names stored in the parameter file.

    # Define the model pipeline; 'scaler' is a placeholder (None) that the loaded parameters can fill
    pipe = Pipeline(steps=[('scaler', None),
                           ('model', SVR(kernel='rbf'))])
    pipe.set_params(**best_params)

    # Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
    model_results_dict = fun_scores(model=pipe, X_train=X_train_s, y_train=y_train_s, X_test=X_test_s, y_test=y_test_s, train_data=train_data, compute_test_scores=True)

    # Store the scores like every other model cell does, so the kernel machine shows up in the
    # comparison tables and the saved result files
    results_dict['Kernel Machine'] = model_results_dict

else: print('This cell is only executed for the TSP!')

  CV MAPE train data:  4.2866 %
  CV RMSE train data:  1.06
  CV computation time: 4 min, 10 sec

MAPE test data: 3.6768 %
RMSE test data: 0.8985
MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.5218,2.7104,3.1515,3.2899,3.66,4.1348,4.4705,4.5488,5.2411,3.6768
RMSE,0.8278,0.8057,0.8496,0.8897,0.9394,1.0102,1.0518,1.0764,1.1477,0.8985


# **F. Neural Network - Multi Layer Perceptron**

In [3]:
# Fetch the stored best parameters for the MLP pipeline
best_params = fun_load_best_params(optimization_problem + '_NN_GS_best_params.pkl')

# Reference values (disabled) -- uncomment to bypass the parameter file:
# best_params = {'mlpregressor__alpha': 0.1, 'mlpregressor__batch_size': 32, 
#                'mlpregressor__hidden_layer_sizes': (100, 100, 100), 'mlpregressor__solver': 'sgd'}

# Pipeline: standardise the inputs, then fit a ReLU multi-layer perceptron (fixed seed);
# the loaded parameters are applied on top of these defaults
pipe = make_pipeline(
    StandardScaler(),
    MLPRegressor(
        activation='relu',
        learning_rate='adaptive',
        max_iter=1000,
        shuffle=True,
        random_state=0,
    ),
).set_params(**best_params)

# Cross-validate on the train set and score on the test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(
    model=pipe,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    train_data=train_data,
    compute_test_scores=True,
)
results_dict['Neural Network'] = model_results_dict

{'mlpregressor__alpha': 0.1,
 'mlpregressor__batch_size': 32,
 'mlpregressor__hidden_layer_sizes': (100, 100, 100),
 'mlpregressor__solver': 'sgd'}

CV MAPE (scaled) train data:  2.9636 %
CV RMSE (scaled) train data: 0.7859
CV computation time: 10m, 40s

MAPE (scaled) test data:  2.7604 %
RMSE (scaled) test data: 0.7228
Model fit time: 18m, 47s

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,1.3657,1.788,1.9722,2.2131,2.5333,3.2332,3.026,3.248,3.7247,2.7604
RMSE,0.5756,0.5734,0.631,0.6158,0.7038,0.7477,0.7604,0.7925,0.8383,0.7228


# **G. Compare Results**

In [17]:
# Collect, per model, the CV computation time and the overall train/test scores
model_names = results_dict.keys()
model_results = list(results_dict.values())
cv_times = [res['CV computation time'] for res in model_results]
MAPE_train_scores = [res['MAPE']['Train data'] for res in model_results]
MAPE_test_scores = [res['MAPE']['Test data'] for res in model_results]
RMSE_train_scores = [res['RMSE']['Train data'] for res in model_results]
RMSE_test_scores = [res['RMSE']['Test data'] for res in model_results]

# Overall scores: one column per model, columns ordered by the test-set score (best first)
cv_times_df = pd.DataFrame(data=[cv_times], columns=model_names, index=['CV Computation Times'])

MAPE_df = pd.DataFrame(data=[MAPE_train_scores, MAPE_test_scores], columns=model_names, index=['Train Set', 'Test Set'])
MAPE_df = MAPE_df.sort_values(by='Test Set', axis=1)
MAPE_df.columns.name = 'MAPE Scores'

RMSE_df = pd.DataFrame(data=[RMSE_train_scores, RMSE_test_scores], columns=model_names, index=['Train Set', 'Test Set'])
RMSE_df = RMSE_df.sort_values(by='Test Set', axis=1)
RMSE_df.columns.name = 'RMSE Scores'

display(cv_times_df, MAPE_df, RMSE_df)

# Scores per instance size: one row per model, rows ordered by the 'Mean' column (best first)
MAPE_cat_scores = [res['Scores per instance size'].loc['MAPE'] for res in model_results]
RMSE_cat_scores = [res['Scores per instance size'].loc['RMSE'] for res in model_results]

MAPE_cat_scores_df = pd.DataFrame(data=MAPE_cat_scores, index=model_names).sort_values(by='Mean')
MAPE_cat_scores_df.columns.name = 'MAPE Scores per instance size'

RMSE_cat_scores_df = pd.DataFrame(data=RMSE_cat_scores, index=model_names).sort_values(by='Mean')
RMSE_cat_scores_df.columns.name = 'RMSE Scores per instance size'

display(MAPE_cat_scores_df, RMSE_cat_scores_df)

# Write the four score tables to excel files in the results folder
for frame, suffix in [(MAPE_df, '_MAPE_scores.xlsx'),
                      (RMSE_df, '_RMSE_scores.xlsx'),
                      (MAPE_cat_scores_df, '_MAPE_cat_scores.xlsx'),
                      (RMSE_cat_scores_df, '_RMSE_cat_scores.xlsx')]:
    fun_save_file(data=frame, subfolder_path='04_results', name=optimization_problem + suffix)

# Report the total run time since the timer was started in the first cell
print('Total script computation time:', fun_convert_time(start=start_script, end=time.time()))

Unnamed: 0,KNN,Linear Regression,Ridge Regression
CV Computation Times,12s,"1m, 40s","2m, 10s"


MAPE Scores,Ridge Regression,Linear Regression,KNN
Train Set,4.7758,4.7679,25.2382
Test Set,4.3061,4.3062,19.6788


RMSE Scores,Linear Regression,Ridge Regression,KNN
Train Set,1.0483,1.0474,5.0502
Test Set,0.8491,0.8498,3.628


MAPE Scores per instance size,6,7,8,9,10,11,12,13,14,Mean
Ridge Regression,3.0919,3.6386,4.6379,3.9201,4.8465,6.8053,5.4254,5.5467,6.231,4.3061
Linear Regression,3.1752,3.6034,4.6648,3.9135,4.8887,6.8022,5.3877,5.5283,6.2132,4.3062
KNN,30.0477,30.3287,30.54,26.0854,27.4481,29.6154,21.9447,20.9688,23.7126,19.6788


RMSE Scores per instance size,6,7,8,9,10,11,12,13,14,Mean
Linear Regression,0.853,0.8568,1.3214,0.9085,0.963,1.0654,1.1116,1.1519,1.2163,0.8491
Ridge Regression,0.8341,0.8555,1.0576,0.9099,0.9549,1.068,1.1151,1.1548,1.2178,0.8498
KNN,7.8002,6.678,5.75,5.0469,4.8402,4.3695,3.9374,3.5544,3.5374,3.628


File saved successfully!
File saved successfully!
File saved successfully!
File saved successfully!
Total script computation time: 34m, 3s


In [11]:
#display(MAPE_df, RMSE_df)

MAPE Scores,Neural Network,XGBoost,Gradient Boosting Regression Trees,Ridge Regression,Linear Regression,Random Forest,Decision Tree,KNN
Train Set,4.0641,4.4872,4.6764,5.2108,11.4193,7.09,10.1798,25.2382
Test Set,3.1935,3.3061,3.4036,4.3995,4.3998,5.3467,7.3302,19.6788


RMSE Scores,Neural Network,XGBoost,Gradient Boosting Regression Trees,Linear Regression,Ridge Regression,Random Forest,Decision Tree,KNN
Train Set,0.9677,1.308,1.3694,526.3029,1.1874,2.1105,3.4694,5.0502
Test Set,0.7112,0.8544,0.9142,0.927,0.9305,1.3552,2.2329,3.628
