In [3]:
import os
import sys
import time
import pickle
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.neural_network import MLPRegressor

# Add the parent directory to the Python path so that the helper module ML_functions can be imported
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
# Only extend sys.path once: re-running this cell must not pile up duplicate entries
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

# Import helper functions
from ML_functions import fun_load_data, fun_preprocessing, fun_fit_tuning, fun_load_best_params
from ML_functions import fun_convert_time
from ML_functions import fun_scaled_neg_MAPE, fun_tuning_results, fun_scores

# Name of the optimization problem to analyze; it is handed to fun_load_data below
# (this notebook is configured for 'Bin_Packing')
optimization_problem = 'Bin_Packing'
# Fraction of the samples that is assigned to the train set
train_size = 0.7

# Load the data for the chosen problem and run the project's preprocessing helper,
# which returns the feature matrix X, the target y and train_data
data = fun_load_data(optimization_problem)
X, y, train_data = fun_preprocessing(data)

# Split into a train and a test set (random_state is fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=42,
)

**Compute train score with all features: Neural Network - Multi Layer Perceptron**

In [4]:
# Load the stored best parameters of the model. They are kept under their own name so that
# the manual parameter choice below does not silently overwrite them.
best_params_loaded = fun_load_best_params(optimization_problem + '_NN_GS_best_params.pkl')

# Reference: the best-performing setting used the 'lbfgs' solver:
# {'mlpregressor__alpha': 0.5, 'mlpregressor__batch_size': 'auto',
#  'mlpregressor__hidden_layer_sizes': (100, 100), 'mlpregressor__solver': 'lbfgs'}

# The solver 'lbfgs' performed best, but was very slow. Therefore, these tests are done with
# the best parameters for the 'adam' solver.
# Alternative layer layout: 'mlpregressor__hidden_layer_sizes': (100,)  (note the trailing comma for a 1-tuple)
best_params = {
    'mlpregressor__alpha': 0.05,
    'mlpregressor__batch_size': 'auto',
    'mlpregressor__hidden_layer_sizes': (100, 100),
    'mlpregressor__solver': 'adam',
}

# Create the pipeline: standardize the features, then fit the multi-layer perceptron
pipe = make_pipeline(StandardScaler(), 
                     MLPRegressor(activation='relu', learning_rate='adaptive', 
                                  max_iter=1000, shuffle=True, random_state=0))
# The 'mlpregressor__' prefix routes each parameter to the MLPRegressor step of the pipeline
pipe.set_params(**best_params)

# Estimate model performance with cross-validation on the train set (scoring: MAPE and RMSE)
model_results_dict_all = fun_scores(pipe, X_train, y_train)

# Result of a previous run; uncomment to reuse it instead of re-running the cross-validation
#model_results_dict_all = {'MAPE': 4.8044, 'RMSE': 0.0213}

{'mlpregressor__alpha': 0.5,
 'mlpregressor__batch_size': 'auto',
 'mlpregressor__hidden_layer_sizes': (100, 100),
 'mlpregressor__solver': 'lbfgs'}

CV MAPE (scaled) train data:  4.8044 %
CV RMSE (scaled) train data: 0.0213
CV computation time: 25s


**Exclude feature categories**

In [6]:
# Show every column that is available in the train data
display(train_data.columns)

# Feature names, grouped by category
instance_features = [
    'Instance ID', 'Number Items', 'Item ID',
    'Item Weight Ratio', 'Item Size Ratio',
    'Bin Weight', 'Bin Size',
]
weight_and_sum_features = [
    'Weight Size Sum Ratio',
    'Item Volume Ratio',
    'Item Density Ratio',
]
item_utilization_features = [
    'Item Bin Utilization Weight Ratio',
    'Item Bin Utilization Size Ratio',
    'Item Total Bin Utilization Ratio',
]
combination_features = [
    'Weight Bin Combinations Ratio',
    'Size Bin Combinations Ratio',
    'Total Bin Combinations Ratio',
    'Perfect Weight Bin Combinations Ratio',
    'Perfect Size Bin Combinations Ratio',
    'Perfect Total Bin Combinations Ratio',
]
quantile_features = [
    'Weight Quantile Values Ratio',
    'Size Quantile Values Ratio',
]
percentile_features = [
    '0% Percentile Weight', '25% Percentile Weight', '50% Percentile Weight',
    '75% Percentile Weight', '100% Percentile Weight',
    '0% Percentile Size', '25% Percentile Size', '50% Percentile Size',
    '75% Percentile Size', '100% Percentile Size',
    'Weight / 0% Percentile Ratio', 'Weight / 25% Percentile Ratio', 'Weight / 50% Percentile Ratio',
    'Weight / 75% Percentile Ratio', 'Weight / 100% Percentile Ratio',
    'Size / 0% Percentile Ratio', 'Size / 25% Percentile Ratio', 'Size / 50% Percentile Ratio',
    'Size / 75% Percentile Ratio', 'Size / 100% Percentile Ratio',
]
statistical_features = [
    'Weight Sum', 'Size Sum',
    'Weight Mean', 'Size Mean',
    'Weight Std', 'Size Std',
    'Weight Max', 'Size Max',
    'Weight Min', 'Size Min',
    'Correlation', 'Skewness Weight', 'Skewness Size',
]
final_utilization_features = [
    'Final Bin Utilization Weight',
    'Final Bin Utilization Size',
    'Final Total Bin Utilization',
]
cost_features = ['Marginal Costs/Bins Ratio', 'Total Bins'] #'Shapley Value'

# Flat list with the features of all categories (category order is preserved)
all_features = [
    feature
    for category in (
        instance_features,
        weight_and_sum_features,
        item_utilization_features,
        combination_features,
        quantile_features,
        percentile_features,
        statistical_features,
        final_utilization_features,
        cost_features,
    )
    for feature in category
]

# Categories that are excluded one at a time later on
# (instance_features and cost_features are not part of this dictionary)
feature_categories_dict = dict(
    weight_and_sum_features=weight_and_sum_features,
    item_utilization_features=item_utilization_features,
    combination_features=combination_features,
    quantile_features=quantile_features,
    percentile_features=percentile_features,
    statistical_features=statistical_features,
    final_utilization_features=final_utilization_features,
)

Index(['Instance ID', 'Number Items', 'Item ID', 'Item Weight Ratio',
       'Item Size Ratio', 'Bin Weight', 'Bin Size', 'Weight Size Sum Ratio',
       'Item Volume Ratio', 'Item Density Ratio',
       'Item Bin Utilization Weight Ratio', 'Item Bin Utilization Size Ratio',
       'Item Total Bin Utilization Ratio', 'Weight Bin Combinations Ratio',
       'Size Bin Combinations Ratio', 'Total Bin Combinations Ratio',
       'Perfect Weight Bin Combinations Ratio',
       'Perfect Size Bin Combinations Ratio',
       'Perfect Total Bin Combinations Ratio', 'Weight Quantile Values Ratio',
       'Size Quantile Values Ratio', '0% Percentile Weight',
       '25% Percentile Weight', '50% Percentile Weight',
       '75% Percentile Weight', '100% Percentile Weight', '0% Percentile Size',
       '25% Percentile Size', '50% Percentile Size', '75% Percentile Size',
       '100% Percentile Size', 'Weight / 0% Percentile Ratio',
       'Weight / 25% Percentile Ratio', 'Weight / 50% Percentile Rat

In [8]:
# Leave out one feature category at a time and compare the CV scores with the all-features baseline
for category_name, category_features in feature_categories_dict.items():
    print('############### Excluded feature category: {} ###############'.format(category_name))

    # Drop the category's features (and the same names with a ' Ratio' suffix appended)
    excluded_features = list(category_features) + [name + ' Ratio' for name in category_features]
    used_features = [name for name in all_features if name not in excluded_features]
    X_train_small = X_train[used_features]
    print('Number of excluded features:', len(category_features))
    print('Number of used features: {}\n'.format(len(used_features)))

    # Cross-validated scores (MAPE and RMSE) on the reduced train set
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Score change relative to the model that uses all features
    new_mape, new_rmse = model_results_dict_new['MAPE'], model_results_dict_new['RMSE']
    all_mape, all_rmse = model_results_dict_all['MAPE'], model_results_dict_all['RMSE']
    MAPE_diff = new_mape - all_mape
    RMSE_diff = new_rmse - all_rmse
    print('\nMAPE difference: {} - {} = {} %'.format(new_mape, all_mape, MAPE_diff))
    print('RMSE difference: {} - {} = {}\n'.format(new_rmse, all_rmse, RMSE_diff))

############### Excluded feature category: weight_and_sum_features ###############
Number of excluded features: 3
Number of used features: 56

CV MAPE (scaled) train data:  4.8663 %
CV RMSE (scaled) train data: 0.0214
CV computation time: 33s

MAPE difference: 4.8663 - 4.8044 = 0.06189999999999962 %
RMSE difference: 0.0214 - 0.0213 = 9.99999999999994e-05

############### Excluded feature category: item_utilization_features ###############
Number of excluded features: 3
Number of used features: 56

CV MAPE (scaled) train data:  4.8587 %
CV RMSE (scaled) train data: 0.0214
CV computation time: 30s

MAPE difference: 4.8587 - 4.8044 = 0.05429999999999957 %
RMSE difference: 0.0214 - 0.0213 = 9.99999999999994e-05

############### Excluded feature category: combination_features ###############
Number of excluded features: 6
Number of used features: 53

CV MAPE (scaled) train data:  5.5044 %
CV RMSE (scaled) train data: 0.0244
CV computation time: 25s

MAPE difference: 5.5044 - 4.8044 = 0.7000

**Add single features to the essential features**

In [None]:
# Distance-based features
distance_features = [
    'Depot Distance Ratio',
    'Closest Customer Distance (CCD) Ratio',
    '2nd CCD Ratio', '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio',
    '6th CCD Ratio', '7th CCD Ratio', '8th CCD Ratio',
    'Mean Distance To Other Customers Ratio',
    'Gravity Center Distance Ratio',
]
# Essential feature set: instance, distance, statistical and cost features (in this order)
essential_features = [*instance_features, *distance_features, *statistical_features, *cost_features]

# Baseline for the single-feature additions: cross-validated scores (MAPE and RMSE)
# on the train set restricted to the essential features
X_train_small = X_train[essential_features]
model_results_dict_essential = fun_scores(pipe, X_train_small, y_train)

# Result of a previous run, kept for reference
#model_results_dict_essential = {'MAPE': 2.9232, 'RMSE': 0.7813}

In [None]:
# Candidate features that are added on top of the essential features, one entry at a time.
# An entry is either a single column name (string) or a list of column names that belong together.
potential_features = ['Number Clusters', 'Number Outliers', 'Cluster Size', 'Cluster', ['X Centroid', 'Y Centroid'], 
                      'Centroid Distance Ratio', 'Centroid Distance To Depot Ratio', 'Distance To Closest Other Cluster Ratio', 
                      'Distance To Closest Other Centroid Ratio', 'Cluster Area Ratio', 'Cluster Density Ratio']

# Add iteratively a single feature or a list of features to the essential features and compute
# the score difference (compared to the score of the essential features only)
for added_feature in potential_features:
    print('############### Added feature: {} ###############'.format(added_feature))

    # Treat a single column name like a one-element list so both kinds of entries share one code path
    added_feature_list = added_feature if isinstance(added_feature, list) else [added_feature]

    # Always extend the essential features. The list case used to extend all_features instead,
    # which made its score incomparable with the essential-features baseline.
    used_features = essential_features + added_feature_list
    X_train_small = X_train[used_features]
    print('Number of used features: {}\n'.format(len(used_features)))
    #display(used_features)

    # Estimate model performance with cross-validation on the train set (scoring: MAPE and RMSE)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Compare the new results with the results of the essential features only
    MAPE_diff = model_results_dict_new['MAPE'] - model_results_dict_essential['MAPE']
    RMSE_diff = model_results_dict_new['RMSE'] - model_results_dict_essential['RMSE']
    print('\nMAPE difference: {} - {} = {} %'.format(model_results_dict_new['MAPE'], model_results_dict_essential['MAPE'], MAPE_diff))
    print('RMSE difference: {} - {} = {}\n'.format(model_results_dict_new['RMSE'], model_results_dict_essential['RMSE'], RMSE_diff))

**Exclude single features**

In [4]:
# Baseline feature set ("all features") for the single-feature exclusion experiment.
distance_features = ['Depot Distance Ratio', 'Closest Customer Distance (CCD) Ratio', '2nd CCD Ratio', '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio', 
                     '6th CCD Ratio', '7th CCD Ratio', '8th CCD Ratio', 'Mean Distance To Other Customers Ratio', 'Gravity Center Distance Ratio']
# NOTE(review): cluster_features is not assigned in any cell above this one; the only visible
# assignment is in the "Exclude updated feature categories" section further down. On a fresh kernel
# run from top to bottom this line would therefore raise a NameError - confirm which cluster
# feature list is intended here and define it in this cell.
# instance_features, statistical_features and cost_features are taken from an earlier cell.
all_features = instance_features + distance_features + cluster_features + statistical_features + cost_features

# Estimate model performance with cross-validation on the train set (scoring: MAPE and RMSE).
# The cross-validation call is commented out; the hard-coded scores below are used as the baseline instead.
X_train_small = X_train[all_features]
#model_results_dict_all = fun_scores(pipe, X_train_small, y_train)

# NOTE(review): presumably the scores of an earlier run with this feature set - verify before relying on them.
# This assignment replaces any model_results_dict_all computed in an earlier cell.
model_results_dict_all = {'MAPE': 2.8593, 'RMSE': 0.7692}

In [None]:
# Features that are removed from the full feature set, one entry at a time.
# An entry is either a single column name (string) or a list of column names that are removed together.
potential_features = [
    'Cluster', 'Number Clusters', 'Number Outliers', 'Cluster Size',
    ['X Centroid', 'Y Centroid'],
    'Centroid Distance Ratio', 'Centroid Distance To Depot Ratio',
    'Distance To Closest Other Cluster Ratio', 'Distance To Closest Other Centroid Ratio',
    'Cluster Area Ratio', 'Cluster Density Ratio',
]

# Remove iteratively a single feature or a list of features from all features and compute
# the score difference (compared to the score of all features)
for excluded_feature in potential_features:
    print('############### Excluded feature: {} ###############'.format(excluded_feature))

    # Normalize the entry to a list so that strings and lists are handled by the same filter
    features_to_drop = excluded_feature if isinstance(excluded_feature, list) else [excluded_feature]
    used_features = [name for name in all_features if name not in features_to_drop]
    X_train_small = X_train[used_features]
    print('Number of used features: {}\n'.format(len(used_features)))

    # Cross-validated scores (MAPE and RMSE) on the reduced train set
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Score change relative to the model that uses all features
    new_mape, new_rmse = model_results_dict_new['MAPE'], model_results_dict_new['RMSE']
    all_mape, all_rmse = model_results_dict_all['MAPE'], model_results_dict_all['RMSE']
    MAPE_diff = new_mape - all_mape
    RMSE_diff = new_rmse - all_rmse
    print('\nMAPE difference: {} - {} = {} %'.format(new_mape, all_mape, MAPE_diff))
    print('RMSE difference: {} - {} = {}\n'.format(new_rmse, all_rmse, RMSE_diff))

**Exclude updated feature categories**

In [None]:
# Updated feature categories and the features they contain
distance_features = [
    'Depot Distance Ratio',
    'Closest Customer Distance (CCD) Ratio',
    '2nd CCD Ratio', '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio',
    '6th CCD Ratio', '7th CCD Ratio', '8th CCD Ratio',
    'Mean Distance To Other Customers Ratio',
    'Gravity Center Distance Ratio',
]
cluster_features = [
    'Cluster', 'Number Clusters', 'Cluster Size',
    'X Centroid', 'Y Centroid',
    'Centroid Distance To Depot Ratio',
    'Distance To Closest Other Cluster Ratio',
    'Cluster Area Ratio', 'Cluster Density Ratio',
]
statistical_features = [
    'X Std', 'Y Std',
    'X Max', 'Y Max',
    'X Min', 'Y Min',
    'Correlation', 'Skewness X', 'Skewness Y',
]

# Flat list of all features; instance_features and cost_features come from an earlier cell
all_features = [*instance_features, *distance_features, *cluster_features, *statistical_features, *cost_features]

# Categories that are excluded one at a time in the next cell
feature_categories_dict = dict(
    distance_features=distance_features,
    cluster_features=cluster_features,
    statistical_features=statistical_features,
)

# Baseline: cross-validated scores (MAPE and RMSE) on the train set with all features
X_train_small = X_train[all_features]
model_results_dict_all = fun_scores(pipe, X_train_small, y_train)

# Result of a previous run, kept for reference
#model_results_dict_all = {'MAPE': 2.8642, 'RMSE': 0.7637}

In [None]:
# Leave out one of the updated feature categories at a time and compare the CV scores
# with the baseline that uses all features
for key, features_in_category in feature_categories_dict.items():
    print('############### Excluded feature category: {} ###############'.format(key))

    # Names to drop: the category's features plus the same names with ' Ratio' appended
    excluded_features = [*features_in_category, *(feature + ' Ratio' for feature in features_in_category)]
    used_features = [feature for feature in all_features if feature not in excluded_features]
    X_train_small = X_train[used_features]
    print('Number of excluded features:', len(features_in_category))
    print('Number of used features: {}\n'.format(len(used_features)))

    # Cross-validated scores (MAPE and RMSE) on the reduced train set
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Score change relative to the model that uses all categories
    mape_new, mape_all = model_results_dict_new['MAPE'], model_results_dict_all['MAPE']
    rmse_new, rmse_all = model_results_dict_new['RMSE'], model_results_dict_all['RMSE']
    MAPE_diff = mape_new - mape_all
    RMSE_diff = rmse_new - rmse_all
    print('\nMAPE difference: {} - {} = {} %'.format(mape_new, mape_all, MAPE_diff))
    print('RMSE difference: {} - {} = {}\n'.format(rmse_new, rmse_all, RMSE_diff))