In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPRegressor

# Import helper functions
from ML_functions import fun_load_data, fun_preprocessing, fun_load_best_params
from ML_functions import fun_tuning_results, fun_scores

# Fraction of the samples that is assigned to the train set
train_size = 0.7
# Problem selector: set this variable to the string 'TSP' or 'CVRP'
optimization_problem = 'TSP'

# Read the data of the selected problem and pass it through the preprocessing helper
data = fun_load_data(optimization_problem)
X, y, train_data = fun_preprocessing(data)

# Split features and target into a train and a test part (the fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=42,
)

**Compute train score with all features: Neural Network - Multi-Layer Perceptron**

In [2]:
# Fetch the stored best hyperparameters for the selected optimization problem
best_params = fun_load_best_params(optimization_problem + '_NN_GS_best_params.pkl')

# Hard-coded copy of the parameters (same values as the dict shown in this cell's output);
# uncomment to skip loading them from the file:
# best_params = {'mlpregressor__alpha': 0.1, 'mlpregressor__batch_size': 32,
#                'mlpregressor__hidden_layer_sizes': (100, 100, 100), 'mlpregressor__solver': 'sgd'}

# Build the pipeline: standardize the features, then fit the MLP regressor
pipe = make_pipeline(
    StandardScaler(),
    MLPRegressor(
        activation='relu',
        learning_rate='adaptive',
        max_iter=1000,
        shuffle=True,
        random_state=0,
    ),
)
# Apply the loaded hyperparameters to the pipeline steps
pipe.set_params(**best_params)

# Baseline scores (MAPE and RMSE) with all features. The cross-validation call is disabled and
# the hard-coded dictionary stands in for it (presumably the result of an earlier run - TODO confirm):
# model_results_dict_all = fun_scores(pipe, X_train, y_train)
model_results_dict_all = {'MAPE': 2.8765, 'RMSE': 0.7695}

{'mlpregressor__alpha': 0.1,
 'mlpregressor__batch_size': 32,
 'mlpregressor__hidden_layer_sizes': (100, 100, 100),
 'mlpregressor__solver': 'sgd'}

**Exclude feature categories**

In [3]:
# Show every column that is available in the train data
display(train_data.columns)

# Feature categories, each listed with the features that belong to it
instance_features = [
    'Instance ID', 'Number Customers',
    'X Ratio', 'Y Ratio',
    'X Depot', 'Y Depot',
]
distance_features = [
    'Depot Distance Ratio',
    'Closest Customer Distance (CCD) Ratio',
    '2nd CCD Ratio', '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio', '6th CCD Ratio',
    '7th CCD Ratio', '8th CCD Ratio', '9th CCD Ratio', '10th CCD Ratio',
    'Mean Distance To Other Customers Ratio',
    'Gravity Center Distance Ratio',
]
cluster_features = [
    'Cluster', 'Number Clusters', 'Number Outliers', 'Cluster Size',
    'X Centroid', 'Y Centroid',
    'Centroid Distance Ratio', 'Centroid Distance To Depot Ratio',
    'Distance To Closest Other Cluster Ratio', 'Distance To Closest Other Centroid Ratio',
    'Cluster Area Ratio', 'Cluster Density Ratio',
]
statistical_features = [
    'X Std', 'Y Std',
    'X Max', 'Y Max',
    'X Min', 'Y Min',
    'Correlation',
    'Skewness X', 'Skewness Y',
]
# The column 'Shapley Value' is deliberately left out of the feature lists
cost_features = ['Savings Ratio', 'Marginal Costs Ratio', 'Total Costs']

# Flat list of all features (in category order) ...
all_features = [
    *instance_features,
    *distance_features,
    *cluster_features,
    *statistical_features,
    *cost_features,
]
# ... and a lookup of the categories that are excluded one at a time further below
feature_categories_dict = dict(
    distance_features=distance_features,
    cluster_features=cluster_features,
    statistical_features=statistical_features,
)

Index(['Instance ID', 'Number Customers', 'X Ratio', 'Y Ratio', 'X Depot',
       'Y Depot', 'Depot Distance Ratio',
       'Closest Customer Distance (CCD) Ratio', '2nd CCD Ratio',
       '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio', '6th CCD Ratio',
       '7th CCD Ratio', '8th CCD Ratio', '9th CCD Ratio', '10th CCD Ratio',
       'Mean Distance To Other Customers Ratio',
       'Gravity Center Distance Ratio', 'Cluster', 'Number Clusters',
       'Number Outliers', 'Cluster Size', 'X Centroid', 'Y Centroid',
       'Centroid Distance Ratio', 'Centroid Distance To Depot Ratio',
       'Distance To Closest Other Cluster Ratio',
       'Distance To Closest Other Centroid Ratio', 'Cluster Area Ratio',
       'Cluster Density Ratio', 'X Std', 'Y Std', 'X Max', 'Y Max', 'X Min',
       'Y Min', 'Correlation', 'Skewness X', 'Skewness Y', 'Savings Ratio',
       'Marginal Costs Ratio', 'Total Costs', 'Shapley Value'],
      dtype='object')

In [4]:
# Leave out one feature category at a time and report how the cross-validation scores
# change compared to the scores with all features (model_results_dict_all)
for key, category_features in feature_categories_dict.items():
    print('############### Excluded feature category: {} ###############'.format(key))

    # Names to drop: the features of the category plus the same names with a ' Ratio' suffix
    excluded_features = list(category_features)
    excluded_features.extend(name + ' Ratio' for name in category_features)

    # Restrict the train set to the features that remain
    used_features = [name for name in all_features if name not in excluded_features]
    X_train_small = X_train[used_features]
    print('Number of excluded features:', len(category_features))
    print('Number of used features: {}\n'.format(len(used_features)))

    # Cross-validation on the reduced train set (scoring: MAPE and RMSE)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Score differences: result without the category minus result with all categories
    MAPE_diff, RMSE_diff = (
        model_results_dict_new[metric] - model_results_dict_all[metric]
        for metric in ('MAPE', 'RMSE')
    )
    print('\nMAPE difference: {} - {} = {} %'.format(
        model_results_dict_new['MAPE'],
        model_results_dict_all['MAPE'],
        MAPE_diff,
    ))
    print('RMSE difference: {} - {} = {}\n'.format(
        model_results_dict_new['RMSE'],
        model_results_dict_all['RMSE'],
        RMSE_diff,
    ))

############### Excluded feature category: distance_features ###############
Number of excluded features: 13
Number of used features: 30

CV MAPE (scaled) train data:  4.9844 %
CV RMSE (scaled) train data: 1.1964
CV computation time: 9m, 6s

MAPE difference: 4.9844 - 2.8765 = 2.1079 %
RMSE difference: 1.1964 - 0.7695 = 0.42689999999999995

############### Excluded feature category: cluster_features ###############
Number of excluded features: 12
Number of used features: 31

CV MAPE (scaled) train data:  2.9401 %
CV RMSE (scaled) train data: 0.7819
CV computation time: 8m, 18s

MAPE difference: 2.9401 - 2.8765 = 0.0636000000000001 %
RMSE difference: 0.7819 - 0.7695 = 0.012400000000000078

############### Excluded feature category: statistical_features ###############
Number of excluded features: 9
Number of used features: 34

CV MAPE (scaled) train data:  3.3681 %
CV RMSE (scaled) train data: 0.9155
CV computation time: 8m, 19s

MAPE difference: 3.3681 - 2.8765 = 0.49160000000000004 %
R

**Add single features to the essential features**

In [31]:
# Distance features without '9th CCD Ratio' and '10th CCD Ratio'
distance_features = [
    'Depot Distance Ratio',
    'Closest Customer Distance (CCD) Ratio',
    '2nd CCD Ratio', '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio',
    '6th CCD Ratio', '7th CCD Ratio', '8th CCD Ratio',
    'Mean Distance To Other Customers Ratio',
    'Gravity Center Distance Ratio',
]
# Essential features: every category except the cluster features
essential_features = [
    *instance_features,
    *distance_features,
    *statistical_features,
    *cost_features,
]

# Cross-validation on the train set restricted to the essential features (scoring: MAPE and RMSE)
X_train_small = X_train[essential_features]
model_results_dict_essential = fun_scores(pipe, X_train_small, y_train)

# Hard-coded copy of the scores printed by this cell; uncomment to skip the cross-validation:
#model_results_dict_essential = {'MAPE': 2.9232, 'RMSE': 0.7813}

CV MAPE (scaled) train data:  2.9232 %
CV RMSE (scaled) train data: 0.7813
CV computation time: 7m, 39s


In [32]:
# Candidate cluster features. A nested list marks features that are added together as one group.
potential_features = ['Number Clusters', 'Number Outliers', 'Cluster Size', 'Cluster', ['X Centroid', 'Y Centroid'], 
                      'Centroid Distance Ratio', 'Centroid Distance To Depot Ratio', 'Distance To Closest Other Cluster Ratio', 
                      'Distance To Closest Other Centroid Ratio', 'Cluster Area Ratio', 'Cluster Density Ratio']

# Add one candidate at a time (a single feature or a group of features) to the essential features
# and compute the score difference compared to the score of the essential features only
for added_feature in potential_features:
    print('############### Added feature: {} ###############'.format(added_feature))

    # Normalize the candidate to a list so that single features (strings) and groups (lists)
    # are handled by the same code path
    added_features = added_feature if isinstance(added_feature, list) else [added_feature]

    # Both cases must extend essential_features. Extending all_features for groups would
    # evaluate the full feature set (with the group's columns selected twice) and make the
    # comparison with model_results_dict_essential meaningless.
    used_features = essential_features + added_features
    X_train_small = X_train[used_features]
    print('Number of used features: {}\n'.format(len(used_features)))
    #display(used_features)

    # Estimate model performance with cross-validation on the train set (scoring: MAPE and RMSE)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Compare the new results with the results of the essential features only
    MAPE_diff = model_results_dict_new['MAPE'] - model_results_dict_essential['MAPE']
    RMSE_diff = model_results_dict_new['RMSE'] - model_results_dict_essential['RMSE']
    print('\nMAPE difference: {} - {} = {} %'.format(model_results_dict_new['MAPE'], model_results_dict_essential['MAPE'], MAPE_diff))
    print('RMSE difference: {} - {} = {}\n'.format(model_results_dict_new['RMSE'], model_results_dict_essential['RMSE'], RMSE_diff))

############### Added feature: Number Clusters ###############
Number of used features: 30

CV MAPE (scaled) train data:  2.9059999999999997 %
CV RMSE (scaled) train data: 0.7739
CV computation time: 8m, 54s

MAPE difference: 2.9059999999999997 - 2.9232 = -0.017200000000000326 %
RMSE difference: 0.7739 - 0.7813 = -0.007399999999999962

############### Added feature: Number Outliers ###############
Number of used features: 30

CV MAPE (scaled) train data:  2.9515 %
CV RMSE (scaled) train data: 0.7812
CV computation time: 9m, 8s

MAPE difference: 2.9515 - 2.9232 = 0.02829999999999977 %
RMSE difference: 0.7812 - 0.7813 = -9.999999999998899e-05

############### Added feature: Cluster Size ###############
Number of used features: 30

CV MAPE (scaled) train data:  2.9349 %
CV RMSE (scaled) train data: 0.7807
CV computation time: 7m, 32s

MAPE difference: 2.9349 - 2.9232 = 0.011699999999999822 %
RMSE difference: 0.7807 - 0.7813 = -0.0006000000000000449

############### Added feature: Cluster 

**Exclude single features**

In [4]:
# Distance features without '9th CCD Ratio' and '10th CCD Ratio'
distance_features = [
    'Depot Distance Ratio',
    'Closest Customer Distance (CCD) Ratio',
    '2nd CCD Ratio', '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio',
    '6th CCD Ratio', '7th CCD Ratio', '8th CCD Ratio',
    'Mean Distance To Other Customers Ratio',
    'Gravity Center Distance Ratio',
]
# Full feature list rebuilt with the reduced distance features
all_features = [
    *instance_features,
    *distance_features,
    *cluster_features,
    *statistical_features,
    *cost_features,
]

# Train set restricted to the listed features
X_train_small = X_train[all_features]

# Baseline scores (MAPE and RMSE) for this feature list. The cross-validation call is disabled
# and the hard-coded dictionary stands in for it (presumably the result of an earlier run - TODO confirm):
#model_results_dict_all = fun_scores(pipe, X_train_small, y_train)
model_results_dict_all = {'MAPE': 2.8593, 'RMSE': 0.7692}

In [30]:
# Candidate cluster features. A nested list marks features that are excluded together as one group.
potential_features = [
    'Cluster', 'Number Clusters', 'Number Outliers', 'Cluster Size',
    ['X Centroid', 'Y Centroid'],
    'Centroid Distance Ratio', 'Centroid Distance To Depot Ratio',
    'Distance To Closest Other Cluster Ratio', 'Distance To Closest Other Centroid Ratio',
    'Cluster Area Ratio', 'Cluster Density Ratio',
]

# Remove one candidate at a time (a single feature or a group of features) from all features
# and compute the score difference compared to the score with all features
for excluded_feature in potential_features:
    print('############### Excluded feature: {} ###############'.format(excluded_feature))

    # Treat a single feature (string) like a group containing exactly one name
    if isinstance(excluded_feature, list):
        dropped_features = excluded_feature
    else:
        dropped_features = [excluded_feature]

    # Restrict the train set to the features that remain
    used_features = [name for name in all_features if name not in dropped_features]
    X_train_small = X_train[used_features]
    print('Number of used features: {}\n'.format(len(used_features)))

    # Cross-validation on the reduced train set (scoring: MAPE and RMSE)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Score differences: result without the candidate minus result with all features
    MAPE_diff, RMSE_diff = (
        model_results_dict_new[metric] - model_results_dict_all[metric]
        for metric in ('MAPE', 'RMSE')
    )
    print('\nMAPE difference: {} - {} = {} %'.format(
        model_results_dict_new['MAPE'],
        model_results_dict_all['MAPE'],
        MAPE_diff,
    ))
    print('RMSE difference: {} - {} = {}\n'.format(
        model_results_dict_new['RMSE'],
        model_results_dict_all['RMSE'],
        RMSE_diff,
    ))

############### Excluded feature: Cluster ###############
Number of used features: 40

CV MAPE (scaled) train data:  2.8676 %
CV RMSE (scaled) train data: 0.7625
CV computation time: 8m, 31s

MAPE difference: 2.8676 - 2.8593 = 0.008299999999999752 %
RMSE difference: 0.7625 - 0.7692 = -0.006700000000000039

############### Excluded feature: Number Clusters ###############
Number of used features: 40

CV MAPE (scaled) train data:  2.8779 %
CV RMSE (scaled) train data: 0.7712
CV computation time: 8m, 34s

MAPE difference: 2.8779 - 2.8593 = 0.018599999999999728 %
RMSE difference: 0.7712 - 0.7692 = 0.0020000000000000018

############### Excluded feature: Number Outliers ###############
Number of used features: 40

CV MAPE (scaled) train data:  2.8658 %
CV RMSE (scaled) train data: 0.7665
CV computation time: 8m, 44s

MAPE difference: 2.8658 - 2.8593 = 0.00649999999999995 %
RMSE difference: 0.7665 - 0.7692 = -0.0027000000000000357

############### Excluded feature: Cluster Size #############

**Exclude updated feature categories**

In [5]:
# Updated feature categories, each listed with the features that belong to it
distance_features = [
    'Depot Distance Ratio',
    'Closest Customer Distance (CCD) Ratio',
    '2nd CCD Ratio', '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio',
    '6th CCD Ratio', '7th CCD Ratio', '8th CCD Ratio',
    'Mean Distance To Other Customers Ratio',
    'Gravity Center Distance Ratio',
]
cluster_features = [
    'Cluster', 'Number Clusters', 'Cluster Size',
    'X Centroid', 'Y Centroid',
    'Centroid Distance To Depot Ratio',
    'Distance To Closest Other Cluster Ratio',
    'Cluster Area Ratio', 'Cluster Density Ratio',
]
statistical_features = [
    'X Std', 'Y Std',
    'X Max', 'Y Max',
    'X Min', 'Y Min',
    'Correlation',
    'Skewness X', 'Skewness Y',
]

# Flat list of all features (in category order) ...
all_features = [
    *instance_features,
    *distance_features,
    *cluster_features,
    *statistical_features,
    *cost_features,
]
# ... and a lookup of the categories that are excluded one at a time in the next cell
feature_categories_dict = dict(
    distance_features=distance_features,
    cluster_features=cluster_features,
    statistical_features=statistical_features,
)

# Train set restricted to the updated feature list
X_train_small = X_train[all_features]

# Baseline: cross-validation on the train set with all updated features (scoring: MAPE and RMSE)
model_results_dict_all = fun_scores(pipe, X_train_small, y_train)

# Hard-coded copy of the scores (rounded); uncomment to skip the cross-validation:
#model_results_dict_all = {'MAPE': 2.8642, 'RMSE': 0.7637}

CV MAPE (scaled) train data:  2.8642999999999996 %
CV RMSE (scaled) train data: 0.7637
CV computation time: 8m, 30s


In [6]:
# Leave out one of the updated feature categories at a time and report how the cross-validation
# scores change compared to the scores with all features (model_results_dict_all).
# (A filter that restricted the loop to 'cluster_features' is disabled.)
for key, category_features in feature_categories_dict.items():
    print('############### Excluded feature category: {} ###############'.format(key))

    # Names to drop: the features of the category plus the same names with a ' Ratio' suffix
    excluded_features = list(category_features)
    excluded_features.extend(name + ' Ratio' for name in category_features)

    # Restrict the train set to the features that remain
    used_features = [name for name in all_features if name not in excluded_features]
    X_train_small = X_train[used_features]
    print('Number of excluded features:', len(category_features))
    print('Number of used features: {}\n'.format(len(used_features)))

    # Cross-validation on the reduced train set (scoring: MAPE and RMSE)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Score differences: result without the category minus result with all categories
    MAPE_diff, RMSE_diff = (
        model_results_dict_new[metric] - model_results_dict_all[metric]
        for metric in ('MAPE', 'RMSE')
    )
    print('\nMAPE difference: {} - {} = {} %'.format(
        model_results_dict_new['MAPE'],
        model_results_dict_all['MAPE'],
        MAPE_diff,
    ))
    print('RMSE difference: {} - {} = {}\n'.format(
        model_results_dict_new['RMSE'],
        model_results_dict_all['RMSE'],
        RMSE_diff,
    ))

############### Excluded feature category: distance_features ###############
Number of excluded features: 11
Number of used features: 27

CV MAPE (scaled) train data:  5.1110999999999995 %
CV RMSE (scaled) train data: 1.2144
CV computation time: 8m, 57s

MAPE difference: 5.1110999999999995 - 2.8642999999999996 = 2.2468 %
RMSE difference: 1.2144 - 0.7637 = 0.4506999999999999

############### Excluded feature category: cluster_features ###############
Number of excluded features: 9
Number of used features: 29

CV MAPE (scaled) train data:  2.9232 %
CV RMSE (scaled) train data: 0.7813
CV computation time: 7m, 33s

MAPE difference: 2.9232 - 2.8642999999999996 = 0.058900000000000396 %
RMSE difference: 0.7813 - 0.7637 = 0.01759999999999995

############### Excluded feature category: statistical_features ###############
Number of excluded features: 9
Number of used features: 29

CV MAPE (scaled) train data:  3.3824 %
CV RMSE (scaled) train data: 0.9214
CV computation time: 8m, 22s

MAPE diffe