In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import Markdown

from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Import helperfunctions
from ML_functions import fun_load_data, fun_preprocessing, fun_load_best_params
from ML_functions import fun_scores

# Assign string "TSP" or "CVRP" to the following variable to define the optimization problem
optimization_problem = "TSP"

# Fail fast on a typo: the value is used further down to load the stored best parameters
# and to build the paths of the top-20 feature file and the Excel result file
if optimization_problem not in ("TSP", "CVRP"):
    raise ValueError(f'optimization_problem must be "TSP" or "CVRP", got {optimization_problem!r}')

# Value handed to fun_preprocessing as train_size (presumably the share of the data used for training)
TRAIN_SIZE = 0.8

# Load data (the second return value is not needed in this notebook)
data, _ = fun_load_data(optimization_problem)

# Do the train test split during the preprocessing
X_train, X_test, y_train, y_test, train_data = fun_preprocessing(data, train_size=TRAIN_SIZE)

# **Feature Importance**
### **1. Importance of feature categories**
**Compute train and test scores with all features: Neural Network - Multi-Layer Perceptron**

In [2]:
# Stored best parameters for the neural network ("NN") on the selected optimization problem
best_params = fun_load_best_params(optimization_problem, model_abbreviation="NN")

# Scaler followed by the MLP regressor; the loaded parameters are applied on top of the
# fixed settings below. make_pipeline names each step after its lowercased class name,
# so parameter keys have to use the "mlpregressor__" prefix.
pipe = make_pipeline(
    StandardScaler(),
    MLPRegressor(
        hidden_layer_sizes=(256, 128, 64),
        activation="relu",
        learning_rate="adaptive",
        max_iter=1000,
        random_state=42,
    ),
).set_params(**best_params)

# Baseline with all features: cross-validation on the train set plus scores on the test set
# (scoring: MAPE and RMSE)
model_results_dict_all = fun_scores(
    pipe, X_train, y_train, X_test, y_test, compute_test_scores=True
)

{'mlpregressor__alpha': 0.1,
 'mlpregressor__batch_size': 32,
 'mlpregressor__early_stopping': False,
 'mlpregressor__learning_rate_init': 0.001,
 'mlpregressor__solver': 'sgd'}

CV MAPE (scaled) train data: 3.52 %
CV RMSE (scaled) train data: 0.96
CV computation time: 12m, 20s

MAPE (scaled) test data: 3.3000000000000003 %
RMSE (scaled) test data: 0.9
Model fit time: 23m, 38s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.32,2.39,2.74,2.81,3.11,3.24,3.59,3.94,4.17,3.3
RMSE,0.8,0.85,0.85,0.85,0.86,0.89,0.92,0.98,0.96,0.9


**Exclude feature categories**

In [3]:
# Show all column names of the training data. The feature lists defined below are later
# used to select columns from X_train / X_test, so their entries presumably have to match
# these names exactly (assumes X_train uses the same column labels - TODO confirm).
display(train_data.columns)

# Feature categories: each list holds the column names that belong to one category
instance_features = [
    "Number Customers",
    "X Ratio",
    "Y Ratio",
    "X Depot",
    "Y Depot",
]
distance_features = [
    "Depot Distance Ratio",
    "Closest Customer Distance (CCD) Ratio",
    "2nd CCD Ratio",
    "3rd CCD Ratio",
    "4th CCD Ratio",
    "5th CCD Ratio",
    "6th CCD Ratio",
    "7th CCD Ratio",
    "8th CCD Ratio",
    "Mean Distance To Other Customers Ratio",
    "Gravity Center Distance Ratio",
]
# Not included: "Cluster", "Distance To Closest Other Cluster Ratio"
cluster_features = [
    "Number Clusters",
    "Cluster Size",
    "X Centroid",
    "Y Centroid",
    "Centroid Distance To Depot Ratio",
    "Cluster Area Ratio",
    "Cluster Density Ratio",
]
statistical_features = [
    "X Std",
    "Y Std",
    "X Max",
    "Y Max",
    "X Min",
    "Y Min",
    "Correlation",
    "Skewness X",
    "Skewness Y",
]
# Not included: "Shapley Value"
cost_features = [
    "Savings Ratio",
    "Marginal Cost Ratio",
    "Total Cost",
]

# Flat list of every feature, in category order
all_features = [
    *instance_features,
    *distance_features,
    *cluster_features,
    *statistical_features,
    *cost_features,
]

# Only these three categories are registered for the exclusion experiment
feature_categories_dict = {
    "Distance features": distance_features,
    "Cluster features": cluster_features,
    "Statistical features": statistical_features,
}

Index(['Number Customers', 'X Ratio', 'Y Ratio', 'X Depot', 'Y Depot',
       'Depot Distance Ratio', 'Closest Customer Distance (CCD) Ratio',
       '2nd CCD Ratio', '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio',
       '6th CCD Ratio', '7th CCD Ratio', '8th CCD Ratio',
       'Mean Distance To Other Customers Ratio',
       'Gravity Center Distance Ratio', 'Number Clusters', 'Cluster Size',
       'X Centroid', 'Y Centroid', 'Centroid Distance To Depot Ratio',
       'Cluster Area Ratio', 'Cluster Density Ratio', 'X Std', 'Y Std',
       'X Max', 'Y Max', 'X Min', 'Y Min', 'Correlation', 'Skewness X',
       'Skewness Y', 'Savings Ratio', 'Marginal Cost Ratio', 'Total Cost',
       'Shapley Value'],
      dtype='object')

In [4]:
# Register the baseline (all features) as the first entry of the results dictionary.
# NOTE: model_results_dict_all is extended in place and stored by reference, so the keys added
# here (and any later change to results_dict1["All features"]) are visible through
# model_results_dict_all as well.
model_results_dict_all["MAPE difference"] = None
model_results_dict_all["RMSE difference"] = None
model_results_dict_all["Used features"] = len(all_features)
results_dict1 = {"All features": model_results_dict_all}

# Baseline test scores; they do not change inside the loop
mape_all = model_results_dict_all["MAPE"]["Test data"]
rmse_all = model_results_dict_all["RMSE"]["Test data"]

# Exclude each registered feature category in turn and re-evaluate the model
# (cross-validation on the train set plus scores on the test set)
for key, excluded_features in feature_categories_dict.items():
    display(Markdown(f"**############### Excluded feature category: {key} ###############**"))

    # Select only the used features in the train and test set (original feature order is kept)
    excluded_lookup = set(excluded_features)
    used_features = [feature for feature in all_features if feature not in excluded_lookup]
    X_train_small = X_train[used_features]
    X_test_small = X_test[used_features]
    print("Number of excluded features:", len(excluded_features))
    print("Number of used features: {}\n".format(len(used_features)))

    # Clone the pipeline to get an unfitted version
    pipe = clone(pipe)

    # Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train, X_test_small, y_test, compute_test_scores=True)

    # Compare the new test scores with the test scores of the model that uses all features
    mape_new = model_results_dict_new["MAPE"]["Test data"]
    rmse_new = model_results_dict_new["RMSE"]["Test data"]
    MAPE_diff = np.round(mape_new - mape_all, 2)
    RMSE_diff = np.round(rmse_new - rmse_all, 2)
    model_results_dict_new["MAPE difference"] = MAPE_diff
    model_results_dict_new["RMSE difference"] = RMSE_diff

    # Round the operands for display only, so float noise such as 3.3000000000000003 is not printed
    print("\nMAPE difference: {} - {} = {} %".format(np.round(mape_new, 2), np.round(mape_all, 2), MAPE_diff))
    print("RMSE difference: {} - {} = {}\n".format(np.round(rmse_new, 2), np.round(rmse_all, 2), RMSE_diff))
    model_results_dict_new["Used features"] = len(used_features)

    # Add the dictionary to the results dictionary
    results_dict1[key] = model_results_dict_new

**############### Excluded feature category: Distance features ###############**

Number of excluded features: 11
Number of used features: 24

CV MAPE (scaled) train data: 6.460000000000001 %
CV RMSE (scaled) train data: 1.54
CV computation time: 14m, 48s

MAPE (scaled) test data: 5.86 %
RMSE (scaled) test data: 1.45
Model fit time: 26m, 41s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,5.03,4.66,5.61,4.97,5.84,5.5,6.25,6.03,7.35,5.86
RMSE,1.62,1.46,1.43,1.37,1.4,1.41,1.43,1.46,1.5,1.45



MAPE difference: 5.86 - 3.3000000000000003 = 2.56 %
RMSE difference: 1.45 - 0.9 = 0.55



**############### Excluded feature category: Cluster features ###############**

Number of excluded features: 7
Number of used features: 28

CV MAPE (scaled) train data: 3.53 %
CV RMSE (scaled) train data: 0.96
CV computation time: 10m, 21s

MAPE (scaled) test data: 3.34 %
RMSE (scaled) test data: 0.9
Model fit time: 17m, 12s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.29,2.47,2.71,2.85,3.06,3.28,3.64,4.0,4.25,3.34
RMSE,0.82,0.85,0.86,0.86,0.87,0.89,0.92,0.98,0.94,0.9



MAPE difference: 3.34 - 3.3000000000000003 = 0.04 %
RMSE difference: 0.9 - 0.9 = 0.0



**############### Excluded feature category: Statistical features ###############**

Number of excluded features: 9
Number of used features: 26

CV MAPE (scaled) train data: 4.17 %
CV RMSE (scaled) train data: 1.17
CV computation time: 12m, 24s

MAPE (scaled) test data: 3.94 %
RMSE (scaled) test data: 1.11
Model fit time: 21m, 46s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.72,2.51,3.3,3.41,3.59,3.93,4.43,4.69,4.99,3.94
RMSE,0.98,0.95,1.08,1.07,1.02,1.12,1.16,1.23,1.19,1.11



MAPE difference: 3.94 - 3.3000000000000003 = 0.64 %
RMSE difference: 1.11 - 0.9 = 0.21



**Summarize results** (the tables are written to the Excel file at the end of the notebook)

In [5]:
# Remove the dictionary keys that are not needed for the summary tables.
# pop(key, None) keeps this cell re-runnable: a plain pop(key) raises a KeyError on the second
# run because the keys are already gone. The "All features" entry is the same object as
# model_results_dict_all, so the keys are removed from that dictionary as well.
for model_results in results_dict1.values():
    for key in ["CV computation time", "Model fit time", "Model prediction time", "Scores per instance size"]:
        model_results.pop(key, None)
display(results_dict1)

# Get all the scores from the results dictionary (one entry per evaluated feature set)
used_features_list = [str(res["Used features"]) for res in results_dict1.values()]
MAPE_test_scores = [res["MAPE"]["Test data"] for res in results_dict1.values()]
RMSE_test_scores = [res["RMSE"]["Test data"] for res in results_dict1.values()]
MAPE_test_deltas = [res["MAPE difference"] for res in results_dict1.values()]
RMSE_test_deltas = [res["RMSE difference"] for res in results_dict1.values()]

# Derive the column labels from the dictionary keys so labels and values cannot get out of sync:
# "All features" stays as it is, "<Category> features" becomes "No <category> features"
column_labels = [name if name == "All features" else f"No {name[0].lower()}{name[1:]}"
                 for name in results_dict1]

# Save all results in Data Frames
df_mape1 = pd.DataFrame(data=[used_features_list, MAPE_test_scores, MAPE_test_deltas],
                        index=["Number of used features", "MAPE test scores", "MAPE test difference"],
                        columns=column_labels)

df_rmse1 = pd.DataFrame(data=[used_features_list, RMSE_test_scores, RMSE_test_deltas],
                        index=["Number of used features", "RMSE test scores", "RMSE test difference"],
                        columns=column_labels)
display(df_mape1, df_rmse1)

{'All features': {'MAPE': {'Train data': 3.52,
   'Test data': 3.3000000000000003},
  'RMSE': {'Train data': 0.96, 'Test data': 0.9},
  'MAPE difference': None,
  'RMSE difference': None,
  'Used features': 35},
 'Distance features': {'MAPE': {'Train data': 6.460000000000001,
   'Test data': 5.86},
  'RMSE': {'Train data': 1.54, 'Test data': 1.45},
  'MAPE difference': 2.56,
  'RMSE difference': 0.55,
  'Used features': 24},
 'Cluster features': {'MAPE': {'Train data': 3.53, 'Test data': 3.34},
  'RMSE': {'Train data': 0.96, 'Test data': 0.9},
  'MAPE difference': 0.04,
  'RMSE difference': 0.0,
  'Used features': 28},
 'Statistical features': {'MAPE': {'Train data': 4.17, 'Test data': 3.94},
  'RMSE': {'Train data': 1.17, 'Test data': 1.11},
  'MAPE difference': 0.64,
  'RMSE difference': 0.21,
  'Used features': 26}}

Unnamed: 0,All features,No distance features,No cluster features,No statistical features
Number of used features,35.0,24.0,28.0,26.0
MAPE test scores,3.3,5.86,3.34,3.94
MAPE test difference,,2.56,0.04,0.64


Unnamed: 0,All features,No distance features,No cluster features,No statistical features
Number of used features,35.0,24.0,28.0,26.0
RMSE test scores,0.9,1.45,0.9,1.11
RMSE test difference,,0.55,0.0,0.21


### **2. Top 20 features**

In [6]:
# Create a dictionary to store all the dictionaries with the results.
# The baseline entry needs the two "difference" keys because the summary cell reads them from
# every entry; setdefault adds them only if they are not present yet, so this section does not
# depend on the category-exclusion section having been run first.
model_results_dict_all.setdefault("MAPE difference", None)
model_results_dict_all.setdefault("RMSE difference", None)
results_dict2 = {"All features": model_results_dict_all}

# Load most important features from script "feature_selection.ipynb"
# NOTE(review): list(DataFrame) yields the column labels, i.e. the feature names are taken from
# the header row of the file - confirm that the file stores them that way
top20_features = list(pd.read_csv(f"02_best_features/{optimization_problem}_top20_features"))

# Select only the most important features in the train and test set
X_train_small = X_train[top20_features]
X_test_small = X_test[top20_features]

# Clone the pipeline to get an unfitted version
pipe = clone(pipe)

# Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
model_results_dict_new = fun_scores(pipe, X_train_small, y_train, X_test_small, y_test, compute_test_scores=True)

# Compare the new test scores with the test scores of the model that uses all features
mape_all = model_results_dict_all["MAPE"]["Test data"]
rmse_all = model_results_dict_all["RMSE"]["Test data"]
mape_new = model_results_dict_new["MAPE"]["Test data"]
rmse_new = model_results_dict_new["RMSE"]["Test data"]
MAPE_diff = np.round(mape_new - mape_all, 2)
RMSE_diff = np.round(rmse_new - rmse_all, 2)
model_results_dict_new["MAPE difference"] = MAPE_diff
model_results_dict_new["RMSE difference"] = RMSE_diff

# Round the operands for display only, so float noise such as 3.3000000000000003 is not printed
print("\nMAPE difference: {} - {} = {} %".format(np.round(mape_new, 2), np.round(mape_all, 2), MAPE_diff))
print("RMSE difference: {} - {} = {}\n".format(np.round(rmse_new, 2), np.round(rmse_all, 2), RMSE_diff))

# Record the number of used features, as the category-exclusion loop does for its entries
model_results_dict_new["Used features"] = len(top20_features)

# Add the dictionary to the results dictionary
results_dict2["Top20"] = model_results_dict_new

CV MAPE (scaled) train data: 5.140000000000001 %
CV RMSE (scaled) train data: 1.46
CV computation time: 14m, 3s

MAPE (scaled) test data: 4.87 %
RMSE (scaled) test data: 1.39
Model fit time: 20m, 58s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,3.56,4.1,4.41,4.66,4.8,4.76,4.95,5.29,5.92,4.87
RMSE,1.44,1.48,1.42,1.4,1.41,1.34,1.32,1.37,1.39,1.39



MAPE difference: 4.87 - 3.3000000000000003 = 1.57 %
RMSE difference: 1.39 - 0.9 = 0.49



**Save results**

In [7]:
# Get all the scores from the results dictionary (one entry per evaluated feature set)
MAPE_test_scores = [res["MAPE"]["Test data"] for res in results_dict2.values()]
RMSE_test_scores = [res["RMSE"]["Test data"] for res in results_dict2.values()]
MAPE_test_deltas = [res["MAPE difference"] for res in results_dict2.values()]
RMSE_test_deltas = [res["RMSE difference"] for res in results_dict2.values()]

# Save all results in Data Frames
df_mape2 = pd.DataFrame(data=[MAPE_test_scores, MAPE_test_deltas],
                        index=["MAPE test scores", "MAPE test difference"],
                        columns=["All features", "Top 20 features"])

df_rmse2 = pd.DataFrame(data=[RMSE_test_scores, RMSE_test_deltas],
                        index=["RMSE test scores", "RMSE test difference"],
                        columns=["All features", "Top 20 features"])
display(df_mape2, df_rmse2)

# Save data frames with results into an excel file
file_path = f"04_test_results/{optimization_problem}_feature_analysis.xlsx"

# Create the output folder if it is missing; ExcelWriter cannot write into a folder that does not exist
Path(file_path).parent.mkdir(parents=True, exist_ok=True)

# Use ExcelWriter to write multiple DataFrames to the same file
# (df_mape1 / df_rmse1 come from the feature-category section above)
with pd.ExcelWriter(file_path) as writer:
    df_mape1.to_excel(writer, sheet_name="MAPE_scores_categories")
    df_rmse1.to_excel(writer, sheet_name="RMSE_scores_categories")
    df_mape2.to_excel(writer, sheet_name="MAPE_scores_top20")
    df_rmse2.to_excel(writer, sheet_name="RMSE_scores_top20")

Unnamed: 0,All features,Top 20 features
MAPE test scores,3.3,4.87
MAPE test difference,,1.57


Unnamed: 0,All features,Top 20 features
RMSE test scores,0.9,1.39
RMSE test difference,,0.49
