In [1]:
import pandas as pd
import numpy as np
from IPython.display import Markdown

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPRegressor

# Import helper functions
from ML_functions import fun_load_data, fun_save_file, fun_preprocessing, fun_load_best_params
from ML_functions import fun_scores

# Select the optimization problem to analyse: assign either "TSP" or "CVRP".
# The value is passed to the data/parameter loaders and is part of the result file names below.
optimization_problem = "TSP"

# Load the data for the selected problem.
# fun_load_data returns two values; only the first one is used in this notebook.
data, _ = fun_load_data(optimization_problem)

# Preprocess the data and perform the train/test split in the same step (train_size=0.8).
# Besides the X/y train and test parts, the helper also returns train_data, whose columns are
# displayed further below to list the available features.
X_train, X_test, y_train, y_test, train_data = fun_preprocessing(data, train_size=0.8)

# **Feature Importance**
### **1. Importance of feature categories**
**Compute train and test scores with all features: Neural Network - Multi-Layer Perceptron**

In [3]:
# Fetch the tuned hyperparameters of the neural network ("NN") for the selected problem
best_params = fun_load_best_params(optimization_problem, model_abbreviation="NN")

# Build the two pipeline steps separately: feature scaling, then the multi-layer perceptron
feature_scaler = StandardScaler()
mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(256, 128, 64),
    activation="relu",
    learning_rate="adaptive",
    max_iter=1000,
    random_state=42,
)

# Chain the steps and overwrite the defaults with the tuned hyperparameters
pipe = make_pipeline(feature_scaler, mlp_regressor)
pipe.set_params(**best_params)

# Baseline with all features: cross-validated scores on the train set plus scores on the test set
# (scoring: MAPE and RMSE)
model_results_dict_all = fun_scores(
    pipe, X_train, y_train, X_test, y_test, compute_test_scores=True
)

{'mlpregressor__alpha': 0.1,
 'mlpregressor__batch_size': 32,
 'mlpregressor__early_stopping': False,
 'mlpregressor__learning_rate_init': 0.001,
 'mlpregressor__solver': 'sgd'}

CV MAPE (scaled) train data:  3.5156 %
CV RMSE (scaled) train data: 0.9565
CV computation time: 19m, 55s

MAPE (scaled) test data:  3.3343 %
RMSE (scaled) test data: 0.8999
Model fit time: 25m, 30s
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.2156,2.3114,2.6244,2.8952,3.1252,3.2157,3.7293,3.934,4.3603,3.3343
RMSE,0.8039,0.8375,0.8582,0.8582,0.8375,0.8806,0.9356,0.9706,0.9739,0.8999


**Exclude feature categories**

In [3]:
# Show every column of the training data
display(train_data.columns)

# Feature categories, each listed with the features that belong to it
instance_features = [
    "Instance ID",
    "Number Customers",
    "X Ratio",
    "Y Ratio",
    "X Depot",
    "Y Depot",
]
distance_features = [
    "Depot Distance Ratio",
    "Closest Customer Distance (CCD) Ratio",
    "2nd CCD Ratio",
    "3rd CCD Ratio",
    "4th CCD Ratio",
    "5th CCD Ratio",
    "6th CCD Ratio",
    "7th CCD Ratio",
    "8th CCD Ratio",
    "Mean Distance To Other Customers Ratio",
    "Gravity Center Distance Ratio",
]
cluster_features = [
    "Cluster",
    "Number Clusters",
    "Cluster Size",
    "X Centroid",
    "Y Centroid",
    "Centroid Distance To Depot Ratio",
    "Distance To Closest Other Cluster Ratio",
    "Cluster Area Ratio",
    "Cluster Density Ratio",
]
statistical_features = [
    "X Std",
    "Y Std",
    "X Max",
    "Y Max",
    "X Min",
    "Y Min",
    "Correlation",
    "Skewness X",
    "Skewness Y",
]
# The "Shapley Value" column is not part of this list
cost_features = ["Savings Ratio", "Marginal Costs Ratio", "Total Costs"]

# Flat list of all features (category order preserved) ...
all_features = [
    feature
    for category in (instance_features, distance_features, cluster_features, statistical_features, cost_features)
    for feature in category
]

# ... and the three categories that are excluded one at a time in the next cell
feature_categories_dict = {
    "Distance features": distance_features,
    "Cluster features": cluster_features,
    "Statistical features": statistical_features,
}

Index(['Instance ID', 'Number Customers', 'X Ratio', 'Y Ratio', 'X Depot',
       'Y Depot', 'Depot Distance Ratio',
       'Closest Customer Distance (CCD) Ratio', '2nd CCD Ratio',
       '3rd CCD Ratio', '4th CCD Ratio', '5th CCD Ratio', '6th CCD Ratio',
       '7th CCD Ratio', '8th CCD Ratio',
       'Mean Distance To Other Customers Ratio',
       'Gravity Center Distance Ratio', 'Cluster', 'Number Clusters',
       'Cluster Size', 'X Centroid', 'Y Centroid',
       'Centroid Distance To Depot Ratio',
       'Distance To Closest Other Cluster Ratio', 'Cluster Area Ratio',
       'Cluster Density Ratio', 'X Std', 'Y Std', 'X Max', 'Y Max', 'X Min',
       'Y Min', 'Correlation', 'Skewness X', 'Skewness Y', 'Savings Ratio',
       'Marginal Costs Ratio', 'Total Costs', 'Shapley Value'],
      dtype='object')

In [4]:
# Add the summary entries to the baseline results (this modifies model_results_dict_all in place)
# and start the collection of all result dictionaries with it
model_results_dict_all.update({"MAPE difference": None, "RMSE difference": None, "Used features": len(all_features)})
results_dict1 = {"All features": model_results_dict_all}

# Leave out one feature category at a time and score the model on the remaining features
for key, excluded_features in feature_categories_dict.items():
    display(Markdown(f"**############### Excluded feature category: {key} ###############**"))

    # Restrict the train and test set to the features outside the excluded category
    used_features = [feature for feature in all_features if feature not in excluded_features]
    X_train_small, X_test_small = X_train[used_features], X_test[used_features]
    print("Number of excluded features:", len(excluded_features))
    print(f"Number of used features: {len(used_features)}\n")

    # The pipeline has to be fitted again on the reduced feature set to avoid an error;
    # afterwards get the cross-validation scores on the train set and the scores on the test set (MAPE and RMSE)
    pipe.fit(X_train_small, y_train)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train, X_test_small, y_test, compute_test_scores=True)

    # Test scores of this run next to the test scores obtained with all features
    mape_new, mape_all = model_results_dict_new["MAPE"]["Test data"], model_results_dict_all["MAPE"]["Test data"]
    rmse_new, rmse_all = model_results_dict_new["RMSE"]["Test data"], model_results_dict_all["RMSE"]["Test data"]
    MAPE_diff = np.round(mape_new - mape_all, 4)
    RMSE_diff = np.round(rmse_new - rmse_all, 4)
    print(f"\nMAPE difference: {mape_new} - {mape_all} = {MAPE_diff} %")
    print(f"RMSE difference: {rmse_new} - {rmse_all} = {RMSE_diff}\n")

    # Attach the comparison to the results of this run and file them under the excluded category
    model_results_dict_new.update({"MAPE difference": MAPE_diff, "RMSE difference": RMSE_diff, "Used features": len(used_features)})
    results_dict1[key] = model_results_dict_new

**############### Excluded feature category: Distance features ###############**

Number of excluded features: 11
Number of used features: 27

CV MAPE (scaled) train data:  6.5481 %
CV RMSE (scaled) train data: 1.5714
CV computation time: 23m, 42s

MAPE (scaled) test data:  5.915 %
RMSE (scaled) test data: 1.4649
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,5.3563,4.673,5.536,5.1717,5.5691,5.5747,6.1454,6.1907,7.531,5.915
RMSE,1.7305,1.5105,1.4342,1.3911,1.4086,1.4051,1.4193,1.4863,1.4852,1.4649



MAPE difference: 5.915 - 3.3343 = 2.5807 %
RMSE difference: 1.4649 - 0.8999 = 0.565



**############### Excluded feature category: Cluster features ###############**

Number of excluded features: 9
Number of used features: 29

CV MAPE (scaled) train data:  3.5525 %
CV RMSE (scaled) train data: 0.9653
CV computation time: 15m, 59s

MAPE (scaled) test data:  3.3785999999999996 %
RMSE (scaled) test data: 0.9087
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.3671,2.4101,2.6994,2.8498,3.1543,3.3627,3.6974,4.0609,4.2904,3.3786
RMSE,0.8578,0.8526,0.8602,0.8319,0.8776,0.8961,0.9217,0.9975,0.9636,0.9087



MAPE difference: 3.3785999999999996 - 3.3343 = 0.0443 %
RMSE difference: 0.9087 - 0.8999 = 0.0088



**############### Excluded feature category: Statistical features ###############**

Number of excluded features: 9
Number of used features: 29

CV MAPE (scaled) train data:  4.1996 %
CV RMSE (scaled) train data: 1.1731
CV computation time: 17m, 10s

MAPE (scaled) test data:  3.9191999999999996 %
RMSE (scaled) test data: 1.1116
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,2.5819,2.7048,3.3323,3.3242,3.6571,3.9755,4.3237,4.6195,4.9638,3.9192
RMSE,0.9435,1.0185,1.0772,1.0512,1.0475,1.1272,1.1526,1.2005,1.1878,1.1116



MAPE difference: 3.9191999999999996 - 3.3343 = 0.5849 %
RMSE difference: 1.1116 - 0.8999 = 0.2117



**Save results**

In [5]:
# Drop the entries that are not part of the result tables.
# pop() gets a default so that the cell can be re-run and so that a result dictionary lacking one
# of these keys does not raise a KeyError.
# NOTE(review): the printed output of the reduced-feature runs contains no "Model fit time" line,
# so that key may be absent for them - confirm against fun_scores.
keys_to_remove = ["CV computation time", "Model fit time", "Model prediction time", "Scores per instance size"]
for scores in results_dict1.values():
    for key in keys_to_remove:
        scores.pop(key, None)
display(results_dict1)

# Collect the scores of every run; the first entry is the "All features" baseline
result_dicts = list(results_dict1.values())
used_features_list = [str(scores["Used features"]) for scores in result_dicts]
MAPE_test_scores = [np.round(scores["MAPE"]["Test data"], 2) for scores in result_dicts]
RMSE_test_scores = [scores["RMSE"]["Test data"] for scores in result_dicts]
# The baseline has no difference (None), so only the remaining entries are rounded
MAPE_test_deltas = [None] + list(np.round([scores["MAPE difference"] for scores in result_dicts[1:]], 2))
RMSE_test_deltas = [scores["RMSE difference"] for scores in result_dicts]

# Save all results in data frames (one column per run, in the order of results_dict1)
column_labels = ["All features", "No distance features", "No cluster features", "No statistical features"]

df_mape1 = pd.DataFrame(data=[used_features_list, MAPE_test_scores, MAPE_test_deltas],
                        index=["Number of used features", "MAPE test scores", "MAPE test difference"],
                        columns=column_labels)

df_rmse1 = pd.DataFrame(data=[used_features_list, RMSE_test_scores, RMSE_test_deltas],
                        index=["Number of used features", "RMSE test scores", "RMSE test difference"],
                        columns=column_labels)
display(df_mape1, df_rmse1)

# Save the data frames with the results into one excel file
file_path = f"04_test_results/{optimization_problem}_feature_category_importance.xlsx"

# Use ExcelWriter to write multiple data frames to the same file (one sheet each)
with pd.ExcelWriter(file_path) as writer:
    df_mape1.to_excel(writer, sheet_name="MAPE_scores")
    df_rmse1.to_excel(writer, sheet_name="RMSE_scores")

{'All features': {'MAPE': {'Train data': 3.5156, 'Test data': 3.3343},
  'RMSE': {'Train data': 0.9565, 'Test data': 0.8999},
  'MAPE Difference': None,
  'RMSE Difference': None,
  'Used Features': 38},
 'Distance features': {'MAPE': {'Train data': 6.5481, 'Test data': 5.915},
  'RMSE': {'Train data': 1.5714, 'Test data': 1.4649},
  'MAPE Difference': 2.5807,
  'RMSE Difference': 0.565,
  'Used Features': 27},
 'Cluster features': {'MAPE': {'Train data': 3.5525,
   'Test data': 3.3785999999999996},
  'RMSE': {'Train data': 0.9653, 'Test data': 0.9087},
  'MAPE Difference': 0.0443,
  'RMSE Difference': 0.0088,
  'Used Features': 29},
 'Statistical features': {'MAPE': {'Train data': 4.1996,
   'Test data': 3.9191999999999996},
  'RMSE': {'Train data': 1.1731, 'Test data': 1.1116},
  'MAPE Difference': 0.5849,
  'RMSE Difference': 0.2117,
  'Used Features': 29}}

Unnamed: 0,All Features,No Distance Features,No Cluster Features,No Statistical Features
Number of used features,38.0,27.0,29.0,29.0
MAPE test scores,3.33,5.92,3.38,3.92
MAPE test difference,,2.58,0.04,0.58


Unnamed: 0,All Features,No Distance Features,No Cluster Features,No Statistical Features
Number of used features,38.0,27.0,29.0,29.0
RMSE test scores,0.8999,1.4649,0.9087,1.1116
RMSE test difference,,0.565,0.0088,0.2117


### **2. Top 20 features**

In [None]:
# Start a second collection of result dictionaries, again with the all-features baseline first
results_dict2 = {"All features": model_results_dict_all}

# Load the most important features from script "feature_selection.ipynb": the feature names are the
# column labels of the stored file ("Instance ID" is always added for the scaling part)
top_features_file = f"02_best_features/{optimization_problem}_top_features"
top_features = pd.read_csv(top_features_file).columns.tolist() + ["Instance ID"]

# Keep only the most important features in the train and test set
X_train_small, X_test_small = X_train[top_features], X_test[top_features]

# Fresh pipeline: feature scaling followed by the multi-layer perceptron with the tuned hyperparameters
mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(256, 128, 64),
    activation="relu",
    learning_rate="adaptive",
    max_iter=1000,
    random_state=42,
)
pipe = make_pipeline(StandardScaler(), mlp_regressor)
pipe.set_params(**best_params)

# Cross-validation scores on the train set and scores on the test set (scoring: MAPE and RMSE)
model_results_dict_new = fun_scores(pipe, X_train_small, y_train, X_test_small, y_test, compute_test_scores=True)

# Test scores of this run next to the test scores obtained with all features
mape_new, mape_all = model_results_dict_new["MAPE"]["Test data"], model_results_dict_all["MAPE"]["Test data"]
rmse_new, rmse_all = model_results_dict_new["RMSE"]["Test data"], model_results_dict_all["RMSE"]["Test data"]
MAPE_diff = np.round(mape_new - mape_all, 4)
RMSE_diff = np.round(rmse_new - rmse_all, 4)
model_results_dict_new.update({"MAPE difference": MAPE_diff, "RMSE difference": RMSE_diff})
print(f"\nMAPE difference: {mape_new} - {mape_all} = {MAPE_diff} %")
print(f"RMSE difference: {rmse_new} - {rmse_all} = {RMSE_diff}\n")

# File the results of this run in the results dictionary
results_dict2["Top20"] = model_results_dict_new

### **3. Principal component analysis (PCA)**

In [None]:
# Pipeline: feature scaling, PCA reducing the data to n_components, then the multi-layer perceptron
pca_step = PCA(n_components=10)
mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(256, 128, 64),
    activation="relu",
    learning_rate="adaptive",
    max_iter=1000,
    random_state=42,
)
pipe = make_pipeline(StandardScaler(), pca_step, mlp_regressor)
pipe.set_params(**best_params)

# Cross-validation scores on the train set and scores on the test set (scoring: MAPE and RMSE)
model_results_dict_new = fun_scores(pipe, X_train, y_train, X_test, y_test, compute_test_scores=True)

# Test scores of this run next to the test scores obtained with all features
mape_new, mape_all = model_results_dict_new["MAPE"]["Test data"], model_results_dict_all["MAPE"]["Test data"]
rmse_new, rmse_all = model_results_dict_new["RMSE"]["Test data"], model_results_dict_all["RMSE"]["Test data"]
MAPE_diff = np.round(mape_new - mape_all, 4)
RMSE_diff = np.round(rmse_new - rmse_all, 4)
model_results_dict_new.update({"MAPE difference": MAPE_diff, "RMSE difference": RMSE_diff})
print(f"\nMAPE difference: {mape_new} - {mape_all} = {MAPE_diff} %")
print(f"RMSE difference: {rmse_new} - {rmse_all} = {RMSE_diff}\n")

# File the results of this run in the results dictionary
results_dict2["PCA"] = model_results_dict_new

CV MAPE (scaled) train data:  13.780800000000001 %
CV RMSE (scaled) train data: 3.8533
CV computation time: 27m, 16s

MAPE (scaled) test data:  12.9055 %
RMSE (scaled) test data: 3.5008
Model prediction time: 0s


**MAPE and RMSE on test data per instance size:**

Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,11.5882,12.026,11.8583,12.8728,12.7622,13.2019,12.6856,13.1309,14.3778,12.9055
RMSE,4.4515,3.9392,3.8939,3.7653,3.4448,3.3983,3.0937,3.0312,3.2358,3.5008



MAPE difference: 12.9055 - 3.3343 = 9.5712 %
RMSE difference: 3.5008 - 0.8999 = 2.6009



**Save results**

In [12]:
# Collect the scores of every run; the first entry is the "All features" baseline
result_dicts = list(results_dict2.values())
MAPE_test_scores = [np.round(scores["MAPE"]["Test data"], 2) for scores in result_dicts]
RMSE_test_scores = [scores["RMSE"]["Test data"] for scores in result_dicts]

# The baseline has no difference to itself, so None is put in front of the differences of the other runs.
# Both metrics are handled the same way; the baseline dictionary is therefore not required to carry a
# "MAPE difference"/"RMSE difference" entry (those are only added to it in the feature category section).
MAPE_test_deltas = [None] + list(np.round([scores["MAPE difference"] for scores in result_dicts[1:]], 2))
RMSE_test_deltas = [None] + [scores["RMSE difference"] for scores in result_dicts[1:]]

# Save all results in data frames (one column per run, in the order of results_dict2)
column_labels = ["All features", "Top 20 features", "PCA"]

df_mape2 = pd.DataFrame(data=[MAPE_test_scores, MAPE_test_deltas],
                        index=["MAPE test scores", "MAPE test difference"],
                        columns=column_labels)

df_rmse2 = pd.DataFrame(data=[RMSE_test_scores, RMSE_test_deltas],
                        index=["RMSE test scores", "RMSE test difference"],
                        columns=column_labels)
display(df_mape2, df_rmse2)

# Save the data frames with the results into one excel file
file_path = f"04_test_results/{optimization_problem}_top20_pca.xlsx"

# Use ExcelWriter to write multiple data frames to the same file (one sheet each)
with pd.ExcelWriter(file_path) as writer:
    df_mape2.to_excel(writer, sheet_name="MAPE_scores")
    df_rmse2.to_excel(writer, sheet_name="RMSE_scores")

Unnamed: 0,All Features,Top 20 Features,PCA
MAPE test scores,3.33,6.25,12.91
MAPE test difference,,2.92,9.57


Unnamed: 0,All Features,Top 20 Features,PCA
RMSE test scores,0.8999,1.7862,3.5008
RMSE test difference,,0.8863,2.6009


**Further tests: PCA with 20 components**

In [None]:
# Number of principal components used in this additional experiment
pca_components_further = 20

# Create pipeline: feature scaling, PCA reducing the data to the chosen number of components,
# then the multi-layer perceptron with the tuned hyperparameters
pipe = make_pipeline(StandardScaler(),
                     PCA(n_components=pca_components_further),
                     MLPRegressor(hidden_layer_sizes=(256, 128, 64), activation="relu", learning_rate="adaptive",
                                  max_iter=1000, random_state=42))
pipe.set_params(**best_params)

# Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
model_results_dict_new = fun_scores(pipe, X_train, y_train, X_test, y_test, compute_test_scores=True)

# Compare the new results with the results of all features
MAPE_diff = np.round(model_results_dict_new["MAPE"]["Test data"] - model_results_dict_all["MAPE"]["Test data"], 4)
RMSE_diff = np.round(model_results_dict_new["RMSE"]["Test data"] - model_results_dict_all["RMSE"]["Test data"], 4)
model_results_dict_new["MAPE difference"] = MAPE_diff
model_results_dict_new["RMSE difference"] = RMSE_diff
print("\nMAPE difference: {} - {} = {} %".format(model_results_dict_new["MAPE"]["Test data"], model_results_dict_all["MAPE"]["Test data"], MAPE_diff))
print("RMSE difference: {} - {} = {}\n".format(model_results_dict_new["RMSE"]["Test data"], model_results_dict_all["RMSE"]["Test data"], RMSE_diff))

# Keep the results of this experiment in their own dictionary under an explicit label.
# Writing them to results_dict2["PCA"] would silently replace the 10-component PCA results, and a
# re-run of the save cell would then export the 20-component scores under the column "PCA".
results_dict_further = {"All features": model_results_dict_all,
                        f"PCA ({pca_components_further} components)": model_results_dict_new}