In [1]:
import os
import sys

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPRegressor

# Add the parent directory to the Python path to load functions from the file ML_functions
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
# Only register the directory once: re-running this cell must not stack
# duplicate entries on sys.path.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

# Import helper functions
from ML_functions import fun_load_data, fun_preprocessing, fun_fit_tuning, fun_load_best_params
from ML_functions import fun_convert_time
from ML_functions import fun_scaled_neg_MAPE, fun_tuning_results, fun_scores

# Name of the optimization problem whose data set is analysed; this notebook uses "BPP".
# NOTE(review): the earlier wording listed only "TSP" or "CVRP" as options — confirm which
# values fun_load_data accepts.
optimization_problem = "BPP"

# Load the data for the selected optimization problem
data = fun_load_data(optimization_problem)

# The train/test split happens inside the preprocessing helper (train_size=0.8);
# it also returns train_data, whose columns are inspected further below.
X_train, X_test, y_train, y_test, train_data = fun_preprocessing(data, train_size=0.8)

**Compute train score with all features: Neural Network - Multi-Layer Perceptron**

In [None]:
# Load the tuned parameters stored for the neural network.
# The returned value is replaced right below on purpose (see next comment).
best_params = fun_load_best_params(optimization_problem, model_abbreviation="NN")

# The solver "lbfgs" performed best but was very slow, so these tests use the
# best parameters found for the "adam" solver instead.
best_params = {
    "mlpregressor__alpha": 0.05,
    "mlpregressor__batch_size": "auto",
    # Alternative noted by the author: (100) — written as a tuple that would be (100,)
    "mlpregressor__hidden_layer_sizes": (100, 100),
    "mlpregressor__solver": "adam",
}

# Build the scaling + MLP pipeline and apply the chosen parameters in one go
pipe = make_pipeline(
    StandardScaler(),
    MLPRegressor(
        activation="relu",
        learning_rate="adaptive",
        max_iter=1000,
        shuffle=True,
        random_state=0,
    ),
).set_params(**best_params)

# Cross-validated baseline on the full train set (scoring: MAPE and RMSE)
model_results_dict_all = fun_scores(pipe, X_train, y_train)

# Cached result of a previous run, usable instead of recomputing:
#model_results_dict_all = {"MAPE": 4.8439, "RMSE": 0.0213}

{'mlpregressor__alpha': 0.5,
 'mlpregressor__batch_size': 'auto',
 'mlpregressor__hidden_layer_sizes': (100, 100),
 'mlpregressor__solver': 'lbfgs'}

CV MAPE (scaled) train data:  4.843999999999999 %
CV RMSE (scaled) train data: 0.0213
CV computation time: 28s


**Exclude feature categories**

In [10]:
# Show every column of the training data
display(train_data.columns)

# ---- Essential feature categories ----
instance_features = [
    "Instance ID", "Number Items", "Item ID",
    "Item Weight Ratio", "Item Size Ratio",
    "Bin Weight", "Bin Size",
]
weight_and_sum_features = [
    "Weight Size Sum Ratio",
    "Item Volume Ratio",
    "Item Density Ratio",
]
# "Shapley Value" is intentionally left out of the cost features.
# NOTE(review): an output further down in this notebook lists 'Marginal Costs/Bins Ratio'
# (with "Costs") — confirm the exact column name in the data.
cost_features = ["Marginal Cost/Bins Ratio", "Total Bins"]
statistical_features = [
    "Weight Sum", "Size Sum",
    "Weight Std", "Size Std",
    "Weight Max", "Size Max",
    "Weight Min", "Size Min",
    "Correlation",
    "Skewness Weight", "Skewness Size",
]

# ---- Potential feature categories (candidates for exclusion) ----
item_utilization_features = [
    "Item Bin Utilization Weight Ratio",
    "Item Bin Utilization Size Ratio",
    "Item Total Bin Utilization Ratio",
]
combination_features = [
    "Weight Bin Combinations Ratio",
    "Size Bin Combinations Ratio",
    "Total Bin Combinations Ratio",
]
perfect_combination_features = [
    "Perfect Weight Bin Combinations Ratio",
    "Perfect Size Bin Combinations Ratio",
    "Perfect Total Bin Combinations Ratio",
]
quantile_features = [
    "Weight Quantile Values Ratio",
    "Size Quantile Values Ratio",
]
percentile_features = [
    "25% Percentile Weight", "50% Percentile Weight", "75% Percentile Weight",
    "25% Percentile Size", "50% Percentile Size", "75% Percentile Size",
    "Weight / 25% Percentile Ratio", "Weight / 50% Percentile Ratio", "Weight / 75% Percentile Ratio",
    "Size / 25% Percentile Ratio", "Size / 50% Percentile Ratio", "Size / 75% Percentile Ratio",
]
final_utilization_features = [
    "Final Bin Utilization Weight",
    "Final Bin Utilization Size",
    "Final Total Bin Utilization",
]

# Category name -> feature list, for every potential category
feature_categories_dict = {
    "item_utilization_features": item_utilization_features,
    "combination_features": combination_features,
    "perfect_combination_features": perfect_combination_features,
    "quantile_features": quantile_features,
    "percentile_features": percentile_features,
    "final_utilization_features": final_utilization_features,
}

# Essential features first, then all potential categories in dictionary order
essential_features = instance_features + weight_and_sum_features + statistical_features + cost_features
all_features = essential_features + [
    feature for category in feature_categories_dict.values() for feature in category
]

Index(['Instance ID', 'Number Items', 'Item ID', 'Item Weight Ratio',
       'Item Size Ratio', 'Bin Weight', 'Bin Size', 'Weight Size Sum Ratio',
       'Item Volume Ratio', 'Item Density Ratio',
       'Item Bin Utilization Weight Ratio', 'Item Bin Utilization Size Ratio',
       'Item Total Bin Utilization Ratio', 'Weight Bin Combinations Ratio',
       'Size Bin Combinations Ratio', 'Total Bin Combinations Ratio',
       'Perfect Weight Bin Combinations Ratio',
       'Perfect Size Bin Combinations Ratio',
       'Perfect Total Bin Combinations Ratio', 'Weight Quantile Values Ratio',
       'Size Quantile Values Ratio', '25% Percentile Weight',
       '50% Percentile Weight', '75% Percentile Weight', '25% Percentile Size',
       '50% Percentile Size', '75% Percentile Size', 'Weight / 0% Percentile',
       'Weight / 25% Percentile Ratio', 'Weight / 50% Percentile Ratio',
       'Weight / 75% Percentile Ratio', 'Weight / 100% Percentile',
       'Size / 0% Percentile', 'Size / 25% 

In [5]:
# Leave out one potential feature category at a time and compare the
# cross-validation scores with the baseline stored in model_results_dict_all.
# NOTE(review): the baseline was scored on the complete X_train, while this loop
# starts from all_features — confirm both cover the same columns, otherwise the
# differences below also include the effect of columns missing from all_features.
for key, category_features in feature_categories_dict.items():
    print(f"############### Excluded feature category: {key} ###############")

    # Drop the category's features (and any " Ratio"-suffixed variants of them)
    excluded_features = list(category_features) + [name + " Ratio" for name in category_features]
    used_features = [name for name in all_features if name not in excluded_features]
    X_train_small = X_train[used_features]
    print("Number of excluded features:", len(category_features))
    print(f"Number of used features: {len(used_features)}\n")
    #display(used_features)

    # Cross-validated scores on the reduced train set (scoring: MAPE and RMSE)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Positive differences mean the reduced feature set scored worse than the baseline
    MAPE_diff = model_results_dict_new["MAPE"] - model_results_dict_all["MAPE"]
    RMSE_diff = model_results_dict_new["RMSE"] - model_results_dict_all["RMSE"]
    print(f"\nMAPE difference: {model_results_dict_new['MAPE']} - {model_results_dict_all['MAPE']} = {MAPE_diff} %")
    print(f"RMSE difference: {model_results_dict_new['RMSE']} - {model_results_dict_all['RMSE']} = {RMSE_diff}\n")

############### Excluded feature category: item_utilization_features ###############
Number of excluded features: 3
Number of used features: 46

CV MAPE (scaled) train data:  4.9765 %
CV RMSE (scaled) train data: 0.0217
CV computation time: 24s

MAPE difference: 4.9765 - 4.843999999999999 = 0.13250000000000028 %
RMSE difference: 0.0217 - 0.0213 = 0.00040000000000000105

############### Excluded feature category: combination_features ###############
Number of excluded features: 3
Number of used features: 46

CV MAPE (scaled) train data:  5.5963 %
CV RMSE (scaled) train data: 0.0243
CV computation time: 25s

MAPE difference: 5.5963 - 4.843999999999999 = 0.7523000000000009 %
RMSE difference: 0.0243 - 0.0213 = 0.002999999999999999

############### Excluded feature category: perfect_combination_features ###############
Number of excluded features: 3
Number of used features: 46

CV MAPE (scaled) train data:  5.0516 %
CV RMSE (scaled) train data: 0.022
CV computation time: 28s

MAPE differenc

**Add single features to the essential features**

In [12]:
# ---- Essential feature categories ----
instance_features = [
    "Instance ID", "Number Items", "Item ID",
    "Item Weight Ratio", "Item Size Ratio",
    "Bin Weight", "Bin Size",
]
weight_and_sum_features = [
    "Weight Size Sum Ratio",
    "Item Volume Ratio",
    "Item Density Ratio",
]
statistical_features = [
    "Weight Sum", "Size Sum",
    "Weight Std", "Size Std",
    "Weight Max", "Size Max",
    "Weight Min", "Size Min",
    "Correlation",
    "Skewness Weight", "Skewness Size",
]
# "Shapley Value" is intentionally left out of the cost features.
# NOTE(review): an output further down in this notebook lists 'Marginal Costs/Bins Ratio'
# (with "Costs") — confirm the exact column name in the data.
cost_features = ["Marginal Cost/Bins Ratio", "Total Bins"]
# The "Total" variants are treated as essential in this experiment
total_features = [
    "Item Total Bin Utilization Ratio",
    "Total Bin Combinations Ratio",
    "Perfect Total Bin Combinations Ratio",
    "Final Total Bin Utilization",
]

# ---- Potential feature categories (candidates to add) ----
item_utilization_features = [
    "Item Bin Utilization Weight Ratio",
    "Item Bin Utilization Size Ratio",
]
combination_features = [
    "Weight Bin Combinations Ratio",
    "Size Bin Combinations Ratio",
]
perfect_combination_features = [
    "Perfect Weight Bin Combinations Ratio",
    "Perfect Size Bin Combinations Ratio",
]
quantile_features = [
    "Weight Quantile Values Ratio",
    "Size Quantile Values Ratio",
]
percentile_features1 = [
    "25% Percentile Weight", "50% Percentile Weight", "75% Percentile Weight",
    "25% Percentile Size", "50% Percentile Size", "75% Percentile Size",
]
percentile_features2 = [
    "Weight / 25% Percentile Ratio", "Weight / 50% Percentile Ratio", "Weight / 75% Percentile Ratio",
    "Size / 25% Percentile Ratio", "Size / 50% Percentile Ratio", "Size / 75% Percentile Ratio",
]
final_utilization_features = [
    "Final Bin Utilization Weight",
    "Final Bin Utilization Size",
]

# Category name -> feature list, for every potential category
feature_categories_dict = {
    "item_utilization_features": item_utilization_features,
    "combination_features": combination_features,
    "perfect_combination_features": perfect_combination_features,
    "quantile_features": quantile_features,
    "percentile_features1": percentile_features1,
    "percentile_features2": percentile_features2,
    "final_utilization_features": final_utilization_features,
}

# Essential features first, then all potential categories in dictionary order
essential_features = (
    instance_features
    + weight_and_sum_features
    + statistical_features
    + cost_features
    + total_features
)
all_features = essential_features + [
    feature for category in feature_categories_dict.values() for feature in category
]

print("Number of essential features:", len(essential_features))

Number of essential features: 27


In [5]:
# Estimate model performance with cross-validation on the train set (scoring: MAPE and RMSE),
# using only the essential features. The result is the reference that the
# "added features" comparison below subtracts from.
X_train_small = X_train[essential_features]
model_results_dict_essential = fun_scores(pipe, X_train_small, y_train)

# Cached result of a previous run, usable instead of recomputing:
#model_results_dict_essential = {"MAPE": 5.115, "RMSE": 0.0229}

CV MAPE (scaled) train data:  5.115 %
CV RMSE (scaled) train data: 0.0229
CV computation time: 22s


In [6]:
# Candidate additions: each entry is either one feature name or a list of names
potential_features = [
    item_utilization_features,
    combination_features,
    perfect_combination_features,
    quantile_features,
    percentile_features1,
    percentile_features2,
    final_utilization_features,
]

# Add one candidate at a time to the essential features and report how the
# scores change relative to the essential-features-only run.
for added_features in potential_features:
    print(f"############### Added features: ###############\n {added_features}")

    # A single feature name (string) is wrapped in a list before concatenating
    used_features = essential_features + (
        added_features if isinstance(added_features, list) else [added_features]
    )
    X_train_small = X_train[used_features]
    print(f"Number of used features: {len(used_features)}\n")
    #display(used_features)

    # Cross-validated scores on the extended train set (scoring: MAPE and RMSE)
    model_results_dict_new = fun_scores(pipe, X_train_small, y_train)

    # Negative differences mean the added features improved on the essential-only scores
    MAPE_diff = model_results_dict_new["MAPE"] - model_results_dict_essential["MAPE"]
    RMSE_diff = model_results_dict_new["RMSE"] - model_results_dict_essential["RMSE"]
    print(f"\nMAPE difference: {model_results_dict_new['MAPE']} - {model_results_dict_essential['MAPE']} = {MAPE_diff} %")
    print(f"RMSE difference: {model_results_dict_new['RMSE']} - {model_results_dict_essential['RMSE']} = {RMSE_diff}\n")

############### Added features: ###############
 ['Item Bin Utilization Weight Ratio', 'Item Bin Utilization Size Ratio']
Number of used features: 29

CV MAPE (scaled) train data:  5.2522 %
CV RMSE (scaled) train data: 0.0232
CV computation time: 26s

MAPE difference: 5.2522 - 5.115 = 0.1372 %
RMSE difference: 0.0232 - 0.0229 = 0.0002999999999999982

############### Added features: ###############
 ['Weight Bin Combinations Ratio', 'Size Bin Combinations Ratio']
Number of used features: 29

CV MAPE (scaled) train data:  5.196 %
CV RMSE (scaled) train data: 0.0229
CV computation time: 25s

MAPE difference: 5.196 - 5.115 = 0.08099999999999952 %
RMSE difference: 0.0229 - 0.0229 = 0.0

############### Added features: ###############
 ['Perfect Weight Bin Combinations Ratio', 'Perfect Size Bin Combinations Ratio']
Number of used features: 29

CV MAPE (scaled) train data:  5.1047 %
CV RMSE (scaled) train data: 0.0225
CV computation time: 21s

MAPE difference: 5.1047 - 5.115 = -0.010299999999

**Update essential features**

In [14]:
# Add the perfect-combination and final-utilization features to the essential features.
# The list is rebuilt and de-duplicated (order preserved) instead of being extended
# in place with `+=`: re-running this cell would otherwise append the same names
# again and produce duplicate columns in X_train[essential_features].
essential_features = list(dict.fromkeys(
    essential_features + perfect_combination_features + final_utilization_features
))

# Estimate model performance with cross-validation on the train set (scoring: MAPE and RMSE)
X_train_small = X_train[essential_features]
model_results_dict_essential = fun_scores(pipe, X_train_small, y_train)

# Cached result of a previous run, usable instead of recomputing:
#model_results_dict_essential = {"MAPE": 4.9702, "RMSE": 0.022}

CV MAPE (scaled) train data:  4.9702 %
CV RMSE (scaled) train data: 0.022
CV computation time: 19s


In [16]:
# Report how many features are in the updated essential set and list them
print("Number of used features:", len(essential_features))
display(essential_features)

Number of used features: 31


['Instance ID',
 'Number Items',
 'Item ID',
 'Item Weight Ratio',
 'Item Size Ratio',
 'Bin Weight',
 'Bin Size',
 'Weight Size Sum Ratio',
 'Item Volume Ratio',
 'Item Density Ratio',
 'Weight Sum',
 'Size Sum',
 'Weight Std',
 'Size Std',
 'Weight Max',
 'Size Max',
 'Weight Min',
 'Size Min',
 'Correlation',
 'Skewness Weight',
 'Skewness Size',
 'Marginal Costs/Bins Ratio',
 'Total Bins',
 'Item Total Bin Utilization Ratio',
 'Total Bin Combinations Ratio',
 'Perfect Total Bin Combinations Ratio',
 'Final Total Bin Utilization',
 'Perfect Weight Bin Combinations Ratio',
 'Perfect Size Bin Combinations Ratio',
 'Final Bin Utilization Weight',
 'Final Bin Utilization Size']

In [None]:
# Features that may turn out to be unnecessary (candidates for later removal).
# The list is a bare expression: it is not assigned to any name.
["Perfect Weight Bin Combinations Ratio",
 "Perfect Size Bin Combinations Ratio",
 "Final Bin Utilization Weight",
 "Final Bin Utilization Size"]

In [17]:
# Collect the potential categories that were not added to the essential features
# and show the resulting list of dropped feature names.
dropped_features = item_utilization_features + combination_features + quantile_features + percentile_features1 + percentile_features2
display(dropped_features)

['Item Bin Utilization Weight Ratio',
 'Item Bin Utilization Size Ratio',
 'Weight Bin Combinations Ratio',
 'Size Bin Combinations Ratio',
 'Weight Quantile Values Ratio',
 'Size Quantile Values Ratio',
 '25% Percentile Weight',
 '50% Percentile Weight',
 '75% Percentile Weight',
 '25% Percentile Size',
 '50% Percentile Size',
 '75% Percentile Size',
 'Weight / 25% Percentile Ratio',
 'Weight / 50% Percentile Ratio',
 'Weight / 75% Percentile Ratio',
 'Size / 25% Percentile Ratio',
 'Size / 50% Percentile Ratio',
 'Size / 75% Percentile Ratio']