In [1]:
import time
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

# Import helperfunctions
from ML_functions import fun_load_data, fun_preprocessing, fun_fit_tuning
from ML_functions import fun_convert_time
from ML_functions import fun_scaled_neg_MAPE, fun_scaled_neg_RMSE, fun_predict_with_scaling, fun_tuning_results, fun_scores

# Assign string 'TSP' or 'CVRP' to the following variable to define the optimization problem
optimization_problem = 'TSP'
# Share of the rows that goes into the train set (the remaining rows form the test set)
train_size = 0.7

# Load the data of the chosen optimization problem and split it into features X and target y
# (fun_load_data and fun_preprocessing are project helpers imported from ML_functions)
data = fun_load_data(optimization_problem)
X, y, train_data = fun_preprocessing(data)

# Create a train and test set (fixed random_state, so the split is reproducible)
# NOTE(review): this is a row-wise random split, while the scaling helpers further down group rows by 'Instance ID';
# rows of one instance can presumably end up in both sets - confirm that this is intended
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=42)

### **Scaling the train and test scores**

In [6]:
# Make predictions with a model, optionally rescale them per instance and compute the MAPE and RMSE for the train or test set
def fun_predict_with_scaling(model, X_train, y_train, X_predict, y_true, apply_scaling):
    """Predict for ``X_predict`` and score the (optionally rescaled) predictions against ``y_true``.

    Parameters
    ----------
    model : estimator offering ``predict`` and ``fit``
        Used as it is if ``predict`` succeeds (e.g. a grid search model after the tuning);
        otherwise it is fitted on ``X_train`` / ``y_train`` first.
    X_train, y_train : training data, only used if the model has to be fitted.
    X_predict : pd.DataFrame
        Rows to predict (usually X_test; passing X_train yields the train score).
        Must contain the column 'Instance ID' if ``apply_scaling`` is truthy.
    y_true : true labels of ``X_predict`` sharing its index. For the scaling they are merged
        on the index and read back as column 'Shapley Value'.
    apply_scaling : bool
        If truthy, rescale the predictions such that, per 'Instance ID', they sum to the sum
        of the true Shapley values of the rows in ``X_predict``.

    Returns
    -------
    tuple ``(MAPE_score, RMSE_score, y_pred, fit_time)``
        MAPE in percent (rounded to 4 decimals), RMSE (rounded to 4 decimals), the predictions
        as pd.Series and the fit time as returned by ``fun_convert_time`` (None if the model
        did not have to be fitted).
    """
    # If the model is already fitted, predict directly; otherwise fit it on the train data first.
    # 'except Exception' keeps this best-effort fallback for any estimator type, but - unlike a bare
    # 'except' - no longer swallows KeyboardInterrupt / SystemExit.
    try:
        y_pred = model.predict(X_predict)
        fit_time = None
    except Exception:
        start = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_predict)
        fit_time = fun_convert_time(start=start, end=time.time())

    # Attach the index of X_predict to the predictions (needed for the scaling and for the categorical scores later on)
    y_pred = pd.Series(data=y_pred, index=X_predict.index)

    # Improve predictions: sum of predicted Shapley values must be equal to the sum of true Shapley values for all instances
    if apply_scaling:

        # Only the instance ID is needed for the grouping, so merge just that column with the true labels
        scaling_df = pd.merge(left=X_predict[['Instance ID']], right=y_true, left_index=True, right_index=True)
        scaling_df = scaling_df.assign(Predictions=y_pred)

        # Sum of the predicted and of the true Shapley values per instance
        groups = scaling_df.groupby('Instance ID')
        sum_of_predictions = groups['Predictions'].transform('sum')
        sum_of_costs = groups['Shapley Value'].transform('sum')

        # Rescale each prediction with the ratio of both sums
        # (an instance whose predictions sum to zero yields inf/NaN here)
        y_pred = scaling_df['Predictions'] * (sum_of_costs / sum_of_predictions)

    # Compute errors. Convert to percent BEFORE rounding to avoid float noise such as 16.150100000000002;
    # take the square root explicitly because the 'squared' argument was removed from mean_squared_error
    MAPE_score = np.round(mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred) * 100, 4)
    RMSE_score = np.round(np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred)), 4)

    return MAPE_score, RMSE_score, y_pred, fit_time

In [7]:
# Compute train (cross-validation) scores and optionally test scores of a model
def fun_scores(model, X_train, y_train, X_test=None, y_test=None, apply_scaling=True, compute_test_scores=False):
    """Print and return the CV train scores of ``model`` and, on request, its test scores.

    Parameters
    ----------
    model : estimator or fitted search object
        If it has a ``best_score_`` attribute (e.g. a fitted grid search), that score is reported
        as CV MAPE; otherwise a 3-fold cross-validation is run on the train data.
    X_train, y_train : train data.
    X_test, y_test : test data; required if ``compute_test_scores`` is truthy.
    apply_scaling : bool
        Report the scores of the per-instance rescaled predictions (True) or of the original ones.
    compute_test_scores : bool
        Additionally compute test scores (overall and per instance size).

    Returns
    -------
    dict
        Without test scores: keys 'MAPE', 'RMSE', 'CV computation time'.
        With test scores: 'MAPE' and 'RMSE' are dicts with keys 'Train data' / 'Test data', plus
        'CV computation time', 'Fit model time' and 'Scores per instance size' (a data frame).

    Raises
    ------
    ValueError
        If test scores are requested but ``X_test`` or ``y_test`` is missing.
    """
    # Validate the arguments first, so a missing test set is reported before the (slow) cross-validation runs
    if compute_test_scores and ((X_test is None) or (y_test is None)):
        raise ValueError("You need to define X_test and y_test to compute the test scores.")

    score_type = 'scaled' if apply_scaling else 'original'

    # Get CV train scores of a grid search model
    if hasattr(model, 'best_score_'):
        # Convert to percent before rounding to avoid float noise in the printed value
        MAPE_train = np.round(-model.best_score_ * 100, 4)
        RMSE_train, cv_computation_time = None, None
        print('CV MAPE (scaled) train data:  {} %'.format(MAPE_train))

        # Show best parameter combination (the whole estimator only if it has few parameters)
        print('\nBest model / parameter combination:')
        if len(model.get_params()) <= 10:
            display(model.best_estimator_)
        else:
            display(model.best_params_)

    # Compute CV train scores if model is a usual estimator and measure CV computation time
    else:
        # Get MAPE and RMSE scores from the model's scaled predictions and unscaled predictions
        start = time.time()
        cv_scores = cross_validate(estimator=model, X=X_train, y=y_train, cv=3, n_jobs=-1,
                                   scoring={'scaled_mape': fun_scaled_neg_MAPE,
                                            'scaled_rmse': fun_scaled_neg_RMSE,
                                            'original_neg_mape': 'neg_mean_absolute_percentage_error',
                                            'original_neg_rmse': 'neg_root_mean_squared_error'})
        cv_computation_time = fun_convert_time(start=start, end=time.time())

        # Select the scores of either the scaled predictions or the unscaled predictions
        # (all scorers return negated errors, hence the sign flip)
        if apply_scaling:
            mape_key, rmse_key = 'test_scaled_mape', 'test_scaled_rmse'
        else:
            mape_key, rmse_key = 'test_original_neg_mape', 'test_original_neg_rmse'
        MAPE_train = np.round(-cv_scores[mape_key].mean() * 100, 4)
        RMSE_train = np.round(-cv_scores[rmse_key].mean(), 4)

        # Print train scores
        print('CV MAPE ({}) train data:  {} %'.format(score_type, MAPE_train))
        print('CV RMSE ({}) train data: {}'.format(score_type, RMSE_train))
        print('CV computation time:', cv_computation_time)

    # Without test scores only the train scores are returned
    if not compute_test_scores:
        return {'MAPE': MAPE_train, 'RMSE': RMSE_train, 'CV computation time': cv_computation_time}

    # Get MAPE and RMSE scores from the model's predictions on the test set and collect the scores
    MAPE_test, RMSE_test, y_pred, fit_time = fun_predict_with_scaling(model, X_train, y_train, X_test, y_test, apply_scaling)
    MAPE = {'Train data': MAPE_train, 'Test data': MAPE_test}
    RMSE = {'Train data': RMSE_train, 'Test data': RMSE_test}

    # Compute error measures for each instance size group individually:
    # group X_test by instance size and use the indices of each group to select the true y values and the predictions
    entities = 'Customers' if ('Number Customers' in X_train.columns) else 'Items' # Feature name in TSP and CVRP: 'Number Customers', Bin_Packing: 'Number Items'
    size_groups = X_test.groupby(by='Number ' + entities)
    MAPE_cat = size_groups.apply(lambda group: mean_absolute_percentage_error(y_true=y_test.loc[group.index], y_pred=y_pred.loc[group.index]))
    # Square root taken explicitly: the 'squared' argument was removed from mean_squared_error
    RMSE_cat = size_groups.apply(lambda group: np.sqrt(mean_squared_error(y_true=y_test.loc[group.index], y_pred=y_pred.loc[group.index])))

    # Round results and merge them into a data frame
    MAPE_cat = np.round(MAPE_cat * 100, 4)
    RMSE_cat = np.round(RMSE_cat, 4)
    df = pd.DataFrame(data=[MAPE_cat, RMSE_cat], index=['MAPE', 'RMSE'])
    df['Mean'] = [MAPE_test, RMSE_test]

    # Print results and show data frame of instance size groups
    print('\nMAPE ({}) test data:  {} %'.format(score_type, MAPE_test))
    print('RMSE ({}) test data: {}'.format(score_type, RMSE_test))
    if fit_time is not None:
        print('Model fit time:', fit_time)
    print('\nMAPE and RMSE on test data per instance size:')
    display(df)

    return {'MAPE': MAPE, 'RMSE': RMSE, 'CV computation time': cv_computation_time, 'Fit model time': fit_time, 'Scores per instance size': df}

In [3]:
# fun_scores and fun_predict_with_scaling are also imported from ML_functions in the first cell;
# on a top-to-bottom run the definitions in the cells above shadow those imported versions

# Create model
lr = LinearRegression()

# Estimate model performance with cross validation on the train set and compute the scores on the test set (scoring: MAPE and RMSE)
# apply_scaling is left at its default (True), so the scores of the rescaled predictions are reported
model_results_dict = fun_scores(model=lr, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, compute_test_scores=True)

CV MAPE (scaled) train data:  23.7618 %
CV RMSE (scaled) train data: 7.6303
CV computation time: 3s

MAPE (scaled) test data:  19.1927 %
RMSE (scaled) test data: 3.8665
Model fit time: 0s

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,14.3279,13.5322,13.4337,13.4527,15.2197,14.1552,17.9915,22.0098,36.5798,19.1927
RMSE,4.7701,3.9908,3.6785,3.0414,3.0327,2.9727,3.2028,4.0646,5.2378,3.8665


### **Scaling the cross-validation scores**

In [137]:
# Shared helper of the two scorers below: predict and rescale the predictions, such that the sum of all
# predictions per instance is equal to the sum of the Shapley values of that instance
def _fun_scale_predictions(estimator, X, y_true):
    """Return the rescaled predictions of ``estimator`` for ``X`` as a pd.Series.

    ``X`` must contain the column 'Instance ID'; ``y_true`` must share the index of ``X`` and is
    read back as column 'Shapley Value' after the merge. Each prediction is multiplied with
    (sum of true Shapley values of its instance) / (sum of predictions of its instance); an
    instance whose predictions sum to zero therefore yields inf/NaN.
    """
    # Make predictions
    y_pred = estimator.predict(X)

    # Only the instance ID is needed for the grouping, so merge just that column with the true labels;
    # then assign the predictions as a column to the data frame
    scaling_df = pd.merge(left=X[['Instance ID']], right=y_true, left_index=True, right_index=True)
    scaling_df = scaling_df.assign(Predictions=pd.Series(data=y_pred, index=X.index))

    # Compute the sum of predicted Shapley values and the sum of true Shapley values per instance
    groups = scaling_df.groupby('Instance ID')
    sum_of_predictions = groups['Predictions'].transform('sum')
    sum_of_costs = groups['Shapley Value'].transform('sum')

    # Compute new predictions
    return scaling_df['Predictions'] * (sum_of_costs / sum_of_predictions)

# Function to compute the scaled MAPE in a CV process
def fun_scaled_neg_MAPE(estimator, X, y_true):
    """Scorer with signature (estimator, X, y): negated MAPE (as a fraction, not in percent) of the rescaled predictions."""
    y_pred = _fun_scale_predictions(estimator, X, y_true)
    return - np.mean(np.abs((y_true - y_pred) / y_true))

# Function to compute the scaled RMSE in a CV process
def fun_scaled_neg_RMSE(estimator, X, y_true):
    """Scorer with signature (estimator, X, y): negated RMSE of the rescaled predictions."""
    y_pred = _fun_scale_predictions(estimator, X, y_true)
    return - np.sqrt(np.mean((y_true - y_pred)**2))

**1. Scaling during Cross-Validation**

In [3]:
# Cross-validate the linear regression with the custom scorers (rescaled predictions) and the
# built-in scorers (original predictions) in one run
cv_scores = cross_validate(estimator=lr, X=X_train, y=y_train, cv=3,
                           scoring={'scaled_neg_mape': fun_scaled_neg_MAPE,
                                    'scaled_neg_rmse': fun_scaled_neg_RMSE,
                                    'original_neg_mape': 'neg_mean_absolute_percentage_error',
                                    'original_neg_rmse': 'neg_root_mean_squared_error'},
                           n_jobs=1, verbose=False)

# All scorers return negated errors, hence the sign flip. Convert the MAPE to percent BEFORE rounding:
# rounding first and multiplying by 100 afterwards re-introduces float noise (e.g. 16.150100000000002)
MAPE1 = np.round(-cv_scores['test_scaled_neg_mape'].mean() * 100, 4)
RMSE1 = np.round(-cv_scores['test_scaled_neg_rmse'].mean(), 4)
MAPE2 = np.round(-cv_scores['test_original_neg_mape'].mean() * 100, 4)
RMSE2 = np.round(-cv_scores['test_original_neg_rmse'].mean(), 4)

# Print train scores
print('CV MAPE (scaled) train data:  {} %'.format(MAPE1))
print('CV RMSE (scaled) train data: ', RMSE1)
print('\nCV MAPE (original) train data:  {} %'.format(MAPE2))
print('CV RMSE (original) train data: ', RMSE2)

CV MAPE (scaled) train data:  13.9566 %
CV RMSE (scaled) train data:  2.963

CV MAPE (original) train data:  16.150100000000002 %
CV RMSE (original) train data:  3.2191


**2. Scaling during Hyperparameter Tuning with Cross-Validation**

In [4]:
# Grid search cross validation with data scaling
# The 'scaler' step is a placeholder (None) that the grid below replaces with the candidate scalers
pipe = Pipeline(steps=[('scaler', None), 
                       ('knn', KNeighborsRegressor())])

# Search space: two scalers x n_neighbors 7..11 (np.arange excludes the stop value) = 10 candidates
# NOTE(review): the recorded output of this cell reports 14 candidates and lists n_neighbors 5 and 6,
# which does not match this grid - the cell was presumably edited after its last run; re-run to refresh
param_grid = {'scaler': [StandardScaler(), MinMaxScaler()],
              'knn__n_neighbors': list(np.arange(start=7, stop=12))}

# Candidates are ranked by the negated MAPE of the per-instance rescaled predictions
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3,
                           scoring=fun_scaled_neg_MAPE, verbose=True, n_jobs=-1)
# fun_fit_tuning is a project helper; presumably it fits the search and stores the results under file_name - verify in ML_functions
fit_time = fun_fit_tuning(search_method=grid_search, X_train=X_train, y_train=y_train, file_name=optimization_problem + '_KNN')

# View results of grid search cross validation
model_results_dict = fun_scores(model=grid_search, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, compute_test_scores=True)

# View grid search CV scores of all parameter combinations
results_df = fun_tuning_results(search_method=grid_search, search_space=param_grid)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
Tuning fit time: 3m, 41s
CV MAPE (scaled) train data:  20.0617 %

Best model / parameter combination:


{'knn__n_neighbors': 10, 'scaler': StandardScaler()}


MAPE (scaled) test data:  20.2527 %
RMSE (scaled) test data: 3.8038

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,18.136,19.8731,20.5779,19.2701,21.5637,23.9496,18.9224,18.4711,20.7266,20.2527
RMSE,5.4058,4.6265,4.263,3.9368,3.9171,3.501,3.3223,3.1965,3.177,3.8038


Cross validation scores of different parameter combinations:


Unnamed: 0,param_scaler,param_knn__n_neighbors,mean_test_score,converted_mean_fit_time
0,StandardScaler(),10,-0.200617,0s
1,StandardScaler(),9,-0.200644,0s
2,StandardScaler(),8,-0.200801,0s
3,StandardScaler(),11,-0.201046,0s
4,StandardScaler(),7,-0.201471,0s
5,StandardScaler(),6,-0.202578,0s
6,StandardScaler(),5,-0.20528,0s
7,MinMaxScaler(),10,-0.211307,0s
8,MinMaxScaler(),11,-0.211429,0s
9,MinMaxScaler(),9,-0.211632,0s


In [5]:
# NOTE(review): this import sits in the middle of the notebook; consider moving it to the import cell at the top
from ML_functions import fun_load_best_params

# Load best parameters of the model
# (presumably the file written during the tuning in the previous cell - verify in ML_functions)
best_params = fun_load_best_params(optimization_problem + '_KNN_best_params.pkl')

# Create a pipeline and set best_params as parameters
# The 'scaler' step is a placeholder (None); set_params overrides whichever parameters best_params contains
pipe = Pipeline(steps=[('scaler', None), 
                       ('knn', KNeighborsRegressor())])
pipe.set_params(**best_params)

# Estimate model performance with cross-validation on the train set and get scores on test set (scoring: MAPE and RMSE)
model_results_dict = fun_scores(model=pipe, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, compute_test_scores=True)

{'knn__n_neighbors': 10, 'scaler': StandardScaler()}

CV MAPE (scaled) train data:  20.0617 %
CV RMSE (scaled) train data: 3.9645
CV computation time: 1m, 12s

MAPE (scaled) test data:  20.2527 %
RMSE (scaled) test data: 3.8038
Model fit time: 1m, 5s

MAPE and RMSE on test data per instance size:


Number Customers,6,7,8,9,10,11,12,13,14,Mean
MAPE,18.136,19.8731,20.5779,19.2701,21.5637,23.9496,18.9224,18.4711,20.7266,20.2527
RMSE,5.4058,4.6265,4.263,3.9368,3.9171,3.501,3.3223,3.1965,3.177,3.8038
