In [1]:
import time
import pickle
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Import helper functions
from ML_functions import fun_load_data, fun_save_file, fun_preprocessing
from ML_functions import fun_convert_time
from ML_functions import fun_scores

# Start the wall-clock timer; it is used for the total run time printed at the end
start_script = time.time()

# Read the stored run settings: routing problem ('TSP' or 'CVRP') and train-set fraction.
# NOTE(review): pickle.load can execute arbitrary code - only load a settings.pkl you created yourself.
with open('settings.pkl', 'rb') as file:
    settings = pickle.load(file)
optimization_problem = settings['optimization_problem']
train_size = settings['train_size']

# Manual override: assign 'TSP' or 'CVRP' (and the train-set fraction) by hand.
# NOTE(review): these two assignments replace the values just read from settings.pkl,
# so the stored settings currently have no effect - confirm that this is intended.
optimization_problem = 'TSP'
train_size = 0.75

# Load the data of the selected problem and preprocess it into X, y and train_data
data = fun_load_data(optimization_problem=optimization_problem)
X, y, train_data = fun_preprocessing(data)

# Split into a train and a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_size, random_state=0
)

# Much smaller train set (5 % of all rows) that is used by the SVM cells.
# NOTE(review): this split is drawn from the full X, y, not from X_train - verify that
# the small train set does not contain rows of X_test.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, train_size=0.05, random_state=0
)

# Number of features, the train-set shares as label strings, and the container for all model results.
# From here on train_size is a label string (e.g. '75 %') and no longer the float fraction.
n_features = X_train.shape[1]
train_size = '{} %'.format(int(np.round(100 * len(X_train) / len(X))))
train_size_s = '{} %'.format(int(np.round(100 * len(X_train_s) / len(X))))
results_dict = {}

# **KNN**

In [2]:
# Create a pipeline so that the data of each fold is scaled first during CV
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('knn', KNeighborsRegressor(n_neighbors=5)),
    ]
)

# 3-fold cross validation on the train set (fun_scores reports MAPE, RMSE and the computation time)
model_results_dict = fun_scores(
    model=pipe,
    X_train=X_train,
    y_train=y_train,
    cv=3,
)

# Attach the train-set share and store the scores under the model's name
model_results_dict['Train size'] = train_size
results_dict['KNN'] = model_results_dict

  CV MAPE train data:  25.806 %
  CV RMSE train data:  5.3892
  CV computation time: 12 sec


# **Linear Models**

**Linear Regression**

In [4]:
# Ordinary linear regression (no scaling step, no regularization)
lr = LinearRegression()

# 5-fold cross validation on the train set (fun_scores reports MAPE, RMSE and the computation time)
model_results_dict = fun_scores(model=lr, X_train=X_train, y_train=y_train, cv=5)

# Attach the train-set share and store the scores under the model's name
model_results_dict['Train size'] = train_size
results_dict['Linear Regression'] = model_results_dict

  CV MAPE train data:  16.1503 %
  CV RMSE train data:  3.2184
  CV computation time: 2 sec


**Ridge Regression (L2-Regularization)**

In [5]:
# Ridge model with the default penalty strength
# (alpha=0 would be a linear regression without regularization)
ridge = Ridge(alpha=1)

# 5-fold cross validation on the train set (fun_scores reports MAPE, RMSE and the computation time)
model_results_dict = fun_scores(model=ridge, X_train=X_train, y_train=y_train, cv=5)

# Attach the train-set share and store the scores under the model's name
model_results_dict['Train size'] = train_size
results_dict['Ridge Regression'] = model_results_dict

  CV MAPE train data:  16.148899999999998 %
  CV RMSE train data:  3.2184
  CV computation time: 2 sec


**Lasso Regression (L1-Regularization)**

In [6]:
# NOTE: This cell is disabled (everything below is commented out), so no
# 'Lasso Regression' entry is added to results_dict or to the comparison table.
# Uncomment the block to include the Lasso model in the comparison again.
# if (optimization_problem == 'TSP'):

#     # Create model with default parameters
#     lasso = Lasso(alpha=1, max_iter=10000) # Higher alpha means higher regularization and lower model complexity (less overfitting)

#     # Estimate model performance with cross validation on the train set (scoring: MAPE and RMSE)
#     model_results_dict = fun_scores(lasso, X_train, y_train, cv=2)

#     # Save results to dictionary
#     model_results_dict['Train size'] = train_size
#     results_dict['Lasso Regression'] = model_results_dict

# else: print('This cell is only executed for the TSP!')

**Support Vector Machine (SVM)**

In [48]:
if optimization_problem != 'TSP':
    print('This cell is only executed for the TSP!')
else:
    # Create a pipeline so that the data of each fold is scaled first during CV.
    # C is the regularization parameter of the support vector regressor:
    # the strength of the regularization is inversely proportional to C.
    pipe = Pipeline(
        steps=[
            ('scaler', StandardScaler()),
            ('SVM', SVR(kernel='linear', C=1)),
        ]
    )

    # 2-fold cross validation on the small SVM train set (fun_scores reports MAPE, RMSE and the computation time)
    model_results_dict = fun_scores(
        model=pipe,
        X_train=X_train_s,
        y_train=y_train_s,
        cv=2,
    )

    # Attach the share of the small train set and store the scores under the model's name
    model_results_dict['Train size'] = train_size_s
    results_dict['Linear SVM'] = model_results_dict

  CV MAPE train data:  16.4465 %
  CV RMSE train data:  3.3243
  CV computation time: 1 min, 7 sec


# **Decision Tree**

In [8]:
if optimization_problem == 'TSP':
    # Unrestricted regression tree: no depth or leaf limit; fixed seed for reproducibility
    tree = DecisionTreeRegressor(
        max_depth=None,
        max_leaf_nodes=None,
        min_samples_leaf=1,
        min_impurity_decrease=0,
        random_state=0,
    )

    # 3-fold cross validation on the train set (fun_scores reports MAPE, RMSE and the computation time)
    model_results_dict = fun_scores(model=tree, X_train=X_train, y_train=y_train, cv=3)

    # Attach the train-set share and store the scores under the model's name
    model_results_dict['Train size'] = train_size
    results_dict['Decision Tree'] = model_results_dict
else:
    print('This cell is only executed for the TSP!')

  CV MAPE train data:  11.0683 %
  CV RMSE train data:  3.7304
  CV computation time: 4 sec


# **Ensembles of Decision Trees**
**Random Forest**

In [9]:
# Forest of 100 unrestricted trees built on bootstrap samples;
# max_features=n_features lets every split consider all features, n_jobs=-1 uses all CPU cores
forest = RandomForestRegressor(
    n_estimators=100,
    max_features=n_features,
    max_depth=None,
    max_leaf_nodes=None,
    min_samples_leaf=1,
    min_impurity_decrease=0,
    random_state=0,
    bootstrap=True,
    n_jobs=-1,
)

# 3-fold cross validation on the train set (fun_scores reports MAPE, RMSE and the computation time)
model_results_dict = fun_scores(model=forest, X_train=X_train, y_train=y_train, cv=3)

# Attach the train-set share and store the scores under the model's name
model_results_dict['Train size'] = train_size
results_dict['Random Forest'] = model_results_dict

  CV MAPE train data:  6.6114999999999995 %
  CV RMSE train data:  2.0375
  CV computation time: 4 min, 24 sec


**Gradient Boosting Regression Trees**

In [10]:
if optimization_problem != 'TSP':
    print('This cell is only executed for the TSP!')
else:
    # 100 shallow trees (depth 3); a lower learning rate would require more trees
    gbrt = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        max_leaf_nodes=None,
        random_state=0,
    )

    # 3-fold cross validation on the train set (fun_scores reports MAPE, RMSE and the computation time)
    model_results_dict = fun_scores(model=gbrt, X_train=X_train, y_train=y_train, cv=3)

    # Attach the train-set share and store the scores under the model's name
    model_results_dict['Train size'] = train_size
    results_dict['Gradient Boosting Regression Tree'] = model_results_dict

  CV MAPE train data:  8.2129 %
  CV RMSE train data:  2.174
  CV computation time: 2 min, 31 sec


**Extreme Gradient Boosting: XGBoost-Package**

In [11]:
# XGBoost regressor with squared-error objective; the hyperparameters passed as None
# are left at the package defaults
xgboost = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=None,
    learning_rate=None,
    max_depth=None,
)

# 3-fold cross validation on the train set (fun_scores reports MAPE, RMSE and the computation time)
model_results_dict = fun_scores(model=xgboost, X_train=X_train, y_train=y_train, cv=3)

# Attach the train-set share and store the scores under the model's name
model_results_dict['Train size'] = train_size
results_dict['XGBoost'] = model_results_dict

  CV MAPE train data:  6.0082 %
  CV RMSE train data:  1.7806
  CV computation time: 6 sec


# **Support Vector Machines with Kernels - Kernel Machines**

**Gaussian Kernel**

In [27]:
if optimization_problem == 'TSP':
    # Create a pipeline so that the data of each fold is scaled first during CV;
    # the RBF kernel width gamma is set to 1 / number of features
    pipe = Pipeline(
        steps=[
            ('scaler', StandardScaler()),
            ('SVM', SVR(kernel='rbf', C=1, gamma=1/n_features)),
        ]
    )

    # 2-fold cross validation on the small SVM train set (fun_scores reports MAPE, RMSE and the computation time)
    model_results_dict = fun_scores(
        model=pipe,
        X_train=X_train_s,
        y_train=y_train_s,
        cv=2,
    )

    # Attach the share of the small train set and store the scores under the model's name
    model_results_dict['Train size'] = train_size_s
    results_dict['SVM Gaussian Kernel'] = model_results_dict
else:
    print('This cell is only executed for the TSP!')

  CV MAPE train data:  14.1731 %
  CV RMSE train data:  4.6833
  CV computation time: 30 sec


**Polynomial Kernel**

In [36]:
if optimization_problem != 'TSP':
    print('This cell is only executed for the TSP!')
else:
    # Create a pipeline so that the data of each fold is scaled first during CV;
    # polynomial kernel of degree 3 with gamma set to 1 / number of features
    pipe = Pipeline(
        steps=[
            ('scaler', StandardScaler()),
            ('SVM', SVR(kernel='poly', C=10, gamma=1/n_features, degree=3)),
        ]
    )

    # 2-fold cross validation on the small SVM train set (fun_scores reports MAPE, RMSE and the computation time)
    model_results_dict = fun_scores(
        model=pipe,
        X_train=X_train_s,
        y_train=y_train_s,
        cv=2,
    )

    # Attach the share of the small train set and store the scores under the model's name
    model_results_dict['Train size'] = train_size_s
    results_dict['SVM Polynomial Kernel'] = model_results_dict

  CV MAPE train data:  25.902199999999997 %
  CV RMSE train data:  5.3705
  CV computation time: 6 sec


# **Neural Network**

In [14]:
# Create a pipeline so that the data of each fold is scaled first during CV;
# the network has a single hidden layer with 100 ReLU units and is trained with Adam
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('MLP', MLPRegressor(
            hidden_layer_sizes=(100,),
            alpha=0.0001,
            activation='relu',
            solver='adam',
            max_iter=1000,
            random_state=0,
        )),
    ]
)

# 3-fold cross validation on the train set (fun_scores reports MAPE, RMSE and the computation time)
model_results_dict = fun_scores(
    model=pipe,
    X_train=X_train,
    y_train=y_train,
    cv=3,
)

# Attach the train-set share and store the scores under the model's name
model_results_dict['Train size'] = train_size
results_dict['Neural Network'] = model_results_dict

  CV MAPE train data:  4.0641 %
  CV RMSE train data:  0.9677
  CV computation time: 1 min, 19 sec


# **Compare Results**

In [15]:
# One column per model, columns ordered by ascending MAPE (lowest error first)
results_df = pd.DataFrame(results_dict).sort_values(by='MAPE', axis=1)
display(results_df)

# Persist the comparison table; the file name is prefixed with the routing problem
fun_save_file(
    data=results_df,
    subfolder_path='04_results',
    name=f'{optimization_problem}_untuned_models_train_scores.xlsx',
)

# Report the wall-clock time since start_script was set in the first cell
print('Total script computation time:', fun_convert_time(start=start_script, end=time.time()))

Unnamed: 0,Neural Network,XGBoost,Random Forest,Gradient Boosting Regression Tree,Decision Tree,Linear SVM,Ridge Regression,Linear Regression,KNN,SVM Gaussian Kernel,SVM Polynomial Kernel
MAPE,4.0641,6.0082,6.6115,8.2129,11.0683,15.9746,16.1489,16.1503,25.806,27.2515,66.328
RMSE,0.9677,1.7806,2.0375,2.174,3.7304,3.31,3.2184,3.2184,5.3892,8.4212,11.758
CV computation time,"1 min, 19 sec",6 sec,"4 min, 24 sec","2 min, 31 sec",4 sec,1 sec,2 sec,2 sec,12 sec,1 sec,1 sec
Train size,75 %,75 %,75 %,75 %,75 %,5 %,75 %,75 %,75 %,5 %,5 %


Total script computation time: 9 min, 35 sec
