In [2]:
import time
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Import helperfunctions
from helperfunctions_models import fun_load_file, fun_preprocessing, fun_split_X_y
from helperfunctions_models import fun_convert_time
from helperfunctions_models import fun_train_score

# Start the wall-clock timer for the whole notebook, then read and prepare the data
start_script = time.time()
data = fun_load_file(
    subfolder_path='..\\01_data\\01_TSP',
    name='tsp_instances_j_updated.xlsx',
)
train_data = fun_preprocessing(data)
X, y = fun_split_X_y(train_data)

# Hold out 25 % of the samples as test set (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Much smaller train set (10 % of the samples) that is used by the SVM cells
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    X, y, test_size=0.9, random_state=0
)

# Number of feature columns, and both train-set shares formatted as percentage strings
n_features = X_train.shape[1]
train_size, train_size_small = (
    f'{int(np.round(100 * len(subset) / len(X)))} %'
    for subset in (X_train, X_train_small)
)

# **KNN**

In [2]:
# k-nearest-neighbours regressor with k=5
knn = KNeighborsRegressor(n_neighbors=5)

# 10-fold cross validation on the train set; the helper returns MAPE, RMSE and the time needed
MAPE, RMSE, computation_time = fun_train_score(
    knn, X_train, y_train, cv=10, return_results=True
)

# Start the results collection with the KNN scores as its first entry
results_dict = {
    'KNN': {
        'MAPE': MAPE,
        'RMSE': RMSE,
        'CV computation time': computation_time,
        'Train size': train_size,
    },
}

  CV MAPE train data:  52.525999999999996 %
  CV RMSE train data:  12.0287
  CV computation time: 14 sec


# **Linear Models**

**Linear Regression**

In [3]:
# Plain least-squares linear regression
lr = LinearRegression()

# 10-fold cross validation on the train set; the helper returns MAPE, RMSE and the time needed
MAPE, RMSE, computation_time = fun_train_score(
    lr, X_train, y_train, cv=10, return_results=True
)

# Add the scores to the results collection
results_dict['Linear Regression'] = {
    'MAPE': MAPE,
    'RMSE': RMSE,
    'CV computation time': computation_time,
    'Train size': train_size,
}

  CV MAPE train data:  16.2226 %
  CV RMSE train data:  3.268
  CV computation time: 2 sec


**Ridge Regression (L2-Regularization)**

In [4]:
# L2-regularized linear model; alpha=0 would be linear regression without regularization
ridge = Ridge(alpha=1)

# 10-fold cross validation on the train set; the helper returns MAPE, RMSE and the time needed
MAPE, RMSE, computation_time = fun_train_score(
    ridge, X_train, y_train, cv=10, return_results=True
)

# Add the scores to the results collection
results_dict['Ridge Regression'] = {
    'MAPE': MAPE,
    'RMSE': RMSE,
    'CV computation time': computation_time,
    'Train size': train_size,
}

  CV MAPE train data:  16.2208 %
  CV RMSE train data:  3.268
  CV computation time: 0 sec


**Lasso Regression (L1-Regularization)**

In [5]:
# L1-regularized linear model. A higher alpha means stronger regularization and
# therefore lower model complexity (less overfitting); up to 10000 solver iterations.
lasso = Lasso(alpha=1, max_iter=10000)

# 10-fold cross validation on the train set; the helper returns MAPE, RMSE and the time needed
MAPE, RMSE, computation_time = fun_train_score(
    lasso, X_train, y_train, cv=10, return_results=True
)

# Add the scores to the results collection
results_dict['Lasso Regression'] = {
    'MAPE': MAPE,
    'RMSE': RMSE,
    'CV computation time': computation_time,
    'Train size': train_size,
}

  CV MAPE train data:  20.2219 %
  CV RMSE train data:  4.3451
  CV computation time: 2 sec


**Support Vector Machine (SVM)**

In [6]:
# Compare the scaling methods and keep the scores of the best one (lowest CV MAPE).
# The "best" values are initialised here so that the cell never reads values left
# over from another cell; they stay inf/None if no run yields a comparable MAPE.
best_MAPE = np.inf
best_RMSE = None
best_computation_time = None
for scaler in [StandardScaler(), MinMaxScaler(), RobustScaler()]:

    # Scale the (small) train set first
    # NOTE(review): the scaler is fitted on the complete small train set before
    # fun_train_score runs its cv=3 evaluation, so the validation folds presumably
    # influence the scaling statistics. Consider passing a Pipeline instead - confirm
    # that the helper accepts one.
    X_train_scaled = scaler.fit_transform(X_train_small)

    # Support vector regressor with linear kernel. C is the regularization parameter:
    # the strength of the regularization is inversely proportional to C.
    svm = SVR(kernel='linear', C=1)

    # Estimate model performance with cross validation on the train set (scoring: MAPE and RMSE)
    print('Method:', scaler)
    MAPE, RMSE, computation_time = fun_train_score(svm, X_train_scaled, y_train_small, cv=3, return_results=True)

    # Remember the scores of the best scaling method seen so far
    if MAPE < best_MAPE:
        best_MAPE = MAPE
        best_RMSE = RMSE
        best_computation_time = computation_time

# Save the scores of the best scaling method (not those of the last loop iteration),
# so that MAPE, RMSE and computation time all belong to the same run
results_dict['Linear SVM'] = {
    'MAPE': best_MAPE,
    'RMSE': best_RMSE,
    'CV computation time': best_computation_time,
    'Train size': train_size_small,
}

Method: StandardScaler()
  CV MAPE train data:  16.4337 %
  CV RMSE train data:  3.367
  CV computation time: 2 min, 37 sec
Method: MinMaxScaler()
  CV MAPE train data:  16.6138 %
  CV RMSE train data:  3.3881
  CV computation time: 1 min, 1 sec
Method: RobustScaler()
  CV MAPE train data:  16.484299999999998 %
  CV RMSE train data:  3.4087
  CV computation time: 4 min, 43 sec


# **Decision Tree**

In [7]:
# Unrestricted regression tree (no depth or leaf limits) with a fixed random seed
tree = DecisionTreeRegressor(
    max_depth=None,
    max_leaf_nodes=None,
    min_samples_leaf=1,
    min_impurity_decrease=0,
    random_state=0,
)

# 10-fold cross validation on the train set; the helper returns MAPE, RMSE and the time needed
MAPE, RMSE, computation_time = fun_train_score(
    tree, X_train, y_train, cv=10, return_results=True
)

# Add the scores to the results collection
results_dict['Decision Tree'] = {
    'MAPE': MAPE,
    'RMSE': RMSE,
    'CV computation time': computation_time,
    'Train size': train_size,
}

  CV MAPE train data:  10.7706 %
  CV RMSE train data:  3.6624
  CV computation time: 11 sec


# **Ensembles of Decision Trees**
**Random Forest**

In [8]:
# Forest of 100 unrestricted, bootstrapped trees; every split may consider all
# features, and all available CPU cores are used (n_jobs=-1)
forest = RandomForestRegressor(
    n_estimators=100,
    max_features=n_features,
    max_depth=None,
    max_leaf_nodes=None,
    min_samples_leaf=1,
    min_impurity_decrease=0,
    random_state=0,
    bootstrap=True,
    n_jobs=-1,
)

# 10-fold cross validation on the train set; the helper returns MAPE, RMSE and the time needed
MAPE, RMSE, computation_time = fun_train_score(
    forest, X_train, y_train, cv=10, return_results=True
)

# Add the scores to the results collection
results_dict['Random Forest'] = {
    'MAPE': MAPE,
    'RMSE': RMSE,
    'CV computation time': computation_time,
    'Train size': train_size,
}

  CV MAPE train data:  6.4811 %
  CV RMSE train data:  2.0806
  CV computation time: 15 min, 35 sec


**Gradient Boosting Regression Trees**

In [9]:
# Boosted ensemble of 100 shallow trees (depth 3); a lower learning rate would
# require more trees
gbrt = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    max_leaf_nodes=None,
    random_state=0,
)

# 10-fold cross validation on the train set; the helper returns MAPE, RMSE and the time needed
MAPE, RMSE, computation_time = fun_train_score(
    gbrt, X_train, y_train, cv=10, return_results=True
)

# Add the scores to the results collection
results_dict['Gradient Boosting Regression Tree'] = {
    'MAPE': MAPE,
    'RMSE': RMSE,
    'CV computation time': computation_time,
    'Train size': train_size,
}

  CV MAPE train data:  8.390400000000001 %
  CV RMSE train data:  2.2089
  CV computation time: 4 min, 42 sec


**Extreme Gradient Boosting: XGBoost-Package**

In [10]:
# XGBoost regressor with squared-error objective; the tuning parameters are passed
# as None, i.e. no explicit value is chosen for them here
xgboost = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=None,
    learning_rate=None,
    max_depth=None,
)

# 10-fold cross validation on the train set; the helper returns MAPE, RMSE and the time needed
MAPE, RMSE, computation_time = fun_train_score(
    xgboost, X_train, y_train, cv=10, return_results=True
)

# Add the scores to the results collection
results_dict['XGBoost'] = {
    'MAPE': MAPE,
    'RMSE': RMSE,
    'CV computation time': computation_time,
    'Train size': train_size,
}

  CV MAPE train data:  5.8328999999999995 %
  CV RMSE train data:  1.7093
  CV computation time: 12 sec


# **Support Vector Machines with Kernels - Kernel Machines**

**Gaussian Kernel**

In [11]:
# Compare the scaling methods and keep the scores of the best one (lowest CV MAPE).
# The "best" values are initialised here so that the cell never reads values left
# over from another cell; they stay inf/None if no run yields a comparable MAPE.
best_MAPE = np.inf
best_RMSE = None
best_computation_time = None
for scaler in [StandardScaler(), MinMaxScaler(), RobustScaler()]:

    # Scale the (small) train set first
    # NOTE(review): the scaler is fitted on the complete small train set before
    # fun_train_score runs its cv=3 evaluation, so the validation folds presumably
    # influence the scaling statistics. Consider passing a Pipeline instead - confirm
    # that the helper accepts one.
    X_train_scaled = scaler.fit_transform(X_train_small)

    # Support vector regressor with Gaussian (RBF) kernel and gamma = 1 / n_features
    svm = SVR(kernel='rbf', C=1, gamma=1/n_features)

    # Estimate model performance with cross validation on the train set (scoring: MAPE and RMSE)
    print('Method:', scaler)
    MAPE, RMSE, computation_time = fun_train_score(svm, X_train_scaled, y_train_small, cv=3, return_results=True)

    # Remember the scores of the best scaling method seen so far
    if MAPE < best_MAPE:
        best_MAPE = MAPE
        best_RMSE = RMSE
        best_computation_time = computation_time

# Save the scores of the best scaling method (not those of the last loop iteration),
# so that MAPE, RMSE and computation time all belong to the same run
results_dict['SVM Gaussian Kernel'] = {
    'MAPE': best_MAPE,
    'RMSE': best_RMSE,
    'CV computation time': best_computation_time,
    'Train size': train_size_small,
}

Method: StandardScaler()
  CV MAPE train data:  11.7727 %
  CV RMSE train data:  3.8696
  CV computation time: 1 min, 33 sec
Method: MinMaxScaler()
  CV MAPE train data:  19.5699 %
  CV RMSE train data:  4.1749
  CV computation time: 1 min, 33 sec
Method: RobustScaler()
  CV MAPE train data:  13.3898 %
  CV RMSE train data:  3.9637
  CV computation time: 1 min, 41 sec


**Polynomial Kernel**

In [3]:
# Compare the scaling methods and keep the scores of the best one (lowest CV MAPE).
# The "best" values are initialised here so that the cell never reads values left
# over from another cell; they stay inf/None if no run yields a comparable MAPE.
# Only StandardScaler and MinMaxScaler are compared in this cell.
best_MAPE = np.inf
best_RMSE = None
best_computation_time = None
for scaler in [StandardScaler(), MinMaxScaler()]:

    # Scale the (small) train set first
    # NOTE(review): the scaler is fitted on the complete small train set before
    # fun_train_score runs its cv=3 evaluation, so the validation folds presumably
    # influence the scaling statistics. Consider passing a Pipeline instead - confirm
    # that the helper accepts one.
    X_train_scaled = scaler.fit_transform(X_train_small)

    # Support vector regressor with polynomial kernel of degree 3 and gamma = 1 / n_features
    svm = SVR(kernel='poly', C=1, gamma=1/n_features, degree=3)

    # Estimate model performance with cross validation on the train set (scoring: MAPE and RMSE)
    print('Method:', scaler)
    MAPE, RMSE, computation_time = fun_train_score(svm, X_train_scaled, y_train_small, cv=3, return_results=True)

    # Remember the scores of the best scaling method seen so far
    if MAPE < best_MAPE:
        best_MAPE = MAPE
        best_RMSE = RMSE
        best_computation_time = computation_time

# Save the scores of the best scaling method (not those of the last loop iteration),
# so that MAPE, RMSE and computation time all belong to the same run
results_dict['SVM Polynomial Kernel'] = {
    'MAPE': best_MAPE,
    'RMSE': best_RMSE,
    'CV computation time': best_computation_time,
    'Train size': train_size_small,
}

Method: StandardScaler()
  CV MAPE train data:  27.285999999999998 %
  CV RMSE train data:  6.0196
  CV computation time: 5 sec
Method: MinMaxScaler()
  CV MAPE train data:  74.039 %
  CV RMSE train data:  17.7553
  CV computation time: 3 sec
Method: RobustScaler()


# **Neural Network**

In [3]:
# Compare the scaling methods and keep the scores of the best one (lowest CV MAPE).
# The "best" values are initialised here so that the cell never reads values left
# over from another cell; they stay inf/None if no run yields a comparable MAPE.
best_MAPE = np.inf
best_RMSE = None
best_computation_time = None
for scaler in [StandardScaler(), MinMaxScaler(), RobustScaler()]:

    # Scale the train set first
    # NOTE(review): the scaler is fitted on the complete train set before
    # fun_train_score runs its cv=3 evaluation, so the validation folds presumably
    # influence the scaling statistics. Consider passing a Pipeline instead - confirm
    # that the helper accepts one.
    X_train_scaled = scaler.fit_transform(X_train)

    # Multi-layer perceptron with one hidden layer of 100 ReLU units, trained with
    # the adam solver for at most 1000 iterations; fixed seed for repeatable results
    mlp = MLPRegressor(hidden_layer_sizes=(100,), alpha=0.0001,
                       activation='relu', solver='adam', max_iter=1000, random_state=0)

    # Estimate model performance with cross validation on the train set (scoring: MAPE and RMSE)
    print('Method:', scaler)
    MAPE, RMSE, computation_time = fun_train_score(mlp, X_train_scaled, y_train, cv=3, return_results=True)

    # Remember the scores of the best scaling method seen so far
    if MAPE < best_MAPE:
        best_MAPE = MAPE
        best_RMSE = RMSE
        best_computation_time = computation_time

# Save the scores of the best scaling method (not those of the last loop iteration),
# so that MAPE, RMSE and computation time all belong to the same run
results_dict['Neural Network'] = {
    'MAPE': best_MAPE,
    'RMSE': best_RMSE,
    'CV computation time': best_computation_time,
    'Train size': train_size,
}

Method: StandardScaler()
  CV MAPE train data:  3.8944 %
  CV RMSE train data:  0.941
  CV computation time: 55 sec
Method: MinMaxScaler()


# **Compare Results**

In [None]:
# One column per model, ordered from lowest to highest MAPE
results_table = pd.DataFrame(results_dict)
display(results_table.sort_values(by='MAPE', axis=1))

# Report the time elapsed since the timer was started in the first cell
total_runtime = fun_convert_time(start=start_script, end=time.time())
print('Total script computation time:', total_runtime)

Unnamed: 0,Neural Network,XGBoost,Random Forest,Gradient Boosting Regression Tree,SVM Gaussian Kernel,Decision Tree,Linear Regression,Ridge Regression,Linear SVM,Lasso Regression,SVM Polynomial Kernel,KNN
MAPE,3.8686,5.8153,6.6329,8.5405,9.2823,10.8992,16.6011,16.6023,16.8364,19.7085,33.5229,51.7483
RMSE,0.9121,1.6959,2.1148,2.2269,2.8699,3.6824,3.293,3.2931,3.3775,4.1037,7.3372,12.4712
CV computation time,39 sec,10 sec,"12 min, 55 sec","4 min, 0 sec","1 min, 34 sec",10 sec,2 sec,0 sec,"1 min, 58 sec",3 sec,"1 min, 7 sec",12 sec
Train size,75 %,75 %,75 %,75 %,40 %,75 %,75 %,75 %,40 %,75 %,40 %,75 %


Total script computation time: 43 min, 19 sec
