# Evaluation Of All Models

In [1]:
import sys
import os
sys.path.append("../")

In [2]:
from utils import display_Results
from models.read_processed_data_csv import get_train_test_data

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor

In [11]:
data_processed_dir = "../../../Data/Data HCM Thay Hien Processed/"
feature_type = "Sensor"

In [12]:
if feature_type == "Sensor":
        data_folder_name = "Sensor Features"
        
        train_data_standardized_name = "train_data_sensor_standardized.csv"
        train_labels_name  = "train_label_sensor.csv"
        test_data_standardized_name = "test_data_sensor_standardized.csv"
        test_labels_name = "test_label_sensor.csv"
        
elif feature_type == "Images":
    data_folder_name = "Image Features"

    train_data_standardized_name = "train_data_image_standardized.csv"
    train_labels_name = "train_label_image.csv"
    test_data_standardized_name = "test_data_image_standardized.csv"
    test_labels_name = "test_label_image.csv"

elif feature_type == "Combined":
    data_folder_name = "Combined Features"

    train_data_standardized_name = "train_data_combined_standardized.csv"
    train_labels_name = "train_label_combined.csv"
    test_data_standardized_name = "test_data_combined_standardized.csv"
    test_labels_name = "test_label_combined.csv"

elif feature_type == "Combined+GlobalWeather":
    data_folder_name = "Combined Features + Global Weather"

    train_data_standardized_name = "train_data_combined_global_weather_standardized.csv"
    train_labels_name = "train_label_combined_global_weather.csv"
    test_data_standardized_name = "test_data_combined_global_weather_standardized.csv"
    test_labels_name = "test_label_combined_global_weather.csv"

elif feature_type == "Sensor+GlobalWeather":
    data_folder_name = "Sensor Features + Global Weather"

    train_data_standardized_name = "train_data_sensor_global_weather_standardized.csv"
    train_labels_name = "train_label_sensor_global_weather.csv"
    test_data_standardized_name = "test_data_sensor_global_weather_standardized.csv"
    test_labels_name = "test_label_sensor_global_weather.csv"

In [13]:
random_split_dir = os.path.join(data_processed_dir, data_folder_name, 'Random split')
X_train_random_split, y_train_random_split, X_test_random_split, y_test_random_split = get_train_test_data(random_split_dir, train_data_standardized_name, train_labels_name, test_data_standardized_name, test_labels_name)

In [45]:
models_map = {
         "Linear Regression" : LinearRegression,
         "SVM": SVR,
         "Decision Tree": DecisionTreeRegressor,
         "Random Forest": RandomForestRegressor,
         "Extra Trees": ExtraTreesRegressor,
         "Neural Network": MLPRegressor,
         "Catboost": CatBoostRegressor,
         "XGBoost": XGBRegressor
        }
    
LR_params = {
                'fit_intercept': True,
                'normalize': False,
                'copy_X': True,
                'n_jobs': None
            }
SVR_params = {
                'kernel': 'rbf',
                'degree': 3,
                'gamma': 'scale',
                'coef0': 0.0,
                'tol': 1e-3,
                'C': 100,
                'epsilon': 0.5,
                'shrinking': True,
                'cache_size': 200,
                'verbose': False,
                'max_iter': -1
             }
DecTree_params = {  
                    'criterion': 'mse',
                    'splitter': 'best',
                    'max_depth': 30,
                    'min_samples_split': 16,
                    'min_samples_leaf': 1,
                    'min_weight_fraction_leaf':0.0,
                    'max_features': "sqrt",
                    'random_state': 24,
                    'max_leaf_nodes': None,
                    'min_impurity_decrease': 0,
                    'ccp_alpha': 0.0
                  }
RF_params = { 
                'n_estimators': 1000,
                'criterion': 'mse',
                'max_depth': 20,
                'min_samples_split': 4,
                'min_samples_leaf': 1,
                'min_weight_fraction_leaf':0.0,
                'max_features': 'sqrt',
                'max_leaf_nodes': None,
                'min_impurity_decrease': 0.0,
                'bootstrap': True,
                'oob_score': True,
                'n_jobs': None,
                'random_state': 24,
                'verbose': 0,
                'warm_start': False,
                'ccp_alpha': 0.0
            }

ExTree_params = {
                    'n_estimators': 1000,
                    'criterion': 'mse',
                    'max_depth': 20,
                    'min_samples_split': 2,
                    'min_samples_leaf': 1,
                    'min_weight_fraction_leaf': 0.0,
                    'max_features': 'auto',
                    'max_leaf_nodes': None,
                    'min_impurity_decrease': 0.0,
                    'bootstrap': True,
                    'oob_score': True,
                    'n_jobs': None,
                    'random_state': 24,
                    'verbose': 0,
                    'warm_start': False,
                    'ccp_alpha': 0.0,
                    'max_samples': None,
                }

NN_params = {
                'hidden_layer_sizes': (10, 5, 3),
                'activation': 'relu',
                'solver': 'adam',
                'alpha': 0.0001,
                'batch_size': 'auto',
                'learning_rate_init': 0.05,
                'learning_rate': 'invscaling',
                'power_t': 0.5,
                'max_iter': 2000,
                'shuffle': True,
                'random_state': 24,
                'tol': 1e-4,
                'verbose': False,
                'warm_start': False,
                'momentum': 0.9,
                'nesterovs_momentum': True,
                'early_stopping': True,
                'validation_fraction': 0.1,
                'beta_1': 0.9,
                'beta_2': 0.999,
                'epsilon': 1e-8,
                'n_iter_no_change': 50,
                'max_fun': 15000
            }

CB_params = {
                'loss_function': 'RMSE',
                'n_estimators': 1000,
                'learning_rate': 0.03,
                'depth': 10,
                'l2_leaf_reg': 3.0,
                'eval_metric': 'RMSE',
                'random_seed': 24,
                'verbose': 0,
            }

XGB_params = {
                'objective': 'reg:squarederror',
                'n_estimators': 1000,
                "learning_rate": 0.05,
                'max_depth': 10,
                'min_child_weight': 1,
                'verbosity': 0,
                'random_state': 24   
             }

params_map = {
                "Linear Regression" : LR_params,
                 "SVM": SVR_params,
                 "Decision Tree": DecTree_params,
                 "Random Forest": RF_params,
                 "Extra Trees": ExTree_params,
                 "Neural Network": NN_params,
                 "Catboost": CB_params,
                 "XGBoost": XGB_params
             }

In [33]:
from sklearn.model_selection import cross_validate
scoring = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']

## Linear Regression

In [36]:
# Load model and evaluate

model_choice = "Linear Regression"
params = params_map[model_choice]
model = models_map[model_choice](**params)
model_name = model_choice

print("Fitting model {}...".format(model_choice))
model.fit(X_train_random_split, y_train_random_split.pm25)
print("Done")

print("Performing cross-validation...")
scores = cross_validate(model, X_train_random_split, y_train_random_split.pm25, cv=5, scoring=scoring)
print("Done")

print("Mean R2: {:.2f} (+/- {:.2f})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(scores['test_neg_root_mean_squared_error'].mean(), scores['test_neg_root_mean_squared_error'].std()))
print("Mean MAE: {:.2f} (+/- {:.2f})".format(scores['test_neg_mean_absolute_error'].mean(), scores['test_neg_mean_absolute_error'].std()))

print("")
print("Evaluating with Hold out Test set...")
preds = model.predict(X_test_random_split)
print("Done!")

print("Showing evaluation metrics results")
display_Results(y_test_random_split, preds, writeFile = False)


Fitting model Linear Regression...
Done
Performing cross-validation...
Done
Mean R2: 0.26 (+/- 0.07)
Mean RMSE: -74.33 (+/- 3.29)
Mean MAE: -56.32 (+/- 0.63)

Evaluating with Hold out Test set...
Done!
Showing evaluation metrics results
Regression Score for PM2.5 values
RMSE: 72.15200879812033
MAE: 56.15985303663634
R2 score: 0.2967873049333023
Regression Score for AQI values
RMSE: 71.4523265420677
MAE: 50.28756086800994
R2 score: 0.20482118419254614
Classification score for AQI rank
Accuracy Score: 52.43%
F1 score: 47.96%
Confusion Matrix:
[[   0    0    0  171  146    0]
 [   0    0    0    0    3    0]
 [   0    0    0    0    3    0]
 [   0    0    0  319 1306  153]
 [   0    0    0   27 3027  280]
 [   0    0    0    0 2124 1297]]


## Support Vector Machine

In [40]:
# Load model and evaluate

model_choice = "SVM"
params = params_map[model_choice]
model = models_map[model_choice](**params)
model_name = model_choice

print("Fitting model {}...".format(model_choice))
model.fit(X_train_random_split, y_train_random_split.pm25)
print("Done")

print("Performing cross-validation...")
scores = cross_validate(model, X_train_random_split, y_train_random_split.pm25, cv=5, scoring=scoring)
print("Done")

print("Mean R2: {:.2f} (+/- {:.2f})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(scores['test_neg_root_mean_squared_error'].mean(), scores['test_neg_root_mean_squared_error'].std()))
print("Mean MAE: {:.2f} (+/- {:.2f})".format(scores['test_neg_mean_absolute_error'].mean(), scores['test_neg_mean_absolute_error'].std()))

print("")
print("Evaluating with Hold out Test set...")
preds = model.predict(X_test_random_split)
print("Done!")

print("Showing evaluation metrics results")
display_Results(y_test_random_split, preds, writeFile = False)


Fitting model SVM...
Done
Performing cross-validation...
Done
Mean R2: 0.75 (+/- 0.01)
Mean RMSE: -43.37 (+/- 0.47)
Mean MAE: -28.30 (+/- 0.24)

Evaluating with Hold out Test set...
Done!
Showing evaluation metrics results
Regression Score for PM2.5 values
RMSE: 42.854324449697174
MAE: 28.021570883969964
R2 score: 0.7519271961288309
Regression Score for AQI values
RMSE: 42.27567022482552
MAE: 25.904603741313764
R2 score: 0.721636008074015
Classification score for AQI rank
Accuracy Score: 71.78%
F1 score: 72.27%
Confusion Matrix:
[[ 251   56   10    0    0    0]
 [   3    0    0    0    0    0]
 [   3    0    0    0    0    0]
 [  14   30   28  998  668   40]
 [   0    0    0  162 2435  737]
 [   0    0    0    1  747 2673]]


## Decision Tree

In [38]:
# Load model and evaluate

model_choice = "Decision Tree"
params = params_map[model_choice]
model = models_map[model_choice](**params)
model_name = model_choice

print("Fitting model {}...".format(model_choice))
model.fit(X_train_random_split, y_train_random_split.pm25)
print("Done")

print("Performing cross-validation...")
scores = cross_validate(model, X_train_random_split, y_train_random_split.pm25, cv=5, scoring=scoring)
print("Done")

print("Mean R2: {:.2f} (+/- {:.2f})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(scores['test_neg_root_mean_squared_error'].mean(), scores['test_neg_root_mean_squared_error'].std()))
print("Mean MAE: {:.2f} (+/- {:.2f})".format(scores['test_neg_mean_absolute_error'].mean(), scores['test_neg_mean_absolute_error'].std()))

print("")
print("Evaluating with Hold out Test set...")
preds = model.predict(X_test_random_split)
print("Done!")

print("Showing evaluation metrics results")
display_Results(y_test_random_split, preds, writeFile = False)


Fitting model Decision Tree...
Done
Performing cross-validation...
Done
Mean R2: 0.91 (+/- 0.01)
Mean RMSE: -25.45 (+/- 0.79)
Mean MAE: -10.36 (+/- 0.32)

Evaluating with Hold out Test set...
Done!
Showing evaluation metrics results
Regression Score for PM2.5 values
RMSE: 24.99842047298847
MAE: 10.069561345509854
R2 score: 0.9155859016707635
Regression Score for AQI values
RMSE: 24.50478994925832
MAE: 9.155727981578867
R2 score: 0.906473642335538
Classification score for AQI rank
Accuracy Score: 90.20%
F1 score: 90.20%
Confusion Matrix:
[[ 312    1    0    4    0    0]
 [   1    2    0    0    0    0]
 [   0    2    0    1    0    0]
 [   0    0    3 1557  200   18]
 [   0    0    0  110 2937  287]
 [   0    0    0   12  229 3180]]


## Random Forest

In [39]:
# Load model and evaluate

model_choice = "Random Forest"
params = params_map[model_choice]
model = models_map[model_choice](**params)
model_name = model_choice

print("Fitting model {}...".format(model_choice))
model.fit(X_train_random_split, y_train_random_split.pm25)
print("Done")

print("Performing cross-validation...")
scores = cross_validate(model, X_train_random_split, y_train_random_split.pm25, cv=5, scoring=scoring)
print("Done")

print("Mean R2: {:.2f} (+/- {:.2f})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(scores['test_neg_root_mean_squared_error'].mean(), scores['test_neg_root_mean_squared_error'].std()))
print("Mean MAE: {:.2f} (+/- {:.2f})".format(scores['test_neg_mean_absolute_error'].mean(), scores['test_neg_mean_absolute_error'].std()))

print("")
print("Evaluating with Hold out Test set...")
preds = model.predict(X_test_random_split)
print("Done!")

print("Showing evaluation metrics results")
display_Results(y_test_random_split, preds, writeFile = False)


Fitting model Random Forest...
Done
Performing cross-validation...
Done
Mean R2: 0.95 (+/- 0.00)
Mean RMSE: -18.65 (+/- 0.30)
Mean MAE: -8.40 (+/- 0.08)

Evaluating with Hold out Test set...
Done!
Showing evaluation metrics results
Regression Score for PM2.5 values
RMSE: 17.623793123961928
MAE: 7.853225644257528
R2 score: 0.9580445270471247
Regression Score for AQI values
RMSE: 18.81716087752003
MAE: 7.239902906813344
R2 score: 0.9448506689602466
Classification score for AQI rank
Accuracy Score: 91.79%
F1 score: 91.77%
Confusion Matrix:
[[ 316    1    0    0    0    0]
 [   2    1    0    0    0    0]
 [   0    2    0    1    0    0]
 [   0    0    0 1532  238    8]
 [   0    0    0   50 3027  257]
 [   0    0    0    1  167 3253]]


## Extra Trees

In [41]:
# Load model and evaluate

model_choice = "Extra Trees"
params = params_map[model_choice]
model = models_map[model_choice](**params)
model_name = model_choice

print("Fitting model {}...".format(model_choice))
model.fit(X_train_random_split, y_train_random_split.pm25)
print("Done")

print("Performing cross-validation...")
scores = cross_validate(model, X_train_random_split, y_train_random_split.pm25, cv=5, scoring=scoring)
print("Done")

print("Mean R2: {:.2f} (+/- {:.2f})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(scores['test_neg_root_mean_squared_error'].mean(), scores['test_neg_root_mean_squared_error'].std()))
print("Mean MAE: {:.2f} (+/- {:.2f})".format(scores['test_neg_mean_absolute_error'].mean(), scores['test_neg_mean_absolute_error'].std()))

print("")
print("Evaluating with Hold out Test set...")
preds = model.predict(X_test_random_split)
print("Done!")

print("Showing evaluation metrics results")
display_Results(y_test_random_split, preds, writeFile = False)


Fitting model Extra Trees...
Done
Performing cross-validation...
Done
Mean R2: 0.95 (+/- 0.00)
Mean RMSE: -19.18 (+/- 0.25)
Mean MAE: -9.01 (+/- 0.08)

Evaluating with Hold out Test set...
Done!
Showing evaluation metrics results
Regression Score for PM2.5 values
RMSE: 18.306232804110564
MAE: 8.631508046241223
R2 score: 0.9547323644386707
Regression Score for AQI values
RMSE: 19.363892501979286
MAE: 7.952036456571447
R2 score: 0.9415993905933762
Classification score for AQI rank
Accuracy Score: 91.11%
F1 score: 91.10%
Confusion Matrix:
[[ 317    0    0    0    0    0]
 [   2    1    0    0    0    0]
 [   0    2    0    1    0    0]
 [   0    0    0 1506  266    6]
 [   0    0    0   34 3009  291]
 [   0    0    0    1  184 3236]]


## Neural Network Regression

In [46]:
# Load model and evaluate

model_choice = "Neural Network"
params = params_map[model_choice]
model = models_map[model_choice](**params)
model_name = model_choice

print("Fitting model {}...".format(model_choice))
model.fit(X_train_random_split, y_train_random_split.pm25)
print("Done")

print("Performing cross-validation...")
scores = cross_validate(model, X_train_random_split, y_train_random_split.pm25, cv=5, scoring=scoring)
print("Done")

print("Mean R2: {:.2f} (+/- {:.2f})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(scores['test_neg_root_mean_squared_error'].mean(), scores['test_neg_root_mean_squared_error'].std()))
print("Mean MAE: {:.2f} (+/- {:.2f})".format(scores['test_neg_mean_absolute_error'].mean(), scores['test_neg_mean_absolute_error'].std()))

print("")
print("Evaluating with Hold out Test set...")
preds = model.predict(X_test_random_split)
print("Done!")

print("Showing evaluation metrics results")
display_Results(y_test_random_split, preds, writeFile = False)


Fitting model Neural Network...
Done
Performing cross-validation...
Done
Mean R2: 0.77 (+/- 0.01)
Mean RMSE: -41.74 (+/- 0.71)
Mean MAE: -29.70 (+/- 0.70)

Evaluating with Hold out Test set...
Done!
Showing evaluation metrics results
Regression Score for PM2.5 values
RMSE: 42.54010507246805
MAE: 30.99886832025937
R2 score: 0.7555517315957951
Regression Score for AQI values
RMSE: 43.08923459107507
MAE: 29.021795036191488
R2 score: 0.7108190951960724
Classification score for AQI rank
Accuracy Score: 67.99%
F1 score: 68.65%
Confusion Matrix:
[[ 127  176   14    0    0    0]
 [   0    3    0    0    0    0]
 [   1    2    0    0    0    0]
 [   0    6   25  950  780   17]
 [   0    0    1  181 2283  869]
 [   0    0    0    0  763 2658]]


In [51]:
model.n_layers_

5

## Catboost

In [43]:
# Load model and evaluate

model_choice = "Catboost"
params = params_map[model_choice]
model = models_map[model_choice](**params)
model_name = model_choice

print("Fitting model {}...".format(model_choice))
model.fit(X_train_random_split, y_train_random_split.pm25)
print("Done")

print("Performing cross-validation...")
scores = cross_validate(model, X_train_random_split, y_train_random_split.pm25, cv=5, scoring=scoring)
print("Done")

print("Mean R2: {:.2f} (+/- {:.2f})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(scores['test_neg_root_mean_squared_error'].mean(), scores['test_neg_root_mean_squared_error'].std()))
print("Mean MAE: {:.2f} (+/- {:.2f})".format(scores['test_neg_mean_absolute_error'].mean(), scores['test_neg_mean_absolute_error'].std()))

print("")
print("Evaluating with Hold out Test set...")
preds = model.predict(X_test_random_split)
print("Done!")

print("Showing evaluation metrics results")
display_Results(y_test_random_split, preds, writeFile = False)


Fitting model Catboost...
Done
Performing cross-validation...
Done
Mean R2: 0.93 (+/- 0.00)
Mean RMSE: -23.20 (+/- 0.41)
Mean MAE: -14.03 (+/- 0.21)

Evaluating with Hold out Test set...
Done!
Showing evaluation metrics results
Regression Score for PM2.5 values
RMSE: 22.63923822909437
MAE: 13.7940851130315
R2 score: 0.9307669494871388
Regression Score for AQI values
RMSE: 24.50849050222627
MAE: 12.85708225207424
R2 score: 0.9064453927260555
Classification score for AQI rank
Accuracy Score: 85.54%
F1 score: 85.51%
Confusion Matrix:
[[ 317    0    0    0    0    0]
 [   2    1    0    0    0    0]
 [   1    1    0    1    0    0]
 [   0    0    2 1377  393    6]
 [   0    0    0   79 2756  499]
 [   0    0    0    2  295 3124]]


## XGBoost

In [44]:
# Load model and evaluate

model_choice = "XGBoost"
params = params_map[model_choice]
model = models_map[model_choice](**params)
model_name = model_choice

print("Fitting model {}...".format(model_choice))
model.fit(X_train_random_split, y_train_random_split.pm25)
print("Done")

print("Performing cross-validation...")
scores = cross_validate(model, X_train_random_split, y_train_random_split.pm25, cv=5, scoring=scoring)
print("Done")

print("Mean R2: {:.2f} (+/- {:.2f})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(scores['test_neg_root_mean_squared_error'].mean(), scores['test_neg_root_mean_squared_error'].std()))
print("Mean MAE: {:.2f} (+/- {:.2f})".format(scores['test_neg_mean_absolute_error'].mean(), scores['test_neg_mean_absolute_error'].std()))

print("")
print("Evaluating with Hold out Test set...")
preds = model.predict(X_test_random_split)
print("Done!")

print("Showing evaluation metrics results")
display_Results(y_test_random_split, preds, writeFile = False)


Fitting model XGBoost...
Done
Performing cross-validation...
Done
Mean R2: 0.95 (+/- 0.00)
Mean RMSE: -19.98 (+/- 0.32)
Mean MAE: -9.37 (+/- 0.14)

Evaluating with Hold out Test set...
Done!
Showing evaluation metrics results
Regression Score for PM2.5 values
RMSE: 19.392632074792843
MAE: 9.157300942887863
R2 score: 0.9492000406784373
Regression Score for AQI values
RMSE: 20.290311417750537
MAE: 8.432324628306588
R2 score: 0.935877642908949
Classification score for AQI rank
Accuracy Score: 90.50%
F1 score: 90.50%
Confusion Matrix:
[[ 315    2    0    0    0    0]
 [   1    2    0    0    0    0]
 [   0    2    0    1    0    0]
 [   0    0    3 1541  225    9]
 [   0    0    0  101 2943  290]
 [   0    0    0    4  203 3214]]
