In [1]:
import os
import time
from glob import glob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import datetime
import joblib
from functools import partial

# Import ML Packages
import sklearn
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

import xgboost

In [2]:
file_list = glob("../data/input_1D/*.csv")

pm_df = [pd.read_csv(file, low_memory=False) for file in file_list]
pm_df = pd.concat(pm_df, ignore_index=True)

print("Data loading finished!")

Data loading finished!


# XGBoost

In [3]:
def train_RF(all_df, buffer_avg=True, target="047", validation=False):
    
    if buffer_avg:
        feature_list = ['aod_047', 'aod_055', 'aod_buffer_047', 'aod_buffer_055',  # 'avg_pm25',
                        'day_cos', 'day_sin', 'daymet_dayl', 'daymet_lat', 'daymet_lon',
                        'daymet_prcp', 'daymet_srad', 'daymet_tmax', 'daymet_tmin', 'daymet_vp',
                        'dem', 'gridmet_th', 'gridmet_vs',  # 'knnidw_distance', 'knnidw_pm25', 'knnidw_pm25_val',
                        'month_cos', 'month_sin', 'ndvi', 'wildfire_smoke',
                        'year']

    else:
        feature_list = ['aod_047', 'aod_055',  # 'aod_buffer_047', 'aod_buffer_055',  # 'avg_pm25',
                        'day_cos', 'day_sin', 'daymet_dayl', 'daymet_lat', 'daymet_lon',
                        'daymet_prcp', 'daymet_srad', 'daymet_tmax', 'daymet_tmin', 'daymet_vp',
                        'dem', 'gridmet_th', 'gridmet_vs',  # 'knnidw_distance', 'knnidw_pm25', 'knnidw_pm25_val',
                        'month_cos', 'month_sin', 'ndvi', 'wildfire_smoke',
                        'year']

    all_df = all_df[feature_list]
    # Filter out records have grount truth
    all_df = all_df[all_df[f'aod_{target}'].notnull()]
    if buffer_avg:
        all_df = all_df[all_df[f'aod_buffer_{target}'].notnull()]
        print(f"Total Samples | Buffer: {buffer_avg}: {all_df.shape}")

    # Fill NaNs or RF cannot work
    all_df = all_df.fillna(-1)

    # Setting Hyperparameters. Please refer to the SI for more information
    if buffer_avg:
        xgb_params = dict(learning_rate=np.arange(0.05, 0.5, 0.05),
                          eta=np.arange(0.1, 0.5, 0.1),
                          n_estimators=np.arange(100, 1500, 100),
                          gamma=np.arange(1, 10, 1),
                          subsample=np.arange(0.1, 1.0, 0.01),
                          max_depth=[int(i) for i in np.arange(3, 12, 1)],
                          colsample_bytree=np.arange(0.1, 1.0, 0.2),
                          )
    else:
        xgb_params = dict(learning_rate=np.arange(0.01, 0.3, 0.03),
                          eta=np.arange(0.1, 0.5, 0.1),
                          n_estimators=np.arange(100, 1500, 100),
                          gamma=np.arange(1, 10, 1),
                          subsample=np.arange(0.8, 1.0, 0.02),
                          max_depth=[int(i) for i in np.arange(1, 12, 1)],
                          colsample_bytree=np.arange(0.1, 1.0, 0.2),
                          )
    
    num_fold = 1
    kfold = KFold(n_splits=10, shuffle=True)
    
    for train_idx, test_idx in kfold.split(all_df):
        
        print(f"Num Fold: {num_fold}")
        
        val_df = all_df.iloc[test_idx]
        train_df = all_df.iloc[train_idx]
        

        # inititalization
        xgb_regressor = xgboost.XGBRegressor(booster='gbtree',
                                             # objective='reg:squarederror',
                                             verbosity=0,
                                             tree_method='gpu_hist')

        # find optimal parameters for random forest regressor using  RandomizedSearchCV.
        # Set random_state=42 and be careful about scoring type
        xgb_regressor_cv = RandomizedSearchCV(xgb_regressor, xgb_params, cv=5,
                                              n_iter=50,
                                              scoring='neg_root_mean_squared_error',
                                              n_jobs=12)

        if validation:
            train_df, val_df = train_test_split(train_df, test_size=0.1)

            X_train = train_df.drop(columns=['aod_047', 'aod_055'])
            y_train = train_df[[f'aod_{target}']]

            X_val = val_df.drop(columns=['aod_047', 'aod_055'])
            y_val = val_df[[f'aod_{target}']]
        else:
            val_df = None
            X_train = train_df.drop(columns=['aod_047', 'aod_055'])
            y_train = train_df[[f'aod_{target}']]

        xgb_regressor_cv.fit(X_train, y_train.values.ravel())
        best_params = xgb_regressor_cv.best_params_
        print(best_params)

        # create best_rf_regressor sunig the parameters above and fit it to training data
        best_xgb_regressor = xgb_regressor_cv.best_estimator_
        best_xgb_regressor.fit(X_train, y_train.values.ravel())
        # model evaluation for training set
        train_r2_xgb = round(best_xgb_regressor.score(X_train, y_train), 2)
        print('Training R2 score of XGBoost is {}'.format(train_r2_xgb))
        y_train_predicted_xgb = best_xgb_regressor.predict(X_train)
        rmse_train_xgb = (np.sqrt(mean_squared_error(y_train, y_train_predicted_xgb)))
        print('RMSE on the training set for the XGBoost model is: {}'.format(rmse_train_xgb))
        mbe_train_xgb = np.mean(y_train_predicted_xgb - y_train.values.squeeze())
        print("MBE on training set is for the XGBoost model is: {}".format(mbe_train_xgb))

        if validation:
            # model evaluation for test set
            y_test_predicted_xgb = best_xgb_regressor.predict(X_val)
            rmse_test_xgb = (np.sqrt(mean_squared_error(y_val, y_test_predicted_xgb)))
            print("RMSE on testing set is for the XGBoost model is: {}".format(rmse_test_xgb))

            mbe_test_xgb = np.mean(y_test_predicted_xgb - y_val.values.squeeze())
            print("MBE on testing set is for the XGBoost model is: {}".format(mbe_test_xgb))
        
        num_fold += 1

    print("================================================================================")

In [4]:
# Start Training Model with Buffer Average
# print("Start Training Model with Buffer Average...")
# train_RF(all_df=pm_df, buffer_avg=True, target="047", validation=True)
# train_RF(all_df=pm_df, buffer_avg=True, target="055", validation=True)

# Start Training Model with Buffer Average
print("Start Training Model w/o Buffer Average...")
train_RF(all_df=pm_df, buffer_avg=False, target="047", validation=True)
train_RF(all_df=pm_df, buffer_avg=False, target="055", validation=True)

Start Training Model w/o Buffer Average...
Num Fold: 1




{'subsample': 0.8200000000000001, 'n_estimators': 700, 'max_depth': 11, 'learning_rate': 0.19, 'gamma': 3, 'eta': 0.30000000000000004, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.94
RMSE on the training set for the XGBoost model is: 36.43885098130988
MBE on training set is for the XGBoost model is: -0.006838233702145408
RMSE on testing set is for the XGBoost model is: 82.54945891876815
MBE on testing set is for the XGBoost model is: 0.7028795780456856
Num Fold: 2




{'subsample': 0.9600000000000002, 'n_estimators': 1100, 'max_depth': 11, 'learning_rate': 0.22, 'gamma': 1, 'eta': 0.30000000000000004, 'colsample_bytree': 0.7000000000000001}
Training R2 score of XGBoost is 0.97
RMSE on the training set for the XGBoost model is: 24.55542521116732
MBE on training set is for the XGBoost model is: -0.00465296548366278
RMSE on testing set is for the XGBoost model is: 80.91742591930247
MBE on testing set is for the XGBoost model is: 0.4661441871866485
Num Fold: 3




{'subsample': 0.8, 'n_estimators': 600, 'max_depth': 11, 'learning_rate': 0.19, 'gamma': 1, 'eta': 0.1, 'colsample_bytree': 0.7000000000000001}
Training R2 score of XGBoost is 0.92
RMSE on the training set for the XGBoost model is: 41.357271871910406
MBE on training set is for the XGBoost model is: -0.00228407204733947
RMSE on testing set is for the XGBoost model is: 82.98095509536745
MBE on testing set is for the XGBoost model is: 0.3254806427097767
Num Fold: 4
{'subsample': 0.9000000000000001, 'n_estimators': 1400, 'max_depth': 10, 'learning_rate': 0.19, 'gamma': 4, 'eta': 0.4, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.95
RMSE on the training set for the XGBoost model is: 31.96631310039628
MBE on training set is for the XGBoost model is: -0.008127053272803778
RMSE on testing set is for the XGBoost model is: 81.37127323700919
MBE on testing set is for the XGBoost model is: 0.957002674591931
Num Fold: 5




{'subsample': 0.9800000000000002, 'n_estimators': 1100, 'max_depth': 11, 'learning_rate': 0.06999999999999999, 'gamma': 8, 'eta': 0.30000000000000004, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.89
RMSE on the training set for the XGBoost model is: 48.36963892907919
MBE on training set is for the XGBoost model is: -0.006484482623156848
RMSE on testing set is for the XGBoost model is: 80.60226761776337
MBE on testing set is for the XGBoost model is: -0.026053608634408566
Num Fold: 6




{'subsample': 0.8400000000000001, 'n_estimators': 900, 'max_depth': 11, 'learning_rate': 0.06999999999999999, 'gamma': 9, 'eta': 0.30000000000000004, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.88
RMSE on the training set for the XGBoost model is: 50.87487964089153
MBE on training set is for the XGBoost model is: -0.01065743914026476
RMSE on testing set is for the XGBoost model is: 81.8484879627173
MBE on testing set is for the XGBoost model is: 0.24519954064187524
Num Fold: 7




{'subsample': 0.9600000000000002, 'n_estimators': 1000, 'max_depth': 11, 'learning_rate': 0.09999999999999999, 'gamma': 9, 'eta': 0.1, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.91
RMSE on the training set for the XGBoost model is: 42.69163065932037
MBE on training set is for the XGBoost model is: -0.006675145970290243
RMSE on testing set is for the XGBoost model is: 81.52046342577799
MBE on testing set is for the XGBoost model is: 0.14079648147742896
Num Fold: 8




{'subsample': 0.8800000000000001, 'n_estimators': 500, 'max_depth': 11, 'learning_rate': 0.16, 'gamma': 2, 'eta': 0.4, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.9
RMSE on the training set for the XGBoost model is: 47.00636527810286
MBE on training set is for the XGBoost model is: -0.0063428540848732996
RMSE on testing set is for the XGBoost model is: 83.88874966261724
MBE on testing set is for the XGBoost model is: -0.2192263934691749
Num Fold: 9




{'subsample': 0.8400000000000001, 'n_estimators': 1000, 'max_depth': 11, 'learning_rate': 0.16, 'gamma': 1, 'eta': 0.30000000000000004, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.95
RMSE on the training set for the XGBoost model is: 31.468517328675066
MBE on training set is for the XGBoost model is: -0.00543976438425297
RMSE on testing set is for the XGBoost model is: 82.61628303604512
MBE on testing set is for the XGBoost model is: 0.11012735284023649
Num Fold: 10




{'subsample': 0.9000000000000001, 'n_estimators': 500, 'max_depth': 11, 'learning_rate': 0.22, 'gamma': 7, 'eta': 0.1, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.92
RMSE on the training set for the XGBoost model is: 40.9857090794889
MBE on training set is for the XGBoost model is: -0.014208427204259006
RMSE on testing set is for the XGBoost model is: 82.48829569498876
MBE on testing set is for the XGBoost model is: -0.050285440053400875
Num Fold: 1




{'subsample': 0.9200000000000002, 'n_estimators': 1300, 'max_depth': 11, 'learning_rate': 0.04, 'gamma': 5, 'eta': 0.1, 'colsample_bytree': 0.5000000000000001}
Training R2 score of XGBoost is 0.85
RMSE on the training set for the XGBoost model is: 41.22629816905262
MBE on training set is for the XGBoost model is: -0.009717351406406352
RMSE on testing set is for the XGBoost model is: 60.43983008678054
MBE on testing set is for the XGBoost model is: 0.09101393856327651
Num Fold: 2




{'subsample': 0.8600000000000001, 'n_estimators': 800, 'max_depth': 11, 'learning_rate': 0.16, 'gamma': 7, 'eta': 0.2, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.94
RMSE on the training set for the XGBoost model is: 26.74124059211814
MBE on training set is for the XGBoost model is: -0.00011739805579883692
RMSE on testing set is for the XGBoost model is: 59.8481129662812
MBE on testing set is for the XGBoost model is: 0.29040258334422087
Num Fold: 3




{'subsample': 0.8400000000000001, 'n_estimators': 700, 'max_depth': 11, 'learning_rate': 0.19, 'gamma': 1, 'eta': 0.30000000000000004, 'colsample_bytree': 0.7000000000000001}
Training R2 score of XGBoost is 0.94
RMSE on the training set for the XGBoost model is: 27.253038681353893
MBE on training set is for the XGBoost model is: -0.008307569282939015
RMSE on testing set is for the XGBoost model is: 59.47655226127396
MBE on testing set is for the XGBoost model is: -0.02194302342376786
Num Fold: 4




{'subsample': 0.8400000000000001, 'n_estimators': 1000, 'max_depth': 11, 'learning_rate': 0.09999999999999999, 'gamma': 8, 'eta': 0.1, 'colsample_bytree': 0.5000000000000001}
Training R2 score of XGBoost is 0.91
RMSE on the training set for the XGBoost model is: 32.2995380680289
MBE on training set is for the XGBoost model is: -0.0034414681122831867
RMSE on testing set is for the XGBoost model is: 59.95067327052892
MBE on testing set is for the XGBoost model is: 0.19140121806281912
Num Fold: 5




{'subsample': 0.8600000000000001, 'n_estimators': 500, 'max_depth': 11, 'learning_rate': 0.13, 'gamma': 7, 'eta': 0.4, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.88
RMSE on the training set for the XGBoost model is: 37.351053311010126
MBE on training set is for the XGBoost model is: -0.013071872775581543
RMSE on testing set is for the XGBoost model is: 61.301434958837106
MBE on testing set is for the XGBoost model is: 0.20164535549902238
Num Fold: 6




{'subsample': 0.8400000000000001, 'n_estimators': 1400, 'max_depth': 11, 'learning_rate': 0.04, 'gamma': 4, 'eta': 0.2, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.87
RMSE on the training set for the XGBoost model is: 38.62708162319665
MBE on training set is for the XGBoost model is: -0.003811717407529024
RMSE on testing set is for the XGBoost model is: 60.54099886403806
MBE on testing set is for the XGBoost model is: 0.046434276742628275
Num Fold: 7




{'subsample': 0.9400000000000002, 'n_estimators': 900, 'max_depth': 10, 'learning_rate': 0.25, 'gamma': 4, 'eta': 0.2, 'colsample_bytree': 0.5000000000000001}
Training R2 score of XGBoost is 0.93
RMSE on the training set for the XGBoost model is: 28.961854382501745
MBE on training set is for the XGBoost model is: -0.005218178976377413
RMSE on testing set is for the XGBoost model is: 63.105615111941056
MBE on testing set is for the XGBoost model is: -0.17798231125616895
Num Fold: 8




{'subsample': 0.9600000000000002, 'n_estimators': 500, 'max_depth': 11, 'learning_rate': 0.16, 'gamma': 2, 'eta': 0.1, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.9
RMSE on the training set for the XGBoost model is: 34.631931946288056
MBE on training set is for the XGBoost model is: -0.0045113964518076035
RMSE on testing set is for the XGBoost model is: 60.99886865534004
MBE on testing set is for the XGBoost model is: -0.10839745904989904
Num Fold: 9




{'subsample': 0.9000000000000001, 'n_estimators': 700, 'max_depth': 11, 'learning_rate': 0.09999999999999999, 'gamma': 6, 'eta': 0.2, 'colsample_bytree': 0.9000000000000001}
Training R2 score of XGBoost is 0.89
RMSE on the training set for the XGBoost model is: 35.90272054665214
MBE on training set is for the XGBoost model is: -0.005363469615559755
RMSE on testing set is for the XGBoost model is: 61.073729165868514
MBE on testing set is for the XGBoost model is: 0.14931762658867348
Num Fold: 10




{'subsample': 0.8200000000000001, 'n_estimators': 1400, 'max_depth': 10, 'learning_rate': 0.13, 'gamma': 9, 'eta': 0.4, 'colsample_bytree': 0.5000000000000001}
Training R2 score of XGBoost is 0.92
RMSE on the training set for the XGBoost model is: 31.112699664341584
MBE on training set is for the XGBoost model is: 0.0019438448611496133
RMSE on testing set is for the XGBoost model is: 59.56033344751647
MBE on testing set is for the XGBoost model is: 0.15738527140541875


# Random Forest

In [5]:
def train_RF(all_df, buffer_avg=True, target="047", validation=False):
    
    if buffer_avg:
        feature_list = ['aod_047', 'aod_055', 'aod_buffer_047', 'aod_buffer_055',  # 'avg_pm25',
                        'day_cos', 'day_sin', 'daymet_dayl', 'daymet_lat', 'daymet_lon',
                        'daymet_prcp', 'daymet_srad', 'daymet_tmax', 'daymet_tmin', 'daymet_vp',
                        'dem', 'gridmet_th', 'gridmet_vs',  # 'knnidw_distance', 'knnidw_pm25', 'knnidw_pm25_val',
                        'month_cos', 'month_sin', 'ndvi', 'wildfire_smoke',
                        'year']

    else:
        feature_list = ['aod_047', 'aod_055',  # 'aod_buffer_047', 'aod_buffer_055',  # 'avg_pm25',
                        'day_cos', 'day_sin', 'daymet_dayl', 'daymet_lat', 'daymet_lon',
                        'daymet_prcp', 'daymet_srad', 'daymet_tmax', 'daymet_tmin', 'daymet_vp',
                        'dem', 'gridmet_th', 'gridmet_vs',  # 'knnidw_distance', 'knnidw_pm25', 'knnidw_pm25_val',
                        'month_cos', 'month_sin', 'ndvi', 'wildfire_smoke',
                        'year']

    all_df = all_df[feature_list]
    # Filter out records have grount truth
    all_df = all_df[all_df[f'aod_{target}'].notnull()]
    if buffer_avg:
        all_df = all_df[all_df[f'aod_buffer_{target}'].notnull()]
        print(f"Total Samples | Buffer: {buffer_avg}: {all_df.shape}")
    else:
        print(f"Total Samples | Buffer: {buffer_avg}: {all_df.shape}")

    # Fill NaNs or RF cannot work
    all_df = all_df.fillna(-1)

    

    rf_params = {'n_estimators': np.arange(30, 200, 10),
                 'max_depth': np.arange(1, 15, 1),
                 'min_samples_split': np.arange(2, 50, 1),
                 'min_samples_leaf': np.arange(2, 50, 1),
                 'max_features': ['sqrt', 'log2']}  # could also add 'criterion':['mse', 'mae'],
    
    num_fold = 1
    kfold = KFold(n_splits=10, shuffle=True)
    
    for train_idx, test_idx in kfold.split(all_df):
        
        print(f"Num Fold: {num_fold}")
        
        val_df = all_df.iloc[test_idx]
        train_df = all_df.iloc[train_idx]
        

        # inititalization
        # rf_regressor = RandomForestRegressor(random_state=42)
        rf_regressor = RandomForestRegressor()

        # find optimal parameters for random forest regressor using  RandomizedSearchCV.
        # Set random_state=42 and be careful about scoring type
        xgb_regressor_cv = RandomizedSearchCV(rf_regressor, rf_params, cv=5,
                                              n_iter=50,
                                              scoring='neg_root_mean_squared_error',
                                              n_jobs=16)

        if validation:
            train_df, val_df = train_test_split(train_df, test_size=0.1)

            X_train = train_df.drop(columns=['aod_047', 'aod_055'])
            y_train = train_df[[f'aod_{target}']]

            X_val = val_df.drop(columns=['aod_047', 'aod_055'])
            y_val = val_df[[f'aod_{target}']]
        else:
            val_df = None
            X_train = train_df.drop(columns=['aod_047', 'aod_055'])
            y_train = train_df[[f'aod_{target}']]

        xgb_regressor_cv.fit(X_train, y_train.values.ravel())
        best_params = xgb_regressor_cv.best_params_
        print(best_params)

        # create best_rf_regressor sunig the parameters above and fit it to training data
        best_xgb_regressor = xgb_regressor_cv.best_estimator_
        best_xgb_regressor.fit(X_train, y_train.values.ravel())
        # model evaluation for training set
        train_r2_xgb = round(best_xgb_regressor.score(X_train, y_train), 2)
        print('Training R2 score of RF is {}'.format(train_r2_xgb))
        y_train_predicted_xgb = best_xgb_regressor.predict(X_train)
        rmse_train_xgb = (np.sqrt(mean_squared_error(y_train, y_train_predicted_xgb)))
        print('RMSE on the training set for the RF model is: {}'.format(rmse_train_xgb))
        mbe_train_xgb = np.mean(y_train_predicted_xgb - y_train.values.squeeze())
        print("MBE on training set is for the RF model is: {}".format(mbe_train_xgb))

        if validation:
            # model evaluation for test set
            y_test_predicted_xgb = best_xgb_regressor.predict(X_val)
            rmse_test_xgb = (np.sqrt(mean_squared_error(y_val, y_test_predicted_xgb)))
            print("RMSE on testing set is for the RF model is: {}".format(rmse_test_xgb))

            mbe_test_xgb = np.mean(y_test_predicted_xgb - y_val.values.squeeze())
            print("MBE on testing set is for the RF model is: {}".format(mbe_test_xgb))
        
        num_fold += 1

    print("================================================================================")

In [7]:
# Start Training Model with Buffer Average
print("Start Training Model with Buffer Average...")
train_RF(all_df=pm_df, buffer_avg=True, target="047", validation=True)
train_RF(all_df=pm_df, buffer_avg=True, target="055", validation=True)

Start Training Model with Buffer Average...
Total Samples | Buffer: True: (1275632, 22)
Num Fold: 1
{'n_estimators': 170, 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'max_depth': 13}
Training R2 score of RF is 0.96
RMSE on the training set for the RF model is: 30.360861978791387
MBE on training set is for the RF model is: -0.007318139551022955
RMSE on testing set is for the RF model is: 32.48922624657474
MBE on testing set is for the RF model is: -0.0825099251621066
Num Fold: 2
{'n_estimators': 80, 'min_samples_split': 5, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'max_depth': 13}
Training R2 score of RF is 0.96
RMSE on the training set for the RF model is: 30.58117788928966
MBE on training set is for the RF model is: -0.009090653213499117
RMSE on testing set is for the RF model is: 31.627178899643855
MBE on testing set is for the RF model is: -0.021065943605175344
Num Fold: 3
{'n_estimators': 110, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_feat

RMSE on the training set for the RF model is: 22.245747298061254
MBE on training set is for the RF model is: -0.003723200249356273
RMSE on testing set is for the RF model is: 22.87750866474507
MBE on testing set is for the RF model is: 0.052472698039162984


In [6]:
# Start Training Model with Buffer Average
print("Start Training Model with Buffer Average...")
# train_RF(all_df=pm_df, buffer_avg=True, target="047", validation=True)
# train_RF(all_df=pm_df, buffer_avg=True, target="055", validation=True)

# Start Training Model with Buffer Average
print("Start Training Model w/o Buffer Average...")
train_RF(all_df=pm_df, buffer_avg=False, target="047", validation=True)
train_RF(all_df=pm_df, buffer_avg=False, target="055", validation=True)

Start Training Model with Buffer Average...
Start Training Model w/o Buffer Average...
Total Samples | Buffer: False: (1275632, 20)
Num Fold: 1
{'n_estimators': 100, 'min_samples_split': 22, 'min_samples_leaf': 13, 'max_features': 'sqrt', 'max_depth': 13}
Training R2 score of RF is 0.54
RMSE on the training set for the RF model is: 98.64197380543906
MBE on training set is for the RF model is: 0.007558329525689623
RMSE on testing set is for the RF model is: 104.02996707387894
MBE on testing set is for the RF model is: 0.30160752654664413
Num Fold: 2
{'n_estimators': 60, 'min_samples_split': 41, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 14}
Training R2 score of RF is 0.56
RMSE on the training set for the RF model is: 97.03707708524625
MBE on training set is for the RF model is: 0.006239897757268297
RMSE on testing set is for the RF model is: 99.948024047647
MBE on testing set is for the RF model is: -0.060663130540534876
Num Fold: 3
{'n_estimators': 120, 'min_samples_sp

RMSE on the training set for the RF model is: 70.83957046814007
MBE on training set is for the RF model is: -0.007291361835540957
RMSE on testing set is for the RF model is: 74.89681763942345
MBE on testing set is for the RF model is: -0.1351011943226997
