# Hyperparameter Tuning

Using Scikit-Learn’s RandomizedSearchCV method, we can define a grid of hyperparameter ranges, and randomly sample from the grid, performing K-Fold CV with each combination of values.

In [1]:
import time
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
base_regressor = RandomForestRegressor(random_state = 42)

print('Parameters currently in use:\n')
pprint(base_regressor.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [3]:
bootstrap = [True, False]

criteria = ["squared_error", "absolute_error", "friedman_mse", "poisson"]

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

max_features = ['sqrt', 'log2', 1.0]
max_features.append(None)

min_samples_leaf = [1, 2, 4, 6]

min_samples_split = [1, 2, 5, 10]

n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)]

warm_start = [True, False]

# Create the random grid
random_grid = {
    'bootstrap'        : bootstrap,
    'criterion'        : criteria,
    'max_depth'        : max_depth,
    'max_features'     : max_features,
    'min_samples_leaf' : min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators'     : n_estimators,
    'warm_start'       : warm_start
}

pprint(random_grid)

{'bootstrap': [True, False],
 'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['sqrt', 'log2', 1.0, None],
 'min_samples_leaf': [1, 2, 4, 6],
 'min_samples_split': [1, 2, 5, 10],
 'n_estimators': [10, 231, 452, 673, 894, 1115, 1336, 1557, 1778, 2000],
 'warm_start': [True, False]}


In [4]:
base_path = '/kaggle/input/self-reported-qol/20230625-processed-'

df_physical      = pd.read_csv(base_path + 'physical-qol.csv')
df_psychological = pd.read_csv(base_path + 'psychological-qol.csv')

df_physical.drop('id', axis=1, inplace=True)         # id
df_physical.drop('day', axis=1, inplace=True)        # day
df_psychological.drop('id', axis=1, inplace=True)    # id
df_psychological.drop('day', axis=1, inplace=True)   # day

phy_all_tmp = df_physical.copy()
phy_init_set_tmp = phy_all_tmp.query("group in ('Initial Set')").copy()
phy_ufpi_ufc_tmp = phy_all_tmp.query("group in ('UFPI', 'UFC')").copy()

psy_all_tmp = df_psychological.copy()
psy_init_set_tmp = psy_all_tmp.query("group in ('Initial Set')").copy()
psy_ufpi_ufc_tmp = psy_all_tmp.query("group in ('UFPI', 'UFC')").copy()

phy_all_tmp.drop("group", axis=1, inplace=True)
phy_init_set_tmp.drop("group", axis=1, inplace=True)
phy_ufpi_ufc_tmp.drop("group", axis=1, inplace=True)

psy_all_tmp.drop("group", axis=1, inplace=True)
psy_init_set_tmp.drop("group", axis=1, inplace=True)
psy_ufpi_ufc_tmp.drop("group", axis=1, inplace=True)

# Separating predictors from the values to be predicted.
phy_all           = phy_all_tmp.drop("phy_ref_score", axis=1)
phy_all_pred      = phy_all_tmp["phy_ref_score"].copy()
phy_init_set      = phy_init_set_tmp.drop("phy_ref_score", axis=1)
phy_init_set_pred = phy_init_set_tmp["phy_ref_score"].copy()
phy_ufpi_ufc      = phy_ufpi_ufc_tmp.drop("phy_ref_score", axis=1)
phy_ufpi_ufc_pred = phy_ufpi_ufc_tmp["phy_ref_score"].copy()

psy_all           = psy_all_tmp.drop("psy_ref_score", axis=1)
psy_all_pred      = psy_all_tmp["psy_ref_score"].copy()
psy_init_set      = psy_init_set_tmp.drop("psy_ref_score", axis=1)
psy_init_set_pred = psy_init_set_tmp["psy_ref_score"].copy()
psy_ufpi_ufc      = psy_ufpi_ufc_tmp.drop("psy_ref_score", axis=1)
psy_ufpi_ufc_pred = psy_ufpi_ufc_tmp["psy_ref_score"].copy()

In [5]:
def getDataset(domain_group):
    return {
        'phy_all': (phy_all, phy_all_pred),
        'psy_all': (psy_all, psy_all_pred),
        
        'phy_init_set': (phy_init_set, phy_init_set_pred),
        'psy_init_set': (psy_init_set, psy_init_set_pred),
        
        'phy_ufpi_ufc': (phy_ufpi_ufc, phy_ufpi_ufc_pred),
        'psy_ufpi_ufc': (psy_ufpi_ufc, psy_ufpi_ufc_pred),
    }[domain_group] 

In [6]:
features_phy, labels_phy = getDataset('phy_all')
train_features_phy, test_features_phy, train_labels_phy, test_labels_phy = train_test_split(features_phy, labels_phy, test_size = 0.3, random_state = 42)

In [7]:
features_psy, labels_psy = getDataset('psy_all')
train_features_psy, test_features_psy, train_labels_psy, test_labels_psy = train_test_split(features_psy, labels_psy, test_size = 0.3, random_state = 42)

In [8]:
def evaluate_mae_rmse(name, model, test_features, test_labels):
    predictions = model.predict(test_features)
    mae = mean_absolute_error(test_labels, predictions)
    rmse = mean_squared_error(test_labels, predictions, squared=False)
    print('Model Performance of', name, 'MAE = {:0.4f} RMSE = {:0.4f}'.format(mae, rmse))
    return mae, rmse

In [9]:
def get_baseline_metrics(domain_group):
    return {
        'phy_all': (5.6870, 8.0745),
        'psy_all': (5.4534, 7.7493),
    }[domain_group]

In [10]:
def get_test_features_labels(domain_group):
    return {
        'phy_all': (test_features_phy, test_labels_phy),
        'psy_all': (test_features_psy, test_labels_psy),
    }[domain_group]

In [11]:
def check_improvement(new_model, domain_group):
    b_mae, b_rmse = get_baseline_metrics(domain_group)
    test_features, test_labels = get_test_features_labels(domain_group)
    
    n_mae, n_rmse = evaluate_mae_rmse('New Model', new_model, test_features, test_labels)
    
    imp_mae  = (n_mae - b_mae) / b_mae
    imp_rmse = (n_rmse - b_rmse) / b_rmse
    
    print('Improvement of {:0.2f}% in MAE ({:0.2f})'.format(100 * imp_mae, n_mae))
    print('Improvement of {:0.2f}% in RMSE ({:0.2f})'.format(100 * imp_rmse, n_rmse))

In [12]:
def check_improvement_v2(df, domain_group):
    b_mae, b_rmse = get_baseline_metrics(domain_group) 
    n_mae  = df["mae_mean"].mean() 
    n_rmse = df["rmse_mean"].mean() 
    
    imp_mae  = abs(n_mae - b_mae) / b_mae
    imp_rmse = abs(n_rmse - b_rmse) / b_rmse
    
    print('Improvement of {:0.2f}% in MAE ({:0.2f})'.format(100 * imp_mae, n_mae))
    print('Improvement of {:0.2f}% in RMSE ({:0.2f})'.format(100 * imp_rmse, n_rmse))

In [13]:
cols_result_df = ['exec_index', 'name', 'model', 'n_folds', 'feature_selection', 'mae_mean', 'mae_std', 'rmse_mean', 'rmse_std', 'r2_mean', 'r2_std', 'time(s)']

In [14]:
iterations = 30
executions = 30
cv_number = 10

In [15]:
def rmsle_cv(exec_index, name, new_model, domain_group):
    start = time.time()
    
    X, y = getDataset(domain_group)
    
    kf = KFold(cv_number, shuffle = True, random_state = 42).get_n_splits(X.values)
    rmse = np.sqrt(-cross_val_score(new_model, X.values, y, scoring = "neg_mean_squared_error", cv = kf))
    mae  = -cross_val_score(new_model, X.values, y, scoring = "neg_mean_absolute_error", cv = kf)
    r2 = cross_val_score(new_model, X.values, y, scoring = "r2", cv = kf)
    
    end = time.time()
    exec_time = end - start
    scores = pd.Series([exec_index, name, new_model, cv_number, 'None',
                      round(mae.mean(), 4),  round(mae.std(), 4), 
                      round(rmse.mean(), 4), round(rmse.std(), 4), 
                      round(r2.mean(), 4),   round(r2.std(), 4), 
                      round(exec_time, 4)],  index=cols_result_df)
    
    #print('MAE:', round(mae.mean(), 4), 'RMSE:',  round(rmse.mean(), 4), 'Time:', exec_time, '\n')
    return scores

In [16]:
def train_model(name, new_model, domain_group):
    df = pd.DataFrame(columns = cols_result_df)
    for exec_index in range(executions):
        print('|_ Training', name, 'Execution', exec_index, '...')
        result = rmsle_cv(exec_index, name, new_model, domain_group)
        df = pd.concat([df, result.to_frame().T], ignore_index=True)
    return df

In [17]:
rscv_phy = RandomizedSearchCV(estimator = base_regressor, param_distributions = random_grid, n_iter = iterations, cv = cv_number, verbose=1, random_state=42, n_jobs = -1)
rscv_phy.fit(train_features_phy, train_labels_phy)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


40 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/opt/conda/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [18]:
rscv_psy = RandomizedSearchCV(estimator = base_regressor, param_distributions = random_grid, n_iter = iterations, cv = cv_number, verbose=1, random_state=42, n_jobs = -1)
rscv_psy.fit(train_features_psy, train_labels_psy)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


40 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/opt/conda/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [19]:
print('Best hyperparameters for physical dataset')
rscv_phy.best_params_

Best hyperparameters for physical dataset


{'warm_start': False,
 'n_estimators': 894,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 1.0,
 'max_depth': 10,
 'criterion': 'squared_error',
 'bootstrap': True}

In [20]:
print('Best hyperparameters for psychological dataset')
rscv_psy.best_params_

Best hyperparameters for psychological dataset


{'warm_start': False,
 'n_estimators': 1778,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'criterion': 'poisson',
 'bootstrap': False}

In [21]:
df_phy = train_model('RandomForest RSCV', rscv_phy.best_estimator_, "phy_all")
check_improvement_v2(df_phy, "phy_all")

|_ Training RandomForest RSCV Execution 0 ...
|_ Training RandomForest RSCV Execution 1 ...
|_ Training RandomForest RSCV Execution 2 ...
|_ Training RandomForest RSCV Execution 3 ...
|_ Training RandomForest RSCV Execution 4 ...
|_ Training RandomForest RSCV Execution 5 ...
|_ Training RandomForest RSCV Execution 6 ...
|_ Training RandomForest RSCV Execution 7 ...
|_ Training RandomForest RSCV Execution 8 ...
|_ Training RandomForest RSCV Execution 9 ...
|_ Training RandomForest RSCV Execution 10 ...
|_ Training RandomForest RSCV Execution 11 ...
|_ Training RandomForest RSCV Execution 12 ...
|_ Training RandomForest RSCV Execution 13 ...
|_ Training RandomForest RSCV Execution 14 ...
|_ Training RandomForest RSCV Execution 15 ...
|_ Training RandomForest RSCV Execution 16 ...
|_ Training RandomForest RSCV Execution 17 ...
|_ Training RandomForest RSCV Execution 18 ...
|_ Training RandomForest RSCV Execution 19 ...
|_ Training RandomForest RSCV Execution 20 ...
|_ Training RandomFores

In [22]:
df_psy = train_model('RandomForest RSCV', rscv_psy.best_estimator_, "psy_all")
check_improvement_v2(df_psy, "psy_all")

|_ Training RandomForest RSCV Execution 0 ...
|_ Training RandomForest RSCV Execution 1 ...
|_ Training RandomForest RSCV Execution 2 ...
|_ Training RandomForest RSCV Execution 3 ...
|_ Training RandomForest RSCV Execution 4 ...
|_ Training RandomForest RSCV Execution 5 ...
|_ Training RandomForest RSCV Execution 6 ...
|_ Training RandomForest RSCV Execution 7 ...
|_ Training RandomForest RSCV Execution 8 ...
|_ Training RandomForest RSCV Execution 9 ...
|_ Training RandomForest RSCV Execution 10 ...
|_ Training RandomForest RSCV Execution 11 ...
|_ Training RandomForest RSCV Execution 12 ...
|_ Training RandomForest RSCV Execution 13 ...
|_ Training RandomForest RSCV Execution 14 ...
|_ Training RandomForest RSCV Execution 15 ...
|_ Training RandomForest RSCV Execution 16 ...
|_ Training RandomForest RSCV Execution 17 ...
|_ Training RandomForest RSCV Execution 18 ...
|_ Training RandomForest RSCV Execution 19 ...
|_ Training RandomForest RSCV Execution 20 ...
|_ Training RandomFores

In [23]:
df_phy.to_csv('20230711-randomforest-rscv-physical.csv', index = False)
df_psy.to_csv('20230711-randomforest-rscv-psychological.csv', index = False)