# Studi Kasus Hyperparameter Tuning

## Latihan Regresi

### Import libraries

In [10]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
from skopt import BayesSearchCV
import time
import numpy as np

### Dataset fetching 

In [2]:
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Shape of training data: ", X_train.shape)
print("Shape of test data: ", X_test.shape)
# a = y_train 
# print(a)

Shape of training data:  (14448, 8)
Shape of test data:  (6192, 8)


### Model training 

In [3]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
initial_mse = mean_squared_error(y_test, y_pred)
print(f"Initial MSE on test set (without tuning): {initial_mse:.2f}")

Initial MSE on test set (without tuning): 0.26


### Tuning

#### Grid Search

In [15]:
start_time = time.time()

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth' : [10, 20, 30],
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4],
    'bootstrap' : [True, False]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)

grid_search.fit(X_train, y_train)

print(f"Best parameters (Grid Search): {grid_search.best_params_}")
best_rf_grid = grid_search.best_estimator_


y_pred_grid = best_rf_grid.predict(X_test)
grid_search_mse = mean_squared_error(y_test, y_pred_grid)
print(f"MSE after Grid Search: {grid_search_mse:.2f}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Waktu eksekusi: {execution_time:.4f} detik")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  12.2s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  10.9s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  10.8s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  16.3s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  16.2s
[

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters (Grid Search): {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
MSE after Grid Search: 0.25
Waktu eksekusi: 8083.5414 detik


Best parameters (Grid Search): {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
MSE after Grid Search: 0.25
Waktu eksekusi: 8083.5414 detik

#### Randomized Search

In [8]:
start_time = time.time()

param_dist = {
    'n_estimators': np.arange(100, 500, 100),
    'max_depth': [None] + list(np.arange(10, 50, 10)),
    'min_samples_split': np.arange(2,11,2),
    'min_samples_leaf': np.arange(1,5),
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=5, cv=3, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

print(f"Best parameters (Random Search): {random_search.best_params_}")

best_rf_random = random_search.best_estimator_

y_pred_random = best_rf_random.predict(X_test)
random_search_mse = mean_squared_error(y_test, y_pred_random)
print(f"MSE after Random Search: {random_search_mse:.2f}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Waktu eksekusi: {execution_time:.4f} detik")

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=  22.5s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=  23.5s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=  23.7s
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=2, min_samples_split=6, n_estimators=300; total time=  43.8s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=400; total time= 1.4min
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=2, min_samples_split=6, n_estimators=300; total time=  39.4s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=400; total time= 1.4min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=400; total time= 1.5min
[CV] END bootstrap=True, max_depth=30, min_samples_leaf

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters (Random Search): {'n_estimators': np.int64(300), 'min_samples_split': np.int64(6), 'min_samples_leaf': np.int64(2), 'max_depth': np.int64(30), 'bootstrap': True}
MSE after Random Search: 0.25
Waktu eksekusi: 209.5697 detik


#### Bayesian Optimization

In [4]:
start_time = time.time()

param_space = {
    'n_estimators': (100, 500),
    'max_depth': (10, 50),
    'min_samples_split': (2,10),
    'min_samples_leaf': (1,4),
    'bootstrap': [True, False]
}

bayes_search = BayesSearchCV(estimator=rf, search_spaces=param_space, n_iter=32, cv=3, n_jobs=1, verbose=2, random_state=42)
bayes_search.fit(X_train, y_train)

print(f"Best parameters (Bayesian Optimization): {bayes_search.best_params_}")
best_rf_bayes = bayes_search.best_estimator_

y_pred_bayes = best_rf_bayes.predict(X_test)
bayes_mse = mean_squared_error(y_test, y_pred_bayes)
print(f"MSE after Bayesian Optimization: {bayes_mse:.2f}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Waktu eksekusi: {execution_time:.4f} detik")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END bootstrap=True, max_depth=39, min_samples_leaf=4, min_samples_split=5, n_estimators=368; total time=  24.0s
[CV] END bootstrap=True, max_depth=39, min_samples_leaf=4, min_samples_split=5, n_estimators=368; total time=  23.7s
[CV] END bootstrap=True, max_depth=39, min_samples_leaf=4, min_samples_split=5, n_estimators=368; total time=  23.7s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END bootstrap=False, max_depth=45, min_samples_leaf=2, min_samples_split=10, n_estimators=446; total time=  48.9s
[CV] END bootstrap=False, max_depth=45, min_samples_leaf=2, min_samples_split=10, n_estimators=446; total time=  47.0s
[CV] END bootstrap=False, max_depth=45, min_samples_leaf=2, min_samples_split=10, n_estimators=446; total time=  47.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END bootstrap=True, max_depth=47, min_samples_leaf=1, min_samples_split=5, n_estimators=175; total time=  13.8

## Latihan Klasifikasi

### Data load and Data Preprocessing

In [9]:
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X, y = fetch_openml(name='credit-g', version=1, return_X_y=True, as_frame=True)
# print(X)
# print(y)
le = LabelEncoder()
y = le.fit_transform(y)

X_encoded = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

print("Shape of training data: ", X_train.shape)
print("Shape of testing data: ", X_test.shape)


Shape of training data:  (700, 48)
Shape of testing data:  (300, 48)


### Model training

In [12]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

initial_score = rf.score(X_test, y_test)
print(f"Initial accuracy on test set (without tuning): {initial_score:.2f}")

Initial accuracy on test set (without tuning): 0.76


### Parameter Tuning

#### Grid Search

In [13]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Best parameters (Grid Search): {grid_search.best_params_}")
best_rf_grid = grid_search.best_estimator_

grid_search_score = best_rf_grid.score(X_test, y_test)
print(f"Accuracy after Grid Search: {grid_search_score:.2f}")

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=300; total time=   0.5s
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=300; total time=   0.5s
[CV] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=300; total time=   0.5s
[CV] END criterion=gini, max_depth=10, min_samples_split=5, n_est

#### Random Search

In [14]:
param_dist = {
    'n_estimators': np.linspace(100, 500, 5, dtype=int),
    'max_depth': np.linspace(10,50,5, dtype=int),
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=20, cv=3, n_jobs=1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

print(f"Best parameters (Random Search): {random_search.best_params_}")
best_rf_random = random_search.best_estimator_
 
random_search_score = best_rf_random.score(X_test, y_test)
print(f"Accuracy after Random Search: {random_search_score:.2f}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END criterion=gini, max_depth=50, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=50, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=50, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=20, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=20, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=20, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END criterion=entropy, max_depth=30, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END criterion=entropy, max_depth=30, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END criterion=entropy, max_depth=30, min_samples_split=10, n_estimators=400; total time=   0.9s
[CV] END criterion=entropy, max_depth=10, min_sampl

In [15]:
param_space = {
    'n_estimators': (100, 500),
    'max_depth': (10, 50),
    'min_samples_split': (2, 10),
    'criterion': ['gini', 'entropy']
}

bayes_search = BayesSearchCV(estimator=rf, search_spaces=param_space, n_iter=32, cv=3, n_jobs=1, verbose=2, random_state=42)
bayes_search.fit(X_train, y_train)

print(f"Best parameters (Bayesian Optimization): {bayes_search.best_params_}")
best_rf_bayes = bayes_search.best_estimator_
 
bayes_search_score = best_rf_bayes.score(X_test, y_test)
print(f"Accuracy after Bayesian Optimization: {bayes_search_score:.2f}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END criterion=gini, max_depth=39, min_samples_split=9, n_estimators=226; total time=   0.4s
[CV] END criterion=gini, max_depth=39, min_samples_split=9, n_estimators=226; total time=   0.5s
[CV] END criterion=gini, max_depth=39, min_samples_split=9, n_estimators=226; total time=   0.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END criterion=entropy, max_depth=45, min_samples_split=4, n_estimators=480; total time=   0.9s
[CV] END criterion=entropy, max_depth=45, min_samples_split=4, n_estimators=480; total time=   1.0s
[CV] END criterion=entropy, max_depth=45, min_samples_split=4, n_estimators=480; total time=   0.9s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END criterion=gini, max_depth=47, min_samples_split=3, n_estimators=273; total time=   0.5s
[CV] END criterion=gini, max_depth=47, min_samples_split=3, n_estimators=273; total time=   0.5s
[CV] END criterion=gini, max_depth=47,