# Grid Search & Random Search
- sci-kit learn 활용 랜덤포레스트 하이퍼파라미터 튜닝 실습

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import warnings 
warnings.simplefilter('ignore')

In [2]:
# Boston 데이터셋 준비

data = load_boston()
X = data.data
y = data.target

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, random_state=42)

print(f"X_train shape: {X_train.shape}, X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}, y_valid shape: {y_valid.shape}")

X_train shape: (430, 13), X_valid shape: (76, 13)
y_train shape: (430,), y_valid shape: (76,)


In [3]:
# 하이퍼파라미터 튜닝을 하지 않은 모델 성능 확인 (기본값)

from sklearn.model_selection import GridSearchCV

model = RandomForestRegressor(random_state=42, n_jobs=-1)

params_grid = {}

grid_model = GridSearchCV(model, param_grid=params_grid, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_model.fit(X_train, y_train)

print('best parameters : ', grid_model.best_params_)
print('best score : ', grid_model.best_score_)

em = grid_model.best_estimator_
pred = em.predict(X_valid)

mean_squared_error(y_valid, pred)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
best parameters :  {}
best score :  -13.683935255813953


9.108397644736856

In [4]:
# GridSearchCV를 이용한 하이퍼파라미터 튜닝 진행

from sklearn.model_selection import GridSearchCV

model = RandomForestRegressor(random_state=42, n_jobs=-1)

params_grid = {
    'max_depth': [20, 30],
    'max_features': ['sqrt', 'auto'],
    'min_samples_split': [2, 3],
    'n_estimators': [100, 200],

    ###### TODO #####
    # 'hyper_parameter_0': [...],
    # 'hyper_parameter_1': [...],
    #################
    
}

grid_model = GridSearchCV(model, param_grid=params_grid, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_model.fit(X_train, y_train)

print('best parameters : ', grid_model.best_params_)
print('best score : ', grid_model.best_score_)

em = grid_model.best_estimator_
pred = em.predict(X_valid)

mean_squared_error(y_valid, pred)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
best parameters :  {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
best score :  -12.288394558139522


4.60246228947369

In [5]:
# RandomSearchCV를 이용한 하이퍼파라미터 튜닝 진행

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

model = RandomForestRegressor(random_state=42)

params_dist = {
    'max_depth': randint(0, 32),
    'max_features': ['sqrt', 'auto'],
    'min_samples_split': randint(0, 20),
    'n_estimators': randint(50, 300),

    ###### TODO #####
    # 'hyper_parameter_0': random_range,
    # 'hyper_parameter_1': random_range,
    #################
}

rm_model = RandomizedSearchCV(model, params_dist, verbose=1, random_state=42, scoring='neg_mean_squared_error', n_jobs=-1, n_iter=10)
rm_model.fit(X_train, y_train)

print('best parameters : ', rm_model.best_params_)
print('best score : ', rm_model.best_score_)

em = rm_model.best_estimator_
em = rm_model.best_estimator_
pred = em.predict(X_valid)
print(mean_squared_error(y_valid, pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
best parameters :  {'max_depth': 23, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 153}
best score :  -12.976966884432539
4.1196958110748225
