## 하이퍼파라미터 튜닝

### 라이브러리 준비

In [3]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.7.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.7.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.7.0 scikit-optimize-0.10.2


In [1]:
# 필요한 라이브러리 임포트
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# 샘플 데이터 생성
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)

# 훈련 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 랜덤 포레스트 모델 초기화
rf = RandomForestClassifier(random_state=42)

# 간단한 모델 훈련 및 평가
rf.fit(X_train, y_train)
print(f"기본 모델 정확도: {rf.score(X_test, y_test):.4f}")


기본 모델 정확도: 0.9000


### 1. 그리드 서치

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# 데이터 생성
X, y = make_classification(n_samples=1000, n_features=20, n_informative=2, n_redundant=10, random_state=42)

# 모델 및 파라미터 그리드 정의
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Grid Search 수행
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
Best cross-validation score: 0.9280000000000002


### 2. 랜덤 서치

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# 랜덤 탐색을 위한 파라미터 분포 정의
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None] + list(randint(5, 50).rvs(10)),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
}

# Random Search 수행
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X, y)

print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 152}
Best cross-validation score: 0.9190000000000002


### 3. 베이지안 최적화

In [4]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# 탐색 공간 정의
search_spaces = {
    'n_estimators': Integer(100, 500),
    'max_depth': Categorical([None] + list(range(5, 30))),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 10),
}

# Bayesian Optimization 수행
bayes_search = BayesSearchCV(rf, search_spaces, n_iter=50, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
bayes_search.fit(X, y)

print("Best parameters:", bayes_search.best_params_)
print("Best cross-validation score:", bayes_search.best_score_)




Best parameters: OrderedDict({'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500})
Best cross-validation score: 0.917


### 4. 교차 검증

In [8]:
from sklearn.model_selection import cross_val_score

# 최적의 하이퍼파라미터로 모델 생성
best_rf = RandomForestClassifier(**bayes_search.best_params_, random_state=42)

# 교차 검증 수행
cv_scores = cross_val_score(best_rf, X, y, cv=5, scoring='accuracy')

print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Standard deviation of CV scores:", cv_scores.std())

Cross-validation scores: [0.955 0.925 0.905 0.915 0.92 ]
Mean CV score: 0.924
Standard deviation of CV scores: 0.016852299546352693
