# 07. 랜덤 포레스트 (Random Forest)

## 학습 목표
- 앙상블 학습과 배깅 이해
- 랜덤 포레스트 작동 원리
- 하이퍼파라미터 튜닝

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import load_iris, load_wine, fetch_california_housing
import seaborn as sns

# Use DejaVu Sans as the font family for every figure drawn below.
plt.rcParams.update({'font.family': 'DejaVu Sans'})

## 1. 랜덤 포레스트 분류

In [None]:
# Load the Wine dataset and pull out the feature matrix and the class labels.
wine = load_wine()
X = wine.data
y = wine.target

# Quick look at the feature names, class names, and data dimensions.
print(f"Features: {wine.feature_names}")
print(f"Classes: {wine.target_names}")
print(f"Shape: {X.shape}")

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Random forest classifier: 100 trees, depth capped at 5, fixed seed, all CPU cores.
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42, n_jobs=-1)
rf_clf.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
y_pred = rf_clf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"\nClassification Report:")
# Per-class report, labelled with the dataset's class names.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=wine.target_names,
    )
)

## 2. 트리 개수에 따른 성능

In [None]:
# Sweep the number of trees and record train, test, and out-of-bag accuracy.
# NOTE(review): with very few trees some samples may never be out-of-bag, so the
# OOB values at the low end of this sweep may be unreliable — confirm against
# the scikit-learn warnings emitted when this cell runs.
n_trees = [1, 5, 10, 20, 50, 100, 200, 500]
train_scores, test_scores, oob_scores = [], [], []

for n in n_trees:
    rf = RandomForestClassifier(
        n_estimators=n,
        random_state=42,
        oob_score=True,
    )
    rf.fit(X_train, y_train)

    oob_scores.append(rf.oob_score_)
    test_scores.append(rf.score(X_test, y_test))
    train_scores.append(rf.score(X_train, y_train))

# One curve per score type, drawn in the same order as before (train, test, OOB).
plt.figure(figsize=(10, 6))
for curve, fmt, name in (
    (train_scores, 'b-o', 'Train Score'),
    (test_scores, 'r-o', 'Test Score'),
    (oob_scores, 'g-o', 'OOB Score'),
):
    plt.plot(n_trees, curve, fmt, label=name)

plt.title('Random Forest: Performance vs Number of Trees')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
# Tree counts span 1 to 500, so a log axis spreads the points out.
plt.xscale('log')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

## 3. 특성 중요도

In [None]:
# Feature importances of the fitted classifier, one row per Wine feature.
importance = pd.DataFrame(
    {
        'Feature': wine.feature_names,
        'Importance': rf_clf.feature_importances_,
    }
)
# Ascending sort, so the largest importance is the last bar drawn by barh.
importance = importance.sort_values(by='Importance')

plt.figure(figsize=(12, 8))
plt.barh(importance['Feature'], importance['Importance'])
plt.title('Random Forest Feature Importance - Wine Dataset')
plt.xlabel('Feature Importance')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 4. 하이퍼파라미터 튜닝

In [None]:
# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt', 'log2', None],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored by accuracy.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination, its cross-validation score, and its test-set score.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")
print(f"Test Score: {grid_search.score(X_test, y_test):.4f}")

## 5. 결정 트리 vs 랜덤 포레스트 비교

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Compare a single decision tree against a random forest at the same depth limit.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# 10-fold cross-validation on the full Wine data for each model.
dt_scores = cross_val_score(dt, X, y, cv=10)
rf_scores = cross_val_score(rf, X, y, cv=10)

# Mean score with a +/- two-standard-deviation spread across the folds.
print(f"Decision Tree: {dt_scores.mean():.4f} (+/- {dt_scores.std()*2:.4f})")
print(f"Random Forest: {rf_scores.mean():.4f} (+/- {rf_scores.std()*2:.4f})")

# Box plot of the per-fold scores.
plt.figure(figsize=(8, 6))
plt.boxplot([dt_scores, rf_scores])
# Tick labels are set with xticks rather than boxplot's `labels` keyword, which
# is deprecated in recent Matplotlib releases; boxes sit at x = 1, 2 by default.
plt.xticks([1, 2], ['Decision Tree', 'Random Forest'])
plt.ylabel('Accuracy')
plt.title('Model Comparison: Decision Tree vs Random Forest')
plt.grid(True, alpha=0.3)
plt.show()

## 6. 랜덤 포레스트 회귀

In [None]:
# Regression metrics used below. mean_squared_error is needed for the RMSE line
# and is not imported anywhere else in this notebook.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# California Housing data: feature matrix and regression target.
housing = fetch_california_housing()
X_h, y_h = housing.data, housing.target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_h, y_h, test_size=0.2, random_state=42
)

# Random forest regressor: 100 trees, depth capped at 10, all CPU cores.
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_reg.fit(X_train_h, y_train_h)

y_pred_h = rf_reg.predict(X_test_h)

print(f"R² Score: {r2_score(y_test_h, y_pred_h):.4f}")
# RMSE is computed as the square root of the mean squared error.
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}")
print(f"MAE: {mean_absolute_error(y_test_h, y_pred_h):.4f}")

In [None]:
# Feature importances of the fitted regressor, one row per housing feature.
importance_reg = pd.DataFrame(
    {
        'Feature': housing.feature_names,
        'Importance': rf_reg.feature_importances_,
    }
)
# Ascending sort, so the largest importance is the last bar drawn by barh.
importance_reg = importance_reg.sort_values(by='Importance')

plt.figure(figsize=(10, 6))
plt.barh(importance_reg['Feature'], importance_reg['Importance'])
plt.title('Random Forest Regressor Feature Importance')
plt.xlabel('Feature Importance')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 정리

### 핵심 개념
- **배깅 (Bagging)**: Bootstrap Aggregating. 부트스트랩 샘플마다 모델을 따로 학습시킨 뒤, 예측을 평균(회귀)하거나 투표(분류)로 결합
- **랜덤 특성 선택**: 각 분할에서 일부 특성만 고려
- **OOB (Out-of-Bag) Score**: 각 트리의 부트스트랩 샘플에 포함되지 않은 샘플로 평가 (별도 검증 세트 없이 일반화 성능을 추정)

### 주요 하이퍼파라미터
- `n_estimators`: 트리 개수 (많을수록 성능이 안정되지만, 일정 개수 이후에는 개선 폭이 줄고 학습 시간만 늘어남)
- `max_depth`: 트리 깊이 (과적합 방지)
- `max_features`: 분할 시 고려할 특성 수
- `min_samples_split`: 분할을 위한 최소 샘플 수

### 다음 단계
- Gradient Boosting (XGBoost, LightGBM)