# 06. 결정 트리 (Decision Tree)

## 학습 목표
- 결정 트리의 작동 원리 이해
- 정보 이득, 지니 불순도 개념 이해
- 과적합 방지 (가지치기)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.datasets import load_iris, make_classification
import seaborn as sns

# Use DejaVu Sans as the font family for all figure text.
plt.rc('font', family='DejaVu Sans')

## 1. 결정 트리 분류

In [None]:
# Load the Iris dataset and pull out the feature matrix and label vector.
iris = load_iris()
X = iris.data
y = iris.target

# Keep only columns 2 and 3 (petal length, petal width) so the model can be
# drawn on a 2-D plane.
X_2d = X[:, [2, 3]]

# Hold out 30% of the samples for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_2d,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Why: a decision tree is grown greedily, one locally best split at a time,
# and an unconstrained tree can keep splitting until it memorizes the
# training set. Capping the depth at 3 is the regularization used here.
tree_clf = DecisionTreeClassifier(random_state=42, max_depth=3).fit(
    X_train, y_train
)

# Evaluate on the held-out split: overall accuracy, then per-class metrics.
y_pred = tree_clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred):.4f}")
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=iris.target_names,
    )
)

## 2. 트리 시각화

In [None]:
# Visualize the fitted decision tree: each node shows its split rule, and
# filled=True shades nodes by their majority class.
plt.figure(figsize=(20, 10))
plot_tree(tree_clf, 
          feature_names=['petal length', 'petal width'],
          # Pass a plain list of str: iris.target_names is a NumPy array, and
          # some scikit-learn releases reject non-list class_names during
          # parameter validation.
          class_names=iris.target_names.tolist(),
          filled=True,
          rounded=True,
          fontsize=12)
plt.title('Decision Tree - Iris Dataset')
plt.tight_layout()
plt.show()

In [None]:
# Decision-boundary visualization
def plot_decision_boundary_tree(model, X, y, feature_names, class_names):
    """Plot a fitted classifier's decision regions over two features.

    The model is evaluated on a 200x200 grid spanning the data range (padded
    by 0.5 on every side); the predicted regions are drawn as a translucent
    background and the samples in ``X`` are scattered on top, one colour per
    class. Background regions and sample markers share the same colour for a
    given class.

    Args:
        model: Fitted estimator whose ``predict`` accepts an ``(n, 2)`` array
            and returns integer labels ``0 .. len(class_names) - 1``.
        X: 2-D NumPy array; column 0 is plotted on the x-axis and column 1 on
            the y-axis.
        y: NumPy array of integer labels aligned with the rows of ``X``.
        feature_names: Sequence whose first two items label the x- and y-axis.
        class_names: Display name for each class, indexed by integer label.
            Must contain at least one entry and cover every label the model
            can predict; regions for labels outside this range are left
            unfilled.
    """
    # The first three entries keep the original blue/green/red scheme; the
    # remaining ones (reused cyclically) let additional classes be drawn
    # instead of being silently dropped.
    palette = ['blue', 'green', 'red', 'tab:orange', 'tab:purple',
               'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
    n_classes = len(class_names)
    colors = [palette[i % len(palette)] for i in range(n_classes)]

    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                         np.linspace(y_min, y_max, 200))

    # Predict a label for every grid point, then fold back to the grid shape.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    # One filled band per integer label (edges at -0.5, 0.5, 1.5, ...), using
    # the same colour list as the scatter below so each region matches the
    # colour of the class it predicts.
    plt.contourf(xx, yy, Z, levels=np.arange(n_classes + 1) - 0.5,
                 colors=colors, alpha=0.3)

    for i, (color, name) in enumerate(zip(colors, class_names)):
        idx = y == i
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=name,
                    edgecolors='black', alpha=0.7)

    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[1])
    plt.title('Decision Tree Decision Boundary')
    plt.legend()
    plt.show()

# Draw the depth-3 classifier's decision regions over the full two-feature
# dataset (train and test samples together).
plot_decision_boundary_tree(
    model=tree_clf,
    X=X_2d,
    y=y,
    feature_names=['petal length', 'petal width'],
    class_names=iris.target_names,
)

## 3. 과적합 분석

In [None]:
# Why: sweeping max_depth makes the bias-variance tradeoff visible. Training
# accuracy climbs toward 1.0 as the tree is allowed to grow, while held-out
# accuracy typically levels off or drops once the tree starts to overfit.
# The depth at which the test curve is highest is the one to prefer.
depths = range(1, 20)
train_scores = []
test_scores = []

for depth in depths:
    # Same seed for every depth so only max_depth varies between fits.
    tree = DecisionTreeClassifier(random_state=42, max_depth=depth).fit(
        X_train, y_train
    )
    train_scores.append(tree.score(X_train, y_train))
    test_scores.append(tree.score(X_test, y_test))

# Plot both accuracy curves against depth on a single axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(depths, train_scores, 'b-o', label='Train Score')
ax.plot(depths, test_scores, 'r-o', label='Test Score')
ax.set(
    xlabel='Max Depth',
    ylabel='Accuracy',
    title='Decision Tree: Train vs Test Score by Depth',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 4. 특성 중요도

In [None]:
# Train a model on all four Iris features (same 70/30 split settings and seed
# as the two-feature split).
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

tree_full = DecisionTreeClassifier(random_state=42, max_depth=4)
tree_full.fit(X_train_full, y_train_full)

# Pair each feature name with its importance and sort ascending, so the most
# important feature ends up as the top bar of the horizontal chart.
importance = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': tree_full.feature_importances_,
})
importance = importance.sort_values(by='Importance')

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance['Feature'], importance['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_title('Decision Tree Feature Importance')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 5. 하이퍼파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Why: max_depth is not the only pruning control. min_samples_split blocks
# splits on very small subsets and min_samples_leaf keeps every leaf large
# enough for a reliable prediction. Searching all three jointly with
# cross-validation picks the best combination of these regularizers.
param_grid = dict(
    max_depth=list(range(2, 9)),
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
)

# 5-fold CV over the full grid, scored by accuracy; training scores are also
# recorded in cv_results_.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train_full, y_train_full)

# Report the winning combination, its mean CV accuracy, and its accuracy on
# the held-out test split.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")
print(f"Test Score: {grid_search.score(X_test_full, y_test_full):.4f}")

## 6. 결정 트리 회귀

In [None]:
# Build a synthetic regression set: 200 sorted x values in [0, 5) and a sine
# target with Gaussian noise (std 0.1). The seed fixes both random draws.
np.random.seed(42)
X_reg = np.sort(5 * np.random.rand(200, 1), axis=0)
y_reg = np.sin(X_reg[:, 0]) + 0.1 * np.random.randn(200)

# Fit a depth-4 regression tree.
tree_reg = DecisionTreeRegressor(random_state=42, max_depth=4)
tree_reg.fit(X_reg, y_reg)

# Predict on a dense, evenly spaced grid so the piecewise-constant shape of
# the tree's output is visible.
X_test_reg = np.linspace(0, 5, 500)[:, np.newaxis]
y_pred_reg = tree_reg.predict(X_test_reg)

fig, ax = plt.subplots(figsize=(12, 5))
ax.scatter(X_reg, y_reg, alpha=0.5, label='Data')
ax.plot(X_test_reg, y_pred_reg, 'r-', linewidth=2, label='Prediction')
ax.set(xlabel='X', ylabel='y', title='Decision Tree Regression')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 정리

### 핵심 개념
- **분할 기준**: 정보 이득 (엔트로피) 또는 지니 불순도
- **가지치기**: 사전 가지치기 하이퍼파라미터 (max_depth, min_samples_split, min_samples_leaf); 사후 가지치기는 ccp_alpha로 조절
- **장점**: 해석 용이, 특성 스케일링 불필요 (범주형 인코딩 등 일부 전처리는 여전히 필요할 수 있음)
- **단점**: 과적합 경향, 작은 변화에 민감

### 다음 단계
- 앙상블 학습 (Random Forest, Gradient Boosting)
- 교차 검증을 통한 하이퍼파라미터 최적화