# 11. 클러스터링 (Clustering)

## 학습 목표
- K-Means 클러스터링 이해
- DBSCAN 알고리즘
- 계층적 클러스터링
- 클러스터 평가 지표

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.datasets import make_blobs, make_moons
import seaborn as sns

# Use DejaVu Sans as the font family for all text drawn in the figures below.
plt.rcParams.update({'font.family': 'DejaVu Sans'})

## 1. K-Means 클러스터링

In [None]:
# Generate synthetic data: 300 samples around 4 blob centers.
# random_state is fixed so the same points are produced on every run.
X, y_true = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.6,
    random_state=42,
)

# Show the raw points without using y_true for coloring, i.e. the way an
# unsupervised algorithm receives them.
raw_fig, raw_ax = plt.subplots(figsize=(10, 6))
raw_ax.scatter(X[:, 0], X[:, 1], alpha=0.6, edgecolors='black')
raw_ax.set_title('Original Data (Unlabeled)')
raw_ax.set_xlabel('Feature 1')
raw_ax.set_ylabel('Feature 2')
raw_ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Fit K-Means with K=4 and get a cluster label for every sample.
# Why: n_init=10 runs the algorithm 10 times with different centroid seeds and
# keeps the best result, which mitigates K-Means' sensitivity to the initial
# centroid placement.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
y_kmeans = kmeans.fit_predict(X)

# Visualize the result: samples colored by assigned cluster, with the learned
# centroids drawn on top as red X markers.
centroids = kmeans.cluster_centers_
km_fig, km_ax = plt.subplots(figsize=(10, 6))
scatter = km_ax.scatter(
    X[:, 0], X[:, 1],
    c=y_kmeans, cmap='viridis', alpha=0.6, edgecolors='black',
)
km_ax.scatter(
    centroids[:, 0], centroids[:, 1],
    c='red', marker='X', s=200, label='Centroids',
)
km_ax.set_title('K-Means Clustering (K=4)')
km_ax.set_xlabel('Feature 1')
km_ax.set_ylabel('Feature 2')
km_fig.colorbar(scatter, ax=km_ax, label='Cluster')
km_ax.legend()
km_ax.grid(True, alpha=0.3)
plt.show()

# Report the inertia of the fitted model and the silhouette score of the
# resulting labeling.
print(f"Inertia: {kmeans.inertia_:.2f}")
print(f"Silhouette Score: {silhouette_score(X, y_kmeans):.4f}")

## 2. 최적의 K 찾기 (Elbow Method)

In [None]:
# Elbow Method
# Why: We sweep K=2..10 and look for the "elbow" in inertia where adding more
# clusters gives diminishing returns — the elbow indicates the natural cluster
# count. The silhouette score is recorded for the same K values as a second view.
K_range = range(2, 11)
inertias = []
silhouettes = []

for k in K_range:
    # fit() returns the estimator itself, so construction and fitting chain.
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(X, km.labels_))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (y-values, line format, y-axis label, title).
# Left panel is the elbow plot, right panel is silhouette score vs K.
panels = (
    (inertias, 'b-o', 'Inertia', 'Elbow Method'),
    (silhouettes, 'g-o', 'Silhouette Score', 'Silhouette Score vs K'),
)
for ax, (values, line_format, y_label, title) in zip(axes, panels):
    ax.plot(K_range, values, line_format)
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. DBSCAN

In [None]:
# Generate non-spherical data (the "moons" dataset), with a fixed seed.
X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)

# Compare K-Means and DBSCAN side by side on the same data.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Original data
axes[0].scatter(X_moons[:, 0], X_moons[:, 1], alpha=0.6, edgecolors='black')
axes[0].set_title('Original Data (Moons)')
axes[0].grid(True, alpha=0.3)

# K-Means
km_moons = KMeans(n_clusters=2, random_state=42, n_init=10)
y_km_moons = km_moons.fit_predict(X_moons)
axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=y_km_moons, cmap='viridis',
                alpha=0.6, edgecolors='black')
axes[1].set_title('K-Means (K=2)')
axes[1].grid(True, alpha=0.3)

# Why: DBSCAN groups points by density rather than distance to centroids, so it can
# discover arbitrarily-shaped clusters (like these crescents) that K-Means cannot.
# eps=0.2 defines the neighborhood radius; min_samples=5 sets the core-point threshold.
dbscan = DBSCAN(eps=0.2, min_samples=5)
y_dbscan = dbscan.fit_predict(X_moons)
axes[2].scatter(X_moons[:, 0], X_moons[:, 1], c=y_dbscan, cmap='viridis',
                alpha=0.6, edgecolors='black')
axes[2].set_title('DBSCAN (eps=0.2, min_samples=5)')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# DBSCAN marks noise points with the label -1. Noise is not a cluster, so it is
# excluded both from the cluster count and from the silhouette computation;
# otherwise the scattered noise points would be scored as if they formed one
# cluster and distort the result.
noise_mask = y_dbscan == -1
clustered = ~noise_mask
n_noise = int(noise_mask.sum())
n_clusters_dbscan = len(set(y_dbscan[clustered]))

print(f"K-Means Silhouette: {silhouette_score(X_moons, y_km_moons):.4f}")
# silhouette_score raises ValueError unless there are at least 2 distinct
# labels, which can happen with DBSCAN when eps/min_samples are poorly chosen.
if n_clusters_dbscan >= 2:
    dbscan_silhouette = silhouette_score(X_moons[clustered], y_dbscan[clustered])
    print(f"DBSCAN Silhouette: {dbscan_silhouette:.4f}")
else:
    print("DBSCAN Silhouette: undefined (fewer than 2 clusters found)")
print(f"DBSCAN found {n_clusters_dbscan} clusters")
print(f"DBSCAN noise points: {n_noise}")

## 4. 계층적 클러스터링

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Dendrogram
# Only the first 50 samples are used, to keep the visualization manageable.
X_small = X[:50]
# Why: Ward linkage minimizes within-cluster variance at each merge, producing
# compact, spherical clusters — it is the most commonly used linkage for
# Euclidean data.
linkage_matrix = linkage(X_small, method='ward')

plt.figure(figsize=(15, 7))
dendrogram(linkage_matrix)
# The dendrogram was drawn on the current axes; label those same axes.
plt.gca().set(
    title='Hierarchical Clustering Dendrogram',
    xlabel='Sample Index',
    ylabel='Distance',
)
plt.show()

In [None]:
# Agglomerative clustering of the full blob dataset into 4 clusters.
agg = AgglomerativeClustering(n_clusters=4)
y_agg = agg.fit_predict(X)

# Scatter plot of the samples, colored by their agglomerative cluster label.
agg_fig, agg_ax = plt.subplots(figsize=(10, 6))
agg_ax.scatter(
    X[:, 0], X[:, 1],
    c=y_agg, cmap='viridis', alpha=0.6, edgecolors='black',
)
agg_ax.set_title('Agglomerative Clustering')
agg_ax.set_xlabel('Feature 1')
agg_ax.set_ylabel('Feature 2')
agg_ax.grid(True, alpha=0.3)
plt.show()

print(f"Silhouette Score: {silhouette_score(X, y_agg):.4f}")

## 5. 실제 데이터 예제

In [None]:
from sklearn.datasets import load_iris

# Cluster the Iris dataset.
iris = load_iris()
X_iris = iris.data

# Why: Scaling is essential before K-Means because it uses Euclidean distance —
# without scaling, features with larger ranges dominate the distance calculation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_iris)

# K-Means with K=3, fitted on the scaled features.
kmeans_iris = KMeans(n_clusters=3, random_state=42, n_init=10)
y_kmeans_iris = kmeans_iris.fit_predict(X_scaled)

# Compare the true labels (left) with the K-Means assignment (right).
# Both panels plot the first two unscaled feature columns; only the coloring
# differs between them.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    (iris.target, 'True Labels'),
    (y_kmeans_iris, 'K-Means Clustering'),
)
for ax, (point_colors, title) in zip(axes, panels):
    ax.scatter(
        X_iris[:, 0], X_iris[:, 1],
        c=point_colors, cmap='viridis', alpha=0.6, edgecolors='black',
    )
    ax.set_title(title)
    ax.set_xlabel('Sepal Length')
    ax.set_ylabel('Sepal Width')

plt.tight_layout()
plt.show()

# The silhouette score is computed in the scaled space the model was fitted on.
print(f"Silhouette Score: {silhouette_score(X_scaled, y_kmeans_iris):.4f}")

## 정리

### 알고리즘 비교

| 알고리즘 | 장점 | 단점 |
|---------|------|------|
| K-Means | 빠름, 간단 | K 지정 필요, 구형 클러스터 가정 |
| DBSCAN | K 불필요, 비구형 가능, 노이즈 처리 | eps, min_samples 설정에 민감, 밀도가 서로 다른 클러스터에 취약 |
| Hierarchical | 덴드로그램, 다양한 K | 대규모 데이터에 느림 |

### 평가 지표
- **Silhouette Score**: -1~1, 높을수록 좋음
- **Inertia**: 각 점과 소속 클러스터 중심 사이 제곱 거리의 합, 낮을수록 조밀함 (단, K가 커질수록 항상 감소하므로 Elbow 지점으로 판단)
- **Calinski-Harabasz**: 클러스터 간 분산 / 클러스터 내 분산 비율, 높을수록 좋음