## Setup

First, let's import our libraries, load the Iris dataset, and standardize its features:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

# Default plot styling for the notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Load the Iris measurements and standardize every feature column; K-Means
# is distance-based, so features on larger scales would otherwise dominate.
iris = datasets.load_iris()
X = iris.data
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Quick sanity check of what was loaded.
for summary_line in (
    f"Dataset shape: {X_scaled.shape}",
    f"Features: {iris.feature_names}",
):
    print(summary_line)

## Method 1: Elbow Method

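The Elbow Method plots the Within-Cluster Sum of Squares (WCSS) against the number of clusters $k$ and looks for the "elbow": the point beyond which adding more clusters no longer reduces WCSS substantially.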

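**Formula:**
$$WCSS(k) = \sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \mu_i \rVert^2$$

where $C_i$ is the set of points assigned to cluster $i$ and $\mu_i$ is that cluster's centroid.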

In [None]:
# Candidate cluster counts to evaluate.
k_range = range(2, 11)

# Fit one K-Means model per k and record its inertia_, which is the
# within-cluster sum of squares reported by scikit-learn.
wcss = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertia = kmeans.inertia_
    wcss.append(inertia)
    print(f"k={k}: WCSS = {inertia:.2f}")

In [None]:
# Draw the elbow curve using the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, wcss, 'bo-', linewidth=2, markersize=10)
# The marker position is fixed at k=3 (chosen by eye), not computed from wcss.
ax.axvline(x=3, color='r', linestyle='--', label='Elbow at k=3')

ax.set_title('Elbow Method', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Clusters (k)', fontsize=12)
ax.set_ylabel('WCSS', fontsize=12)
ax.legend()
ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

**Interpretation:** The "elbow" appears around **k=3**.

## Method 2: Silhouette Analysis

**Formula:** 
$$s(i) = \frac{b(i) - a(i)}{\max\{a(i), b(i)\}}$$

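where $a(i)$ is the mean distance from sample $i$ to the other samples in its own cluster, and $b(i)$ is the mean distance from sample $i$ to the samples in the nearest neighbouring cluster.

Score range: **-1 to +1** (higher is better)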

In [None]:
# Mean silhouette coefficient for every candidate k, in k_range order.
silhouette_scores = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores += [score]
    print(f"k={k}: Silhouette = {score:.4f}")

# The scores are index-aligned with k_range, so the position of the highest
# score maps straight back to the corresponding k.
best_position = int(np.argmax(silhouette_scores))
optimal_k = k_range[best_position]
print(f"\nOptimal k = {optimal_k}")

In [None]:
# Draw the silhouette curve using the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, silhouette_scores, 'go-', linewidth=2, markersize=10)
# Mark the k with the highest mean silhouette score.
ax.axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal k={optimal_k}')

ax.set_title('Silhouette Analysis', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Clusters (k)', fontsize=12)
ax.set_ylabel('Silhouette Score', fontsize=12)
ax.legend()
ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

## Detailed Silhouette Plots

In [None]:
# Per-sample silhouette profiles, one panel per cluster count being compared.
candidate_ks = [2, 3]
fig, axes = plt.subplots(1, len(candidate_ks), figsize=(14, 6))
# plt.subplots returns a bare Axes (not an array) when only one panel is
# requested; normalise so the loop works for any number of candidates.
axes = np.atleast_1d(axes)

for ax, k in zip(axes, candidate_ks):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    silhouette_vals = silhouette_samples(X_scaled, labels)
    # The overall silhouette score is the mean of the per-sample coefficients,
    # so reuse them instead of recomputing every pairwise distance.
    avg_score = float(np.mean(silhouette_vals))

    # Stack one horizontal band per cluster, separated by a 10-row gap.
    y_lower = 10
    for i in range(k):
        # Boolean indexing yields a copy; sort it so each band is a smooth profile.
        cluster_vals = np.sort(silhouette_vals[labels == i])
        size = cluster_vals.shape[0]
        y_upper = y_lower + size

        color = plt.cm.viridis(float(i) / k)
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_vals,
                         facecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size, f'C{i}')
        y_lower = y_upper + 10

    # Text does not take part in autoscaling, so the cluster labels at
    # x = -0.05 could end up outside the axes. Fix the range explicitly: the
    # left edge always covers the labels and any negative coefficients, and
    # the right edge is the theoretical maximum of 1.
    ax.set_xlim(min(-0.1, float(silhouette_vals.min()) - 0.05), 1.0)

    ax.set_xlabel('Silhouette Coefficient')
    ax.set_title(f'k={k}, Score={avg_score:.3f}', fontweight='bold')
    # Dashed line marks the average score for this k.
    ax.axvline(x=avg_score, color='red', linestyle='--', linewidth=2)
    ax.set_yticks([])

plt.tight_layout()
plt.show()

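## Final Clustering (k=3)

We fit the final model with **k=3**, the value suggested by the elbow plot. This value is fixed in the code rather than taken from `optimal_k`, so it may differ from the silhouette-based choice above.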

In [None]:
# Fit the final 3-cluster model on the standardized features.
kmeans_final = KMeans(n_clusters=3, random_state=42, n_init=10)
labels_final = kmeans_final.fit_predict(X_scaled)

# Only the first two feature columns are drawn; the data points and the
# centroids are sliced the same way so they share the plotted plane.
X_2d = X_scaled[:, :2]
centroids = kmeans_final.cluster_centers_[:, :2]

fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels_final,
                     cmap='viridis', s=100, alpha=0.6, edgecolors='black')
ax.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='X',
           s=300, edgecolors='black', linewidths=2, label='Centroids')

ax.set_title('K-Means Clustering (k=3)', fontweight='bold')
ax.set_xlabel('Sepal Length (scaled)')
ax.set_ylabel('Sepal Width (scaled)')
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.legend()

fig.tight_layout()
plt.show()