# K-Means: Scratch vs Scikit-Learn

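In this notebook, we compare our custom NumPy implementation of K-Means with the industry-standard implementation from `scikit-learn`, looking at both the time each takes to fit and the clusters each finds on the same synthetic blob dataset.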

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as sklearnKMeans
from k_means_scratch import KMeans as scratchKMeans
from sklearn.datasets import make_blobs
import time

# Synthetic dataset: 500 points drawn around 5 blob centres, with a fixed seed
# so the same points are produced on every run. The second return value (the
# generating blob index of each point) is not used.
X, _ = make_blobs(
    n_samples=500,
    centers=5,
    cluster_std=0.7,
    random_state=42,
)

## Benchmarking Performance

In [None]:
# Time construction + fit of each implementation on the same data X.
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring short intervals, whereas the wall
# clock can be adjusted mid-measurement and may have coarse resolution.

# Scratch Implementation
# NOTE(review): how scratchKMeans initialises its centroids is not visible
# here; if it accepts a seed, pass one so this cell is reproducible on re-run.
start = time.perf_counter()
scratch_model = scratchKMeans(k=5)
scratch_model.fit(X)
scratch_time = time.perf_counter() - start

# Scikit-Learn Implementation
# n_init=10 makes scikit-learn run 10 initialisations and keep the best one,
# so this timing covers 10 runs, not one. Keep that in mind when reading the
# two numbers side by side.
# random_state pins those initialisations so labels and centres are identical
# on every Restart & Run All.
start = time.perf_counter()
sklearn_model = sklearnKMeans(n_clusters=5, n_init=10, random_state=42)
sklearn_model.fit(X)
sklearn_time = time.perf_counter() - start

print(f"Scratch Time: {scratch_time:.4f}s")
print(f"Sklearn Time: {sklearn_time:.4f}s")

## Visual Comparison

In [None]:
# Side-by-side view of the two clusterings: data points coloured by the
# cluster each model assigns them to, with the fitted centres overlaid as
# large red X markers.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# One row per panel: (axes, per-point cluster labels, cluster centres, title).
panels = [
    (ax1, scratch_model.predict(X), scratch_model.centroids, "Scratch KMeans"),
    (ax2, sklearn_model.labels_, sklearn_model.cluster_centers_, "Sklearn KMeans"),
]

for panel_ax, assignments, centres, heading in panels:
    panel_ax.scatter(X[:, 0], X[:, 1], c=assignments, cmap='viridis')
    panel_ax.scatter(centres[:, 0], centres[:, 1], c='red', marker='X', s=200)
    panel_ax.set_title(heading)

plt.show()