Math 5750/6880: Mathematics of Data Science \
Project 2

# 1. Clustering Gaussian Blobs using $k$-means

In [6]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Synthetic data set: 1000 points drawn from 5 Gaussian blobs in 10 dimensions.
# The fixed random_state makes the draw reproducible.
blob_params = dict(
    n_samples=1000,
    centers=5,
    n_features=10,
    cluster_std=1.5,
    random_state=1,
)
X, y_true = make_blobs(**blob_params)

# Standardize the features before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Sanity check: container type and shape of the features and of the labels.
for arr in (X, y_true):
    print(type(arr), arr.shape)

<class 'numpy.ndarray'> (1000, 10)
<class 'numpy.ndarray'> (1000,)


In [8]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt
import numpy as np

def hungarian_confusion(y_true, y_pred, n_classes=None):
    """Align cluster IDs with class labels and score the clustering.

    Cluster IDs coming out of an unsupervised method are arbitrary, so the
    predicted IDs are matched one-to-one to the true classes with the
    Hungarian algorithm (``scipy.optimize.linear_sum_assignment``), picking
    the assignment that maximizes the number of agreeing samples.

    Parameters
    ----------
    y_true : array-like of int
        Ground-truth class labels; only labels in ``range(n_classes)`` are
        counted in the confusion matrices.
    y_pred : array-like of int
        Predicted cluster IDs, one per entry of ``y_true``.
    n_classes : int, optional
        Number of classes/clusters to match. Defaults to the number of
        distinct values in ``y_true``.

    Returns
    -------
    C_map : ndarray of shape (n_classes, n_classes)
        Confusion matrix after remapping (rows: true class, columns:
        remapped prediction).
    acc : float
        Diagonal mass of ``C_map`` divided by its total count; ``nan`` if
        the matrix contains no counted samples.
    y_map : ndarray
        Copy of ``y_pred`` (same shape and dtype) in which every matched
        cluster ID is replaced by its assigned class label. IDs that took
        no part in the matching are left unchanged.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if n_classes is None:
        n_classes = len(np.unique(y_true))
    class_ids = np.arange(n_classes)

    C = confusion_matrix(y_true, y_pred, labels=class_ids)
    cost = C.max() - C                           # agreement counts -> cost matrix
    ri, cj = linear_sum_assignment(cost)         # ri: true class, cj: cluster ID

    # Remap with boolean masks instead of np.vectorize: no per-sample Python
    # callback, and no failure on size-0 input. Masks are always built from
    # the untouched y_pred so an already-remapped entry is never remapped again.
    y_map = y_pred.copy()
    for class_id, cluster_id in zip(ri, cj):
        y_map[y_pred == cluster_id] = class_id

    C_map = confusion_matrix(y_true, y_map, labels=class_ids)
    total = C_map.sum()
    # Explicit guard so an empty matrix yields nan without a divide warning.
    acc = np.trace(C_map) / total if total else float("nan")
    return C_map, acc, y_map

# k-means with k=5 and 20 initializations; keep the labels and the inertia
# of the fitted model.
kmeans = KMeans(n_clusters=5, n_init=20, random_state=42)
kmeans.fit(X)
labels, inertia_smallest = kmeans.labels_, kmeans.inertia_

# Project both the data and the fitted centroids onto the first two
# principal components for plotting.
pca = PCA(n_components=2, random_state=42)
pca.fit(X)
X2, centers2 = pca.transform(X), pca.transform(kmeans.cluster_centers_)

# Scatter of the projected points colored by cluster, centroids as black crosses.
fig, ax = plt.subplots()
ax.scatter(X2[:, 0], X2[:, 1], c=labels, s=8)
ax.scatter(centers2[:, 0], centers2[:, 1], c='k', s=80, marker='x')
ax.set_title('Blobs: PCA 2D with KMeans centers')
fig.tight_layout()
fig.savefig('blobs_pca_clusters.png', dpi=150)
plt.close(fig)

# Match cluster IDs to the true blob labels, then draw the remapped
# confusion matrix as an annotated heat map.
C_map, acc, labels_map = hungarian_confusion(y_true, labels, n_classes=5)

fig, ax = plt.subplots()
im = ax.imshow(C_map, cmap='Blues')
fig.colorbar(im, ax=ax)
ax.set_title('Blobs Confusion (Hungarian Matching)')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
# Write each count into its cell (x = column index, y = row index).
for (i, j), count in np.ndenumerate(C_map):
    ax.text(j, i, str(count), ha='center', va='center')
fig.tight_layout()
fig.savefig('blobs_confusion.png', dpi=150)
plt.close(fig)

# Elbow analysis: record the k-means inertia for k = 2, ..., 10.
Ks = list(range(2, 11))
inertias = []
for k in Ks:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X)
    inertias.append(km.inertia_)

# Inertia versus k, with a dashed marker at k = 5.
fig, ax = plt.subplots()
ax.plot(Ks, inertias, marker='o')
ax.axvline(5, ls='--', color='gray')
ax.set_xlabel('k')
ax.set_ylabel('Inertia')
ax.set_title('Elbow plot (Blobs)')
fig.tight_layout()
fig.savefig('blobs_elbow.png', dpi=150)
plt.close(fig)

# Headline numbers for the write-up.
print('===== BLOBS RESULTS (copy these into LaTeX) =====')
print(f'Min k-means inertia (k=5, n_init=20): {inertia_smallest:.2f}')
print(f'Hungarian-matched accuracy: {acc:.4f}')

===== BLOBS RESULTS (copy these into LaTeX) =====
Min k-means inertia (k=5, n_init=20): 924.32
Hungarian-matched accuracy: 1.0000


# 2. Clustering Fashion-MNIST using $k$-means

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

# Fetch Fashion-MNIST (version 1) from OpenML as plain NumPy arrays.
# Integer class codes 0-9 stand for:
# T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, Ankle boot
X, y = fetch_openml(
    "Fashion-MNIST",
    version=1,
    return_X_y=True,
    as_frame=False,
    parser="auto",
)
y = y.astype(int)  # convert the labels to integers

# Sanity check: container type and shape of the images and of the labels.
for arr in (X, y):
    print(type(arr), arr.shape)

<class 'numpy.ndarray'> (70000, 784)
<class 'numpy.ndarray'> (70000,)


In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment

# Fetch Fashion-MNIST again so this cell is self-contained, convert the
# labels to integers and standardize the pixel features.
X, y = fetch_openml(
    "Fashion-MNIST",
    version=1,
    return_X_y=True,
    as_frame=False,
    parser="auto",
)
y = y.astype(int)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# k-means with one cluster per clothing class (k=10, 20 initializations).
kmeans = KMeans(n_clusters=10, n_init=20, random_state=42)
kmeans.fit(X)
labels, inertia = kmeans.labels_, kmeans.inertia_

# Hungarian matching: negate the confusion matrix so that the minimum-cost
# assignment is the one with the largest number of agreeing samples.
C = confusion_matrix(y, labels)
r, c = linear_sum_assignment(-C)
n_matched = C[r, c].sum()
acc = n_matched / C.sum()

# PCA for visualization only: project the scaled data and the k-means
# centroids onto the first two principal components.
from sklearn.decomposition import PCA
# random_state is pinned, as for every other PCA in this notebook, so the
# figure is reproducible even if a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
X2 = pca.fit_transform(X)
centers2 = pca.transform(kmeans.cluster_centers_)

# Projected points colored by cluster, centroids drawn as black crosses.
plt.figure()
plt.scatter(X2[:, 0], X2[:, 1], c=labels, s=3)
plt.scatter(centers2[:, 0], centers2[:, 1], c='k', s=80, marker='x')
plt.title('Fashion-MNIST: PCA 2D with KMeans centers')
plt.tight_layout()
plt.savefig('fashion_pca_clusters.png', dpi=150)
plt.close()

# Headline numbers for the write-up.
print("===== FASHION-MNIST RESULTS (copy into LaTeX) =====")
print(f"Min k-means inertia (k=10, n_init=20): {inertia:.2f}")
print(f"Hungarian-matched accuracy: {acc:.4f}")

===== FASHION-MNIST RESULTS (copy into LaTeX) =====
Min k-means inertia (k=10, n_init=20): 30508820.46
Hungarian-matched accuracy: 0.4844


# 3. Dimensionality reduction for Fashion-MNIST

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment

# Reduce the features to their first 50 principal components.
pca50 = PCA(n_components=50, random_state=42)
X50 = pca50.fit_transform(X)

# Same k-means configuration as on the full-dimensional data, now on X50.
kmeans_pca = KMeans(n_clusters=10, n_init=20, random_state=42)
kmeans_pca.fit(X50)
labels_pca, inertia_pca = kmeans_pca.labels_, kmeans_pca.inertia_

# Hungarian matching on the negated confusion matrix (maximizes agreement).
C = confusion_matrix(y, labels_pca)
r, c = linear_sum_assignment(-C)
n_matched = C[r, c].sum()
acc_pca = n_matched / C.sum()

# Second PCA (50 -> 2 components), used only to draw the clusters and the
# centroids found in the 50-dimensional space.
pca2 = PCA(n_components=2, random_state=42)
X2 = pca2.fit_transform(X50)
centers2 = pca2.transform(kmeans_pca.cluster_centers_)

fig, ax = plt.subplots()
ax.scatter(X2[:, 0], X2[:, 1], c=labels_pca, s=3)
ax.scatter(centers2[:, 0], centers2[:, 1], c='k', s=80, marker='x')
ax.set_title('Fashion-MNIST (PCA-50 -> PCA-2) with KMeans centers')
fig.tight_layout()
fig.savefig('fashion_pca50_clusters.png', dpi=150)
plt.close(fig)

# Headline numbers for the write-up.
print("===== DIM-RED RESULTS (copy into LaTeX) =====")
print(f"Min k-means inertia (k=10, PCA-50, n_init=20): {inertia_pca:.2f}")
print(f"Hungarian-matched accuracy (PCA-50): {acc_pca:.4f}")

===== DIM-RED RESULTS (copy into LaTeX) =====
Min k-means inertia (k=10, PCA-50, n_init=20): 19610249.81
Hungarian-matched accuracy (PCA-50): 0.4835


# 4. Clustering Fashion-MNIST using spectral clustering

In [11]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment

# Draw a reproducible random subset of 10,000 samples (without replacement);
# the author picked 10k as a balance for spectral clustering.
rng = np.random.RandomState(42)
idx = rng.choice(len(X), 10000, replace=False)
X_sub = X[idx]
y_sub = y[idx]

# 50 principal components, fitted on the subset only.
pca50 = PCA(n_components=50, random_state=42)
X50 = pca50.fit_transform(X_sub)

# Spectral clustering on a 10-nearest-neighbor affinity graph (chosen by the
# author as fast & robust for high-dim data); labels assigned by k-means.
spec_params = dict(
    n_clusters=10,
    affinity='nearest_neighbors',
    n_neighbors=10,
    assign_labels='kmeans',
    random_state=42,
)
spec = SpectralClustering(**spec_params)
labels_spec = spec.fit_predict(X50)

# Hungarian matching on the negated confusion matrix (maximizes agreement).
C = confusion_matrix(y_sub, labels_spec)
r, c = linear_sum_assignment(-C)
n_matched = C[r, c].sum()
acc_spec = n_matched / C.sum()

# Second PCA (50 -> 2 components), used only to draw the spectral clusters.
pca2 = PCA(n_components=2, random_state=42)
X2 = pca2.fit_transform(X50)

fig, ax = plt.subplots()
ax.scatter(X2[:, 0], X2[:, 1], c=labels_spec, s=3)
ax.set_title('Fashion-MNIST (Spectral, 10k, PCA-50 -> PCA-2)')
fig.tight_layout()
fig.savefig('fashion_spec_clusters.png', dpi=150)
plt.close(fig)

# Headline numbers for the write-up.
print("===== SPECTRAL RESULTS (copy into LaTeX) =====")
print(f"Subset size: {len(X_sub)}  |  affinity=nearest_neighbors, n_neighbors=10")
print(f"Hungarian-matched accuracy (Spectral, PCA-50, 10k): {acc_spec:.4f}")

===== SPECTRAL RESULTS (copy into LaTeX) =====
Subset size: 10000  |  affinity=nearest_neighbors, n_neighbors=10
Hungarian-matched accuracy (Spectral, PCA-50, 10k): 0.5791
