In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import accuracy_score

In [9]:
data = np.array(pd.read_csv('datasets/kmeans_data/data.csv', header=None))
labels = np.ravel(pd.read_csv('datasets/kmeans_data/label.csv', header=None))
print('Data: ', data.shape)
print('Labels: ', labels.shape)

Data:  (10000, 784)
Labels:  (10000,)


In [10]:
unique_labels = np.unique(labels)
no_of_clusters = unique_labels.size

In [11]:
euclidean_kmeans = KMeans(n_clusters=no_of_clusters, random_state=0)
labels_euclidean = euclidean_kmeans.fit(data)
sse_euclidean = euclidean_kmeans.inertia_
sse_euclidean

25319139512.32306

In [12]:
cosine_distances = pairwise_distances(data, metric='cosine')
cosine_kmeans = KMeans(n_clusters=no_of_clusters, random_state=0)
labels_cosine = cosine_kmeans.fit(cosine_distances)
sse_cosine = cosine_kmeans.inertia_

sse_cosine

672164.9402909297

In [13]:
jaccard_distances = pairwise_distances(data, metric='hamming')
jaccard_kmeans = KMeans(n_clusters=no_of_clusters, random_state=0)
labels_jaccard = jaccard_kmeans.fit(jaccard_distances)
sse_jaccard = jaccard_kmeans.inertia_

sse_jaccard

33029.96230483301

In [15]:
def label_clusters(labels, true_labels):
    unique_labels = np.unique(true_labels)
    cluster_labels = np.zeros(len(labels), dtype=np.int)
    for cluster in range(no_of_clusters):
        cluster_indices = np.where(labels == cluster)[0]
        cluster_true_labels = true_labels[cluster_indices]
        majority_label = np.argmax([np.sum(cluster_true_labels == label) for label in unique_labels])
        cluster_labels[cluster_indices] = majority_label
    return cluster_labels

# Label clusters using majority vote
cluster_labels_euclidean = label_clusters(labels_euclidean.labels_, labels)
cluster_labels_cosine = label_clusters(labels_cosine.labels_, labels)
cluster_labels_jaccard = label_clusters(labels_jaccard.labels_, labels)

# Compute predictive accuracy
accuracy_euclidean = accuracy_score(labels, cluster_labels_euclidean)
accuracy_cosine = accuracy_score(labels, cluster_labels_cosine)
accuracy_jaccard = accuracy_score(labels, cluster_labels_jaccard)

print("Accuracy Euclidean-K-means:", accuracy_euclidean)
print("Accuracy Cosine-K-means:", accuracy_cosine)
print("Accuracy Jaccard-K-means:", accuracy_jaccard)

Accuracy Euclidean-K-means: 0.5922
Accuracy Cosine-K-means: 0.5147
Accuracy Jaccard-K-means: 0.3399


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  cluster_labels = np.zeros(len(labels), dtype=np.int)
