### Task 1

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
class KMeansClustering:
    
    def __init__(self, k) -> None:
        self.k = k
        self.centroids = None
        self._sse_score = None

    def euclidean_distance(self, data_point, centroids):
        return np.sqrt(np.sum((centroids - data_point)**2, axis=1))

    def __sum_of_squared_errors_calc(self, centroids, data, y):
        sum_of_errors = 0.0
        for idx, d in enumerate(data):
            sum_of_errors += np.sum((centroids[y[idx]] - d) ** 2)

        return sum_of_errors

    def get_sum_of_squared_error(self):
        return self._sse_score
    
    def fit(self, X, max_iterations=200):
        self.centroids = np.random.uniform(
            low=np.amin(X, axis=0),
            high=np.amax(X, axis=0),
            size=(self.k, X.shape[1]))

        y = []
        for _ in range(max_iterations):
            y = []
            for data_point in X:

                distances = self.euclidean_distance(
                    data_point=data_point,
                    centroids=self.centroids)
                # print(distances.shape)
                cluster_num = np.argmin(distances)
                y.append(cluster_num)
            y = np.asarray(y)

            cluster_indices = []

            for idx in range(self.k):
                cluster_indices.append(np.argwhere(y == idx))

            cluster_centers = []

            for i, indices in enumerate(cluster_indices):
                if len(indices) == 0:
                    cluster_centers.append(self.centroids[i])
                else:
                    cluster_centers.append(np.mean(X[indices], axis=0)[0])

            if np.max(self.centroids - np.array(cluster_centers)) < 1e-3:
                break
            else:
                self.centroids = np.array(cluster_centers)

        # Calculate the final SSE after performing K-means
        self._sse_score = self.__sum_of_squared_errors_calc(X, self.centroids, y)
        
        return y

In [4]:
data = np.array(pd.read_csv('datasets/kmeans_data/data.csv', header=None))
labels = np.ravel(pd.read_csv('datasets/kmeans_data/label.csv', header=None))
print('Data: ', data.shape)
print('Labels: ', labels.shape)

Data:  (10000, 784)
Labels:  (10000,)


In [5]:
unique_labels = np.unique(labels)
no_of_clusters = unique_labels.size

In [6]:
euclidean_kmeans_m = KMeansClustering(k=no_of_clusters)
euclidean_kmeans_m_labels = euclidean_kmeans_m.fit(X=data)

In [7]:
cosine_distances = pairwise_distances(data, metric='cosine')
cosine_kmeans_m = KMeansClustering(k=no_of_clusters)
cosine_kmeans_m_labels = cosine_kmeans_m.fit(cosine_distances)

In [8]:
jaccard_distances = pairwise_distances(data, metric='hamming')
jaccard_kmeans_m = KMeansClustering(k=no_of_clusters)
jaccard_kmeans_m_labels = jaccard_kmeans_m.fit(X=jaccard_distances)

#### Q1. Run K-means clustering with Euclidean, Cosine and Jarcard similarity. Specify K= the number of categorical values of y (the number of classifications). Compare the SSEs of Euclidean-K-means, Cosine-K-means, Jarcard-K-means. Which method is better?

From the SSE values, the one with the lowest SSE is Jaccard-K-means. So using Jaccard K-Means is betterwith the lowest SSE of 1059.83.

In [9]:
sse_euclidean_m = euclidean_kmeans_m.get_sum_of_squared_error()
sse_euclidean_m

36425431.45791967

In [10]:
sse_cosine_m = cosine_kmeans_m.get_sum_of_squared_error()
sse_cosine_m

4943.109414510786

In [11]:
see_jaccard_m = jaccard_kmeans_m.get_sum_of_squared_error()
see_jaccard_m

2008.0470855781577

#### Q2. Compare the accuracies of Euclidean-K-means Cosine-K-means, Jarcard-K-means. First, label each cluster using the majority vote label of the data points in that cluster. Later, compute the predictive accuracy of Euclidean-K-means, Cosine-K-means, Jarcard-K-means. Which metric is better? (10 points)

Based on the accuracy computation of majority vote, Euclidean accuracy seems to perform better.

In [12]:
def label_clusters(labels, true_labels):
    unique_labels = np.unique(true_labels)
    cluster_labels = np.zeros(len(labels), dtype=np.int)
    for cluster in range(no_of_clusters):
        cluster_indices = np.where(labels == cluster)[0]
        cluster_true_labels = true_labels[cluster_indices]
        majority_label = np.argmax([np.sum(cluster_true_labels == label) for label in unique_labels])
        cluster_labels[cluster_indices] = majority_label
    return cluster_labels

# Label clusters using majority vote
cluster_labels_euclidean = label_clusters(euclidean_kmeans_m_labels, labels)
cluster_labels_cosine = label_clusters(cosine_kmeans_m_labels, labels)
cluster_labels_jaccard = label_clusters(jaccard_kmeans_m_labels, labels)

# Compute predictive accuracy
accuracy_euclidean = accuracy_score(labels, cluster_labels_euclidean)
accuracy_cosine = accuracy_score(labels, cluster_labels_cosine)
accuracy_jaccard = accuracy_score(labels, cluster_labels_jaccard)

print("Accuracy Euclidean-K-means:", accuracy_euclidean)
print("Accuracy Cosine-K-means:", accuracy_cosine)
print("Accuracy Jaccard-K-means:", accuracy_jaccard)

Accuracy Euclidean-K-means: 0.6046
Accuracy Cosine-K-means: 0.4354
Accuracy Jaccard-K-means: 0.2017


#### Q3: Set up the same stop criteria: “when there is no change in centroid position OR when the SSE value increases in the next iteration OR when the maximum preset value (e.g., 500, you can set the preset value by yourself) of iteration is complete”, for Euclidean-K-means, Cosine-Kmeans, Jarcard-K-means. Which method requires more iterations and times to converge? (10 points)