In [148]:
import pandas as pd
import numpy as np

from scipy import spatial
from sklearn.metrics import accuracy_score
from sklearn.neighbors import BallTree

In [149]:
# Read the feature matrix (as floats) and the ground-truth labels (as ints)
# from their comma-separated files.
data = np.genfromtxt('./kmeans_data/data.csv', dtype=float, delimiter=",")
labels = np.genfromtxt('./kmeans_data/label.csv', dtype=int, delimiter=",")

In [150]:
# Show the dimensions of the loaded feature array.
data.shape

(10000, 784)

In [151]:
# Pick 10 distinct rows of `data` as the initial centroids.
# replace=False matters: np.random.choice samples WITH replacement by default,
# so the same row could be drawn twice, giving two identical centroids and
# therefore an empty cluster during the first assignment step.
indices_for_clusters = np.random.choice(data.shape[0], 10, replace=False)
# Integer-array indexing returns a copy, so updating `clusters` later does not
# modify `data`.
clusters = data[indices_for_clusters]
clusters.shape

(10, 784)

In [152]:
# One integer identifier per cluster: 0 through 9.
cluster_labels = np.arange(10)
cluster_labels

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [153]:
# Placeholder assignment vector: one entry per datapoint, filled with +inf so
# that no point matches any real cluster label yet.
assignments = np.full(data.shape[0], np.inf)
assignments.shape

(10000,)

In [154]:
def generalized_jaccard(u, v):
    """Return the generalized Jaccard distance between two vectors.

    Computed as ``1 - sum(min(u, v)) / sum(max(u, v))`` with element-wise
    min/max. The measure is intended for non-negative vectors.

    Parameters
    ----------
    u, v : array_like
        Vectors of equal (or broadcastable) shape.

    Returns
    -------
    float
        0.0 for identical vectors, growing towards 1.0 as the overlap shrinks.
        When the sum of element-wise maxima is zero (e.g. both vectors are
        all zeros) the ratio is undefined; 0.0 is returned because the
        vectors are then identical, instead of propagating NaN from 0/0.
    """
    denominator = np.sum(np.maximum(u, v))
    if denominator == 0:
        # Avoid 0/0 -> NaN (and a RuntimeWarning) for all-zero inputs.
        return 0.0
    return 1 - np.sum(np.minimum(u, v)) / denominator

In [155]:
# Scratch check: offsets of the points currently assigned to label 1 from the
# centroid stored at index 1, and the shape of that difference array.
cluster_members = np.nonzero(assignments == 1)[0]
a = data[cluster_members] - clusters[1]
a.shape

(0, 784)

In [156]:
# k-means main loop: alternate between assigning every point to its nearest
# centroid and moving each centroid to the mean of its members. Stops when the
# centroids no longer change or after `max_iterations` passes.
max_iterations = 100
i = 0
sse = np.inf
while True:
    # Assign the datapoints to a cluster.
    # The built-in 'euclidean' metric is evaluated in compiled code; passing a
    # Python callable (e.g. spatial.distance.euclidean) gives the same
    # distances but invokes Python for every pairwise distance.
    # distances = spatial.KDTree(clusters).query(data)
    distances = BallTree(clusters, metric='euclidean').query(data)
    # distances = BallTree(clusters, metric=generalized_jaccard).query(data)
    # distances = BallTree(clusters, metric=spatial.distance.cosine).query(data)
    nearest_cluster_idx = distances[1]
    # query() returns one column per requested neighbour, so with the default
    # k=1 the index array (and hence `assignments`) has shape (n_samples, 1).
    assignments = cluster_labels[nearest_cluster_idx]

    # Recompute the mean of each cluster, update the centroid with it, and
    # accumulate the sum of squared errors against the updated centroid.
    clusters_before_update = clusters.copy()
    last_iter_sse = sse
    sse = 0
    for idx, centroid in enumerate(cluster_labels):
        # [0] keeps the row indices of the matching entries.
        cluster_members = np.where(assignments == centroid)[0]
        if cluster_members.size == 0:
            # An empty cluster has no mean (np.mean would return NaN and
            # poison the centroid); keep its previous position instead.
            continue
        cluster_mean = np.mean(data[cluster_members], axis=0)
        clusters[idx] = cluster_mean
        sse += np.sum((data[cluster_members] - clusters[idx])**2)

    i += 1
    print("iteration: ", i, " SSE :", sse, end='\r')
    # Converged once an update leaves every centroid exactly where it was;
    # the iteration cap guards against a run that never settles.
    if i >= max_iterations or (clusters == clusters_before_update).all():
        break

iteration:  100  SSE : 25670597881.83919

In [157]:
# Inspect the shape of the assignments built by the clustering loop; it
# mirrors the index array returned by the nearest-centroid query.
assignments.shape

(10000, 1)

In [158]:
# Spot check: the most frequent ground-truth label among the points that were
# assigned to cluster 2.
cluster_members = np.nonzero(assignments == 2)[0]
counts = np.bincount(labels[cluster_members])
counts.argmax()

3

In [159]:
# Turn cluster assignments into label predictions: every point receives the
# most common ground-truth label among the members of its cluster.
predictions = assignments.copy()
# Iterate over the cluster labels themselves (the values stored in
# `assignments`) rather than their positions, and avoid reusing `i`, which
# holds the iteration count of the clustering loop.
for cluster_label in cluster_labels:
    cluster_members = np.where(assignments == cluster_label)[0]
    if cluster_members.size == 0:
        # np.bincount of an empty selection is empty and np.argmax would raise
        # ValueError; an empty cluster has no points to relabel anyway.
        continue
    counts = np.bincount(labels[cluster_members])
    predictions[cluster_members] = np.argmax(counts)

In [160]:
# predictions started as a copy of assignments, so it should have the same shape.
predictions.shape

(10000, 1)

In [161]:
# Fraction of points whose majority-vote prediction equals the true label.
acc = accuracy_score(y_true=labels, y_pred=predictions)
acc

0.5777