In [60]:
import pandas as pd
import numpy as np

from scipy import spatial
from sklearn.metrics import accuracy_score

data = pd.read_csv('./kmeans_data/data.csv')
labels = pd.read_csv('./kmeans_data/label.csv', header=None)
data['label'] = labels



In [61]:
data['assignment'] = np.nan
clean = data.copy()
data.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667,label,assignment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,


In [62]:
def generalized_jaccard(u, v):
    return 1 - np.sum(np.minimum(u,v))/np.sum(np.maximum(u,v))

In [63]:
class Kmeans:
    def __init__(self, K, data):
        one_of_each_label = data.groupby('label').sample(1) 
        self.clusters = one_of_each_label.sample(K) #K clusters, each with a different label
        self.clusters.sort_values(by='label', inplace=True) #put labels in order so we can use argmin to grab the label later

    def assign_datapoints(self,data):
        distances = spatial.distance.cdist(data.iloc[:,:-2], self.clusters.iloc[:,:-2], metric=self.similarity_metric)
        assignedCluster = distances.argmin(axis=1)
        data.assignment = assignedCluster

    def fit(self, data, max_iterations, metric):
        #'[:-2]' indexing for columns so we ignore the 'label' and 'assignment' columns when doing calculations
        self.similarity_metric = metric # 'euclidean' or 'cosine' or generalized_jaccard
        i=0
        sse = np.inf
        while(1):
            #Assign the datapoints to a cluster
            self.assign_datapoints(data)

            #Recompute the means of each cluster and update the cluster with the mean
            clusters_before_update = self.clusters.copy()
            for idx, centroid in self.clusters.iterrows():
                cluster_mean = data.loc[data.assignment == centroid.label].mean() 
                self.clusters.loc[idx,:-2] = cluster_mean[:-2]
                centroid[:-2] = cluster_mean[:-2]
            
            #Calculate SSE
            last_iter_sse = sse
            sse = 0
            for idx, centroid in self.clusters.iterrows():
                datapoints = data.loc[data.assignment == centroid.label].iloc[:,:-2].to_numpy()
                mean_of_cluster = centroid[:-2].to_numpy()
                mean = np.repeat(mean_of_cluster[None, :], datapoints.shape[0], axis=0) #repeat the mean to match the num of datapoints so we can do vectorized calc.
                sse += np.sum((mean - datapoints)**2)
    
            i+=1
            print("iteration: ", i, " SSE :", sse, end='\r')
            if self.clusters.equals(clusters_before_update): # or i>=max_iterations or sse>last_iter_sse:
                break


In [64]:
kmeans = Kmeans(10,data)

In [65]:
data = clean.copy()
kmeans.fit(data,500, 'euclidean')

  self.clusters.loc[idx,:-2] = cluster_mean[:-2]


iteration:  87  SSE : 25320033541.079747

In [None]:
data = clean.copy()
kmeans.fit(data,500, 'cosine')

In [None]:
data = clean.copy()
kmeans.fit(data,500, generalized_jaccard)

In [66]:
Y_pred = data.label.copy()
for idx, centroid in kmeans.clusters.iterrows():
    pointsInCluster = data[data.assignment == centroid.label].index #indices of clusters within selected centroid
    majorityVoteLabel = data.iloc[pointsInCluster].label.value_counts().idxmax() #the majority vote ground truth labels of those points
    # print(majorityVoteLabel, data.iloc[pointsInCluster].label.value_counts().max())
    Y_pred.iloc[pointsInCluster] = majorityVoteLabel


In [67]:
#compute predictive accuracy
acc = accuracy_score(data.label, Y_pred)
acc

0.16561656165616562