In [1]:
import numpy as np
import pandas as pd
import scipy 
import sklearn
from collections import Counter as Count
from sklearn.metrics import multilabel_confusion_matrix as MCM
from scipy import spatial
from sklearn.model_selection import train_test_split as tt_split

In [2]:
data_load = pd.read_csv(r'/Users/saikumargv/Downloads/kmeans_data/data.csv')
label = pd.read_csv(r'/Users/saikumargv/Downloads/kmeans_data/label.csv',names=['label'],header=None)

In [3]:
data_load.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data_load.count()

0        9999
0.1      9999
0.2      9999
0.3      9999
0.4      9999
         ... 
0.663    9999
0.664    9999
0.665    9999
0.666    9999
0.667    9999
Length: 784, dtype: int64

### Splitting Data

In [5]:
train_data, test_data = tt_split( data_load, test_size=0.08, random_state=42)
train_labels, test_labels = tt_split( label, test_size=0.08, random_state=42)

### KMeans

In [6]:
class KMeans:   
 
    def jaccardi(self,centroid, p):
        intersection = len(list(set(centroid).intersection(p)))
        union = (len(set(centroid)) + len(set(p))) - intersection
        return float(intersection) / union
        
    def Initialize_Centroids(self,data,K):
        row = data.shape[0]
        centroid_value={}
        for x in range(K):
            rand = np.random.randint(0, row-1)
            centroid_value[x] = data.iloc[rand]
        return centroid_value

    def Kmeans_train(self,data,K,max_iter=20,mode=1,tol=10):
    
        centroid_value_dict = self.Initialize_Centroids(data,K)
        new_centroid_value_dict = {}
        count = 0
        centroid_dict = {}
        convergence = False
        while((count<max_iter) and not convergence):
            
            for i in list(centroid_value_dict.keys()):
                centroid_dict[i]=[]
            for i in range(data.shape[0]):
                x = data.iloc[i]
                if mode==1 :
                    distance_measure = [np.linalg.norm(x-centroid_value_dict[k])  for k in centroid_value_dict]
                    idx = np.argmin(distance_measure)
                    centroid_dict[idx].append(i)
                elif mode==2 :
                    distance_measure = [self.jaccardi(list(x),centroid_value_dict[k]) for k in centroid_value_dict]
                    idx = np.argmax(distance_measure)
                    centroid_dict[idx].append(i)
                elif mode==3 :
                    distance_measure = [1-scipy.spatial.distance.cosine(x,list(centroid_value_dict[y]))  for y in centroid_value_dict]
                    idx = np.argmax(distance_measure)
                    centroid_dict[idx].append(i)
                
                prev_centroids=dict(centroid_value_dict)
                
            
            for i in centroid_dict:
                if len(centroid_dict[i]):
                    dps_centroid = centroid_dict[i]
                    centroid_value_dict[i] = np.average(data.iloc[dps_centroid],axis=0)
            
            
            current_tol=-1
            for i in centroid_value_dict:
                prev_centroid_point = prev_centroids[i]
                new_centroid_point = centroid_value_dict[i]
                change = np.sum(np.absolute(new_centroid_point-prev_centroid_point))
                current_tol = max(change, current_tol)
                
            print("Tolerance for the Iteration ",count,": ",current_tol)
            
            count+=1
            if (current_tol<10):
                convergence = True
                break
                
        return centroid_value_dict,centroid_dict
    

In [7]:
def calc_SSE(centroid_value_dict, centroid_dict,data):
    sse_data = 0
    for i in centroid_dict:
        sse_cluster = 0
        for j in centroid_dict[i]:
            dp = list(data.iloc[int(j)])
            for a,b in zip(centroid_value_dict[i],dp):
                sse_cluster += (a-b)**2
        sse_data+=sse_cluster
    return sse_data 

In [8]:
def predict_labels(C, S, labels):
    cluster_labels = np.zeros(10,dtype=int)
    for c in C:
        labels_of_points = []
        for point in S[c]:
            labels_of_points.extend(labels.iloc[point])
        counter = Count(labels_of_points)
        try:
            cluster_labels[c] = max(counter, key=counter.get)
        except:
            cluster_labels[c] = np.random.randint(0,9)
    return cluster_labels


In [9]:
def Acc(centroids, centroid_Labels, test_data, true_labels, mode=1):
    y_true = list(true_labels['label']);
    y_pred = []
    for index in range(test_data.shape[0]):
        featureset = test_data.iloc[index]
        if mode==1:
            distances = [np.linalg.norm(featureset - centroids[centroid]) for centroid in centroids]
            classification = distances.index(min(distances))
            y_pred.append(centroid_Labels[classification])
        elif mode==2:
            similarity = [jaccard_similarity(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification]) 
        elif mode==3:
            similarity = [1 - spatial.distance.cosine(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification])
    denominator = test_data.shape[0]
    correctly_classified = 0
    for i in range(0,len(y_pred)):
        if y_true[i] == y_pred[i]:
            correctly_classified += 1
    accuracy = correctly_classified/denominator
    return accuracy

In [26]:
Euclidean = KMeans()
Euclidean_centroids,clusters_Euclidean = Euclidean.Kmeans_train(data_load,10, max_iter=100,mode=1)

Tolerance for the Iteration  0 :  26932.795252225522
Tolerance for the Iteration  1 :  5612.4518020882715
Tolerance for the Iteration  2 :  4457.640485182539
Tolerance for the Iteration  3 :  3299.1753438341702
Tolerance for the Iteration  4 :  2435.4944840182648
Tolerance for the Iteration  5 :  1850.7005801305295
Tolerance for the Iteration  6 :  1637.0839222590616
Tolerance for the Iteration  7 :  1402.2196486544804
Tolerance for the Iteration  8 :  1784.7379455308983
Tolerance for the Iteration  9 :  1996.5843640160592
Tolerance for the Iteration  10 :  2559.8516910447124
Tolerance for the Iteration  11 :  2625.5879590321474
Tolerance for the Iteration  12 :  2128.275598977294
Tolerance for the Iteration  13 :  1348.2382435978343
Tolerance for the Iteration  14 :  1135.5661986496204
Tolerance for the Iteration  15 :  1227.7049230570515
Tolerance for the Iteration  16 :  816.892949877156
Tolerance for the Iteration  17 :  446.02043159038203
Tolerance for the Iteration  18 :  430.183

In [28]:
Euclidean_SSE = calc_SSE(Euclidean_centroids,clusters_Euclidean,data_load)
Euclidean_SSE

25321919111.363243

In [29]:
print("Euclidean SSE:",Euclidean_SSE)

Euclidean SSE: 25321919111.363243


In [30]:
cluster_labels_Euclidean = predict_labels(Euclidean_centroids,clusters_Euclidean,label)
cluster_labels_Euclidean

array([6, 4, 2, 7, 3, 0, 0, 1, 9, 5])

In [31]:
Acc_Euclidean = Acc(Euclidean_centroids, cluster_labels_Euclidean,test_data,test_labels)
Acc_Euclidean

0.0875

In [32]:
Jaccardi = KMeans()
Jaccardi_centroids,Jaccardi_clusters = Jaccardi.Kmeans_train(data_load,10, max_iter=100,mode=2)
Jaccardi_SSE = calc_SSE(Jaccardi_centroids,Jaccardi_clusters,data_load)

Tolerance for the Iteration  0 :  33071.849529780564
Tolerance for the Iteration  1 :  4358.628108264696
Tolerance for the Iteration  2 :  2864.822556518783
Tolerance for the Iteration  3 :  1680.4838989965938
Tolerance for the Iteration  4 :  1178.5427676803824
Tolerance for the Iteration  5 :  1004.6728895538307
Tolerance for the Iteration  6 :  1213.0352466526274
Tolerance for the Iteration  7 :  1283.5921951570158
Tolerance for the Iteration  8 :  1189.7274063080465
Tolerance for the Iteration  9 :  823.6759801515099
Tolerance for the Iteration  10 :  1135.201576420246
Tolerance for the Iteration  11 :  1091.8575123149378
Tolerance for the Iteration  12 :  560.3007881321157
Tolerance for the Iteration  13 :  323.1918784018806
Tolerance for the Iteration  14 :  0.0


In [33]:
print("Jacard SSE:",Jaccardi_SSE)

Jacard SSE: 34361687572.938736


In [34]:
cluster_Jaccardi = predict_labels(Jaccardi_centroids,Jaccardi_clusters,label)
cluster_Jaccardi 

array([1, 2, 4, 5, 6, 2, 7, 0, 1, 3])

In [35]:
Acc_Jaccard = Acc(Jaccardi_centroids, cluster_Jaccardi ,test_data,test_labels)
Acc_Jaccard

0.1075

In [38]:
Cosine = KMeans()
Cosine_centroids,Cosine_clusters = Cosine.Kmeans_train(data_load,10, max_iter = 100,mode=3)

Tolerance for the Iteration  0 :  27423.69888820259
Tolerance for the Iteration  1 :  6767.788736118457
Tolerance for the Iteration  2 :  4425.965159390809
Tolerance for the Iteration  3 :  2888.3341517319577
Tolerance for the Iteration  4 :  2290.9843666839797
Tolerance for the Iteration  5 :  1567.1825214576118
Tolerance for the Iteration  6 :  989.6810224089636
Tolerance for the Iteration  7 :  758.6502539183873
Tolerance for the Iteration  8 :  511.0388798222637
Tolerance for the Iteration  9 :  575.9496680204052
Tolerance for the Iteration  10 :  532.5525870000333
Tolerance for the Iteration  11 :  480.833920710268
Tolerance for the Iteration  12 :  568.6960470950789
Tolerance for the Iteration  13 :  423.7719639112787
Tolerance for the Iteration  14 :  465.9633604381205
Tolerance for the Iteration  15 :  469.9062508382433
Tolerance for the Iteration  16 :  404.92197651338284
Tolerance for the Iteration  17 :  338.0262763773254
Tolerance for the Iteration  18 :  320.20111646273136

In [39]:
Cosine_SSE = calc_SSE(Cosine_centroids,Cosine_clusters,data_load)

In [40]:
cluster_labels_cosine = predict_labels(Cosine_centroids,Cosine_clusters,label)
cluster_labels_cosine

array([1, 3, 7, 0, 3, 5, 2, 9, 8, 0])

In [42]:
Acc_Cosine = Acc(Cosine_centroids, cluster_labels_cosine,test_data,test_labels)

In [43]:
print("Euclidean SSE:",Euclidean_SSE)
print("Jacard SSE:",Jaccardi_SSE)
print("Cosine SSE :",Cosine_SSE)

Euclidean SSE: 25321919111.363243
Jacard SSE: 34361687572.938736
Cosine SSE : 25416869026.076195


In [44]:
print("Euclidean accuracy:",Acc_Euclidean)
print("Jacard accuracy:",Acc_Jaccard)
print("Cosine accuracy :",Acc_Cosine)

Euclidean accuracy: 0.0875
Jacard accuracy: 0.1075
Cosine accuracy : 0.0775
