# K Means Clustering


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as py
import random
from sklearn import metrics
from openpyxl import Workbook

In [113]:
def eudist(x1,x2):
    return (sum((x1-x2)**2))**0.5


In [110]:
def KMeans(K, data, n_iter = 50, max_steps = 100):
    best_cluster_centers = np.zeros((K, data.shape[1])) #Initializing best cluster centers to zeros
    min_dist = np.inf 
    
    for i in range(n_iter):
        init = np.random.choice(np.arange(data.shape[0]), K, replace = False) # random initialization of centroids
        centroids = data[init]        
        for j in range(max_steps):
            assigned_clusters = assign_clusters(data, centroids)
            
            #This condition starts the current iteration again if there are any empty clusters
            if(len(np.unique(assigned_clusters)) != K):
                break
            
            prev_cluster_centers = centroids #Storing value of previous cluster centers
            for k in range(K):
                centroids[k] = np.average(data[assigned_clusters == k], axis = 0)
            
            # Stop taking further steps if centroids stop changing
            if(np.sum(centroids == prev_cluster_centers) == np.size(centroids)):
                break 
        
        distance = dist(data, centroids)
        #incase we dont get clusters with no change in centroids within the given iterations
        #then we choose that centroid which gave minimum intracluster distance sum (SSE)
        if(distance < min_dist):        
            best_cluster_centers = centroids
            labels = assigned_clusters
            min_dist = distance
    
    return best_cluster_centers,labels


In [None]:

def assign_clusters(data, centroids):
    K = centroids.shape[0]
    distances = np.zeros((data.shape[0], K))
    
    for k in range(K):
        distances[:, k] = np.sqrt(np.sum((data - centroids[k])**2, axis = 1))
    
    assigned_clusters = np.argmin(distances, axis = 1)
    return assigned_clusters

In [None]:
def dist(data, centroids):
    assigned_clusters = assign_clusters(data, centroids)
    dist = 0
    for k in range(centroids.shape[0]):
        dist += np.sum((data[assigned_clusters == k] - centroids[k])**2)
    
    return dist

In [111]:
silhv= np.zeros((56,9))
dbind= np.zeros((56,9))
for i in range(1,57):
    print(i)
    fname='/Users/durbasatpathi/Desktop/datamining/data/'+str(i)+'.csv'
    data=np.genfromtxt(fname,delimiter=',');
    for j in range(2,11):
        kmeans_labels = KMeans(j,data[:,0:-1])[1]
        silhv[i-1,j-2]=metrics.silhouette_score(data[:,0:-1],kmeans_labels,metric='euclidean'); 
        dbind[i-1,j-2]=metrics.davies_bouldin_score(data[:,0:-1], kmeans_labels)
        print(j,silhv[i-1,j-2],dbind[i-1,j-2])

#df1=pd.DataFrame(silhv)
df2=pd.DataFrame(dbind)
#df1.to_excel('/Users/durbasatpathi/Desktop/2019A7PS0972H_silhv.xlsx',sheet_name='Kmeans')
#df2.to_excel('/Users/durbasatpathi/Desktop/2019A7PS0972H_db.xlsx',sheet_name='Kmeans')


1
2 0.8676572035835833 0.2730556616344951
3 0.48306404715456547 0.5639220428358491
4 0.49115333214976237 0.5888579196990413
5 0.36963014542796774 0.6516535775508785
6 0.45812157148789634 1.0362510748024845
7 0.2941285883083423 0.792318849144821
8 0.3810887212191395 1.0124972948132103
9 0.3339725763141151 0.7546722090064522
10 0.29308368495414433 0.7679780781198999
2
2 0.7538701210081645 0.7475159638498523
3 0.6343325084793264 0.7409951110930653
4 0.628343413668325 0.4620316341064067
5 0.36910558688730244 0.7372458832313915
6 0.3628580209226488 0.6983782679713363
7 0.3748376844608492 0.8620877850512534
8 0.32466230049461897 0.820377743496379
9 0.3355460558929228 0.7446046621888507
10 0.2118673630159442 1.0162849212622886
3
2 0.7950363937363268 0.8206188755832378
3 0.689392264743574 1.0361861920483235
4 0.5564712096669928 0.7727440995021414
5 0.5505823169777596 1.0934055380564325
6 0.188040245607655 1.1708303381128473
7 0.23809108066113013 1.0871130448955064
8 0.2527629539435883 1.344545

6 0.3452857488780335 0.7444450002322126
7 0.34523847219212156 0.9099619156490008
8 0.31068379533123397 0.8431750371123352
9 0.3601411336366831 1.016451106096303
10 0.23193708257682547 1.2430299745761346
24
2 0.6812832522628686 0.5843377758537074
3 0.5506938561473735 0.677454053343851
4 0.5203253437145662 0.6312479885604434
5 0.4963551842871162 0.7536673920152076
6 0.40534071478542155 0.652143512462609
7 0.39645409893079736 0.917213426265664
8 0.40259237546174037 0.7499379701283312
9 0.22902913695046112 0.8772932440651293
10 0.33014250666898864 0.8567837485672672
25
2 0.8703169208799701 0.99494640351877
3 0.11123457388077441 0.5413652496500126
4 0.4542621875607558 0.9005616015911815
5 0.3583021544068013 1.5453405670177365
6 0.2397338596306581 1.094280635412673
7 0.30468084408346535 0.839083499347406
8 0.11626400317662651 1.1093031219495249
9 0.29220916030236876 1.034182739153916
10 0.3113556350194833 0.9877410482720421
26
2 0.9008088392130524 0.2391750313691371
3 0.4549978250711774 0.70

3 0.7456814259839836 0.24791916125942706
4 0.5873709714310557 0.37290371477349155
5 0.5152681837847445 0.4636324362447007
6 0.5054509575873306 0.5088025001411539
7 0.48723023723717757 0.5862999032283892
8 0.3271227336153287 0.6378264175063348
9 0.3288360677184785 0.6230649581239036
10 0.23049673240473492 0.5815025281122299
47
2 0.9329016842365085 0.6899981746603016
3 0.6075873018387133 0.8249904398046399
4 0.17766202925214433 1.6657424804978544
5 0.5103708108439701 0.9826397544120138
6 0.2141180056993268 0.8976290022773581
7 0.09960531558761844 1.1077532810044426
8 0.02581351505963338 1.1684658925524902
9 0.2942249587198144 1.1448213320356304
10 0.10366763419778098 1.4559052765894087
48
2 0.9144621545466917 1.1759372102709666
3 0.6263513065555899 0.6276051027168771
4 0.1942525612354283 1.331629660320783
5 0.21077064241161225 1.4951064190844718
6 0.168390132127061 0.7354197399508293
7 0.327755146571871 0.8998188164304609
8 0.1428051002689102 0.9836178662820169
9 0.21736817746875292 1.22