Import data processing package

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics.cluster.supervised import contingency_matrix
from sklearn.metrics.cluster import adjusted_rand_score

Import data

In [2]:
def loadDataSet(fileName): 
    data = [] 
    with open(fileName) as f:
        for line in f.readlines():
            line = line.split('\t')
            data.append(line)
        data = np.array(data).astype(np.float32)
    return data

In [3]:
cho_data = loadDataSet('C:/Users/dingl/Desktop/Data Mining/Project1/cho.txt')
iyer_data = loadDataSet('C:/Users/dingl/Desktop/Data Mining/Project1/iyer.txt')

View data

In [4]:
print(cho_data)
print(" ")
print(iyer_data)

[[ 1.00e+00  1.00e+00 -6.90e-01 ... -4.00e-02  1.90e-01  8.20e-01]
 [ 2.00e+00  1.00e+00 -2.10e-01 ... -1.23e+00 -3.25e-01  0.00e+00]
 [ 3.00e+00  1.00e+00 -3.00e-01 ... -1.20e-01 -1.60e-01  6.70e-01]
 ...
 [ 3.84e+02  5.00e+00 -3.12e+00 ...  1.48e+00  2.06e+00  2.36e+00]
 [ 3.85e+02  5.00e+00 -7.90e-01 ...  4.90e-01  8.00e-02  1.50e-01]
 [ 3.86e+02  5.00e+00 -1.16e+00 ...  4.93e-01  1.27e+00  8.70e-01]]
 
[[ 1.00e+00 -1.00e+00  1.00e+00 ...  5.00e-01  6.60e-01  5.20e-01]
 [ 2.00e+00  1.00e+00  1.00e+00 ...  9.00e-01  7.30e-01  7.50e-01]
 [ 3.00e+00  1.00e+00  1.00e+00 ...  7.10e-01  5.70e-01  4.60e-01]
 ...
 [ 5.15e+02  1.00e+01  1.00e+00 ...  2.15e+00  2.05e+00  2.25e+00]
 [ 5.16e+02  1.00e+01  1.00e+00 ...  1.97e+00  2.21e+00  1.79e+00]
 [ 5.17e+02  1.00e+01  1.00e+00 ...  1.86e+00  1.57e+00  1.26e+00]]


View labels for data

In [5]:
cho_y = cho_data[:,1]
iyer_y = iyer_data[:,1]

In [6]:
cho_y = np.array(cho_y.reshape((-1,))).astype(np.int32)
iyer_y = np.array(iyer_y.reshape((-1,))).astype(np.int32)

In [7]:
print(cho_y)
print(" ")
print(iyer_y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
 
[-1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  

Remove invalid data from dataset

In [8]:
cho_data_x = np.delete(cho_data, [0,1], axis=1)
iyer_data_x = np.delete(iyer_data, [0,1,2], axis=1)

In [9]:
print(cho_data_x)
print(" ")
print(iyer_data_x)

[[-0.69  -0.96  -1.16  ... -0.04   0.19   0.82 ]
 [-0.21   0.19   0.86  ... -1.23  -0.325  0.   ]
 [-0.3   -0.56  -0.29  ... -0.12  -0.16   0.67 ]
 ...
 [-3.12  -4.12  -3.54  ...  1.48   2.06   2.36 ]
 [-0.79  -0.56  -0.79  ...  0.49   0.08   0.15 ]
 [-1.16  -1.39  -0.96  ...  0.493  1.27   0.87 ]]
 
[[0.72 0.1  0.57 ... 0.5  0.66 0.52]
 [1.58 1.05 1.15 ... 0.9  0.73 0.75]
 [1.1  0.97 1.   ... 0.71 0.57 0.46]
 ...
 [2.09 3.37 5.52 ... 2.15 2.05 2.25]
 [1.52 4.39 7.03 ... 1.97 2.21 1.79]
 [2.25 4.67 7.94 ... 1.86 1.57 1.26]]


In [10]:
print(cho_data_x.shape)
print(iyer_data_x.shape)

(386, 16)
(517, 11)


Define Euclidean distance function

In [11]:
def dist(A,B):
    return np.sqrt(np.sum(np.power((A-B),2)))

Define randomized center point function

In [12]:
def randCenter(dataset,k):
    m = np.shape(dataset)[1]
    center = np.mat(np.ones((k,m)))
    for i in range(m):
        centmin = min(dataset[:,i])
        centmax = max(dataset[:,i])
        center[:,i] = centmin + (centmax - centmin) * np.random.rand(k,1)
    return center

In [13]:
randCenter(cho_data_x, 5)

matrix([[ 0.17556383, -0.21197994, -2.54446146,  1.56340979,  0.50752404,
          0.9209441 ,  0.29054087,  1.48713705,  0.59190819, -0.97991369,
          0.975531  ,  0.58592445,  1.12883882,  0.63954113,  0.93246233,
         -0.28714325],
        [ 0.77078478, -2.01764499, -1.64059118,  0.89820146, -0.81801197,
          0.73740336,  2.05730965,  2.12607544,  1.46215358,  0.12863871,
         -0.97463498, -0.96355769, -0.13724649, -0.30461523,  1.60810628,
         -0.35192309],
        [-0.31441952, -1.12550418,  0.45879253,  1.46285584, -0.17044371,
         -2.01282723, -0.33705855, -0.74467151, -1.42781525,  1.2592541 ,
          1.19389503, -0.70385746,  0.78850251, -1.20230687,  1.06863452,
         -0.46413066],
        [-1.5581127 ,  1.06174448, -1.12752424,  1.41059039, -0.81788317,
         -0.92648822,  0.94596954, -1.37803095,  1.67327795, -0.16217815,
         -0.18852038,  1.00051445,  0.62063052,  0.24052569,  0.22497743,
         -0.39274645],
        [-1.19633761

Define K-means function

In [14]:
def kMeans(dataset,k):
    m = np.shape(dataset)[0]
    clusterAssment = np.mat(np.zeros((m,1)))
    centroids = randCenter(dataset,k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = dist(dataset[i],centroids[j])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i] != minIndex:
                clusterChanged = True
            clusterAssment[i] = minIndex
        for center in range(k):
            # print(np.nonzero(clusterAssment.A == center))
            ptsIndex = np.nonzero(clusterAssment.A == center)[0]
            ptsInClust = dataset[ptsIndex]
            centroids[center,:] = np.mean(ptsInClust,axis = 0)
    
    # Calculate the internal index SSE
    SSE = 0
    for center in range(k):
        ptsIndex = np.nonzero(clusterAssment.A == center)[0]
        ptsInClust = dataset[ptsIndex]
        for pt in ptsInClust:
            dist2 = dist(pt, centroids[center]) ** 2
            SSE = SSE + dist2
    # print(SSE)
    return clusterAssment.astype(np.int32).reshape((-1,)), SSE

View the minimum and maximum values of each column of data

In [15]:
cho_data_x.min(axis=0)

array([-3.12, -4.12, -3.54, -1.88, -1.89, -2.56, -1.82, -1.86, -1.81,
       -2.19, -2.11, -1.29, -1.5 , -1.63, -1.68, -1.52], dtype=float32)

In [16]:
cho_data_x.max(axis=0)

array([1.95, 1.43, 2.04, 1.61, 1.58, 1.48, 2.45, 2.77, 1.81, 2.16, 1.75,
       1.34, 1.14, 1.48, 2.06, 2.36], dtype=float32)

Define data normalization function

In [17]:
def normalization(data):
    minVals = data.min(axis=0)
    maxVals = data.max(axis=0)
    ranges = maxVals - minVals
    normData = np.zeros(np.shape(data))
    m = data.shape[0]
    normData = data - np.tile(minVals, (m, 1))
    normData = normData/np.tile(ranges, (m, 1))
    return normData

In [18]:
cho_data_norm = normalization(cho_data_x)
iyer_data_norm = normalization(iyer_data_x)

View standardized data

In [19]:
print(cho_data_norm)
print(" ")
print(iyer_data_norm)

[[0.47928995 0.5693694  0.42652333 ... 0.511254   0.5        0.6030928 ]
 [0.5739645  0.7765766  0.78853047 ... 0.12861735 0.3622995  0.39175257]
 [0.556213   0.64144146 0.5824373  ... 0.48553053 0.40641713 0.56443304]
 ...
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.4595661  0.64144146 0.49283156 ... 0.681672   0.47058827 0.43041238]
 [0.38658777 0.49189192 0.4623656  ... 0.6826366  0.7887701  0.6159794 ]]
 
[[0.13486843 0.         0.02021019 ... 0.02259136 0.03891337 0.03669724]
 [0.41776314 0.1181592  0.06709781 ... 0.04916944 0.04405287 0.05779816]
 [0.25986844 0.10820895 0.05497171 ... 0.03654485 0.03230543 0.03119266]
 ...
 [0.5855263  0.4067164  0.42037186 ... 0.13222592 0.14096916 0.19541284]
 [0.39802635 0.5335821  0.5424414  ... 0.12026578 0.15271659 0.153211  ]
 [0.6381579  0.568408   0.61600643 ... 0.11295681 0.10572688 0.10458715]]


Perform 100 K-means clustering for Cho dataset, and select the one with the lowest SSE as the final result

In [20]:
cho_bestkmCluster = []
cho_bestkmSSE = np.inf
for i in range(100):
    cho_kmCluster, cho_kmSSE = kMeans(cho_data_norm, 5)
    print(cho_kmSSE)
    if cho_kmSSE < cho_bestkmSSE:
        cho_bestkmSSE = cho_kmSSE
        cho_bestkmCluster = cho_kmCluster

61.12483157221722
64.66564259344072
61.05818900159613


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


65.5693620937913
65.5723642025703
61.07515115615873
61.00426936952586
65.5693620937913
61.03537540157241
61.03963072323042
61.062426904655936
65.61265236545785
61.04472151416553
61.06242690465596
61.0016683852234
65.56936209379134
61.01579707416466
61.01468700250781
62.964503061568706
65.56936209379131
65.56936209379131
60.97874166956927
61.02344750689517
61.05606962814454
65.56936209379134
61.05818900159613
61.10499304343422
65.62567338691977
61.08009644728549
61.04613586539235
65.56936209379134
65.56936209379137
61.09696570373582
61.124831572217275
65.56936209379134
65.57236420257027
65.56936209379133
63.35098162106283
61.01966757950906
76.50215768351873
60.967744723762145
76.50215768351866
65.56936209379134
61.01211617223232
64.88725666448525
61.04485772828964
61.02597060956547
65.5693620937913
61.01815211805701
65.56936209379137
76.55519249044666
61.06228647464156
65.56936209379134
61.02121866148916
61.00904369398085
61.09596373763308
61.01468700250779
91.79550684911716
61.05581529

View the final results of K-means clustering for Cho dataset

In [21]:
print(cho_bestkmCluster)
print(" ")
print(cho_bestkmSSE)

[[3 2 3 0 0 0 0 0 0 0 0 3 3 3 3 3 0 3 0 3 3 3 3 3 2 0 3 2 1 4 3 2 3 3 3 0
  3 3 3 3 3 4 0 0 0 3 3 3 3 3 3 3 0 3 0 0 3 3 0 3 3 0 3 2 3 0 0 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 1 3 0 2 2 3 3 2 2 1 2 2 2 2 2 2 2
  2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 3
  3 2 2 2 2 2 3 3 0 3 3 2 2 2 2 2 2 2 3 2 2 2 1 1 1 2 1 1 1 1 1 1 2 1 2 1
  1 1 2 1 3 1 2 2 2 1 1 2 1 2 2 2 2 2 1 1 1 3 1 2 0 1 1 2 2 2 2 1 1 1 1 2
  2 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 2 1 1 2 2 2 2 1 2 1 1 4 1 1 4 4 1 1 1 1
  1 1 4 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 4 4 1 1 1 0
  1 1 1 1 1 1 1 0 4 4 4 4 0 4 4 4 4 1 4 4 4 4 0 4 4 4 0 0 4 0 0 0 0 0 0 0
  1 0 0 0 0 0 4 4 0 0 0 4 4 0 4 4 4 4 0 4 4 0 0 4 0 4]]
 
60.967744723762145


View Confusion Matrix

In [22]:
cho_labels_true = np.array(cho_y)
cho_kmCluster_labels_pred = np.array(cho_bestkmCluster).reshape((-1,))
cho_kmCluster_contingency = contingency_matrix(cho_labels_true, cho_kmCluster_labels_pred, sparse=True)
print(cho_kmCluster_contingency)

  (0, 0)	22
  (0, 1)	1
  (0, 2)	5
  (0, 3)	37
  (0, 4)	2
  (1, 0)	3
  (1, 1)	3
  (1, 2)	113
  (1, 3)	16
  (2, 0)	1
  (2, 1)	43
  (2, 2)	29
  (2, 3)	2
  (3, 0)	4
  (3, 1)	43
  (3, 4)	7
  (4, 0)	25
  (4, 1)	2
  (4, 4)	28


Calculate external index: Adjusted Rand Score

In [23]:
adjusted_rand_score(cho_labels_true, cho_kmCluster_labels_pred)

0.4475000274739754

View sklearn's K-means clustering results of Cho dataset

In [24]:
from sklearn.cluster import KMeans
cho_kmClf = KMeans(n_clusters=5)
cho_kmClf.fit(cho_data_norm)
cho_kmClf.labels_

array([0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 2, 1, 0, 2, 1, 4, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 1, 4, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 0, 3, 0, 1, 2, 2,
       0, 0, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 2, 3,
       0, 3, 2, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 3, 3, 3, 0, 3, 2, 1, 3,
       3, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       2, 3, 2, 3, 2, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 4, 3, 3, 4, 4, 3, 3,
       3, 3, 3, 1, 4, 3, 1, 1, 3, 3, 1, 3, 1, 3, 3,

View adjusted rand score for sklearn's K-means clustering results of Cho dataset

In [25]:
cho_kmClf_labels = cho_kmClf.labels_
adjusted_rand_score(cho_labels_true, cho_kmClf_labels)

0.445531161593973

Perform 100 K-means clustering for Iyer dataset, and select the one with the lowest SSE as the final result

In [26]:
iyer_bestkmCluster = []
iyer_bestkmSSE = np.inf
for i in range(100):
    iyer_kmCluster, iyer_kmSSE = kMeans(iyer_data_norm, 10)
    print(iyer_kmSSE)
    if iyer_kmSSE < iyer_bestkmSSE:
        iyer_bestkmSSE = iyer_kmSSE
        iyer_bestkmCluster = iyer_kmCluster

  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


26.89691552907993
29.898273768809
20.58762825640733
20.827435845516636
27.846781787270057
20.068106718673324
28.623805448326852
19.094392284153482
35.645275929837595
20.994225234153415
27.161712277595793
17.310127371971927
18.930867480479332
19.841243189004693
25.355367823533392
17.76530129268801
20.824356931564147
35.645275929837595
15.459221132540348
27.137000863215775
19.284405049872063
26.436401766490462
16.740083582239485
18.930867480479336
17.758322036379244
16.54585964994289
25.393174609731282
27.84774309771296
20.490918409953654
19.678777292140715
19.060333811514607
18.046508044735724
19.362235030750515
17.86937805399424
18.92272494414439
20.38309293644627
28.110801732191916
18.32519264514476
18.989298630457498
18.984977212887028
20.030699658443556
23.497166204751906
27.161712277595797
19.36168255896204
15.208764423889082
17.873558903445787
24.03626933900059
20.83110809655178
19.855501734733576
21.892829519961317
27.339730460475792
19.366461873288454
17.497785203169485
26.67390

View the final results of K-means clustering for Iyer dataset

In [27]:
print(iyer_bestkmCluster)
print(" ")
print(iyer_bestkmSSE)

[[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 8 3 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 5 5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 3 3 8 9 3 3 3 3 3 8 8 3 9 8 8 8 3 3 3 3 3 8 8 8 8 9
  9 9 9 9 9 8 9 9 9 9 9 9 8 9 9 9 9 9 9 6 1 1 3 1 1 1 8 3 3 3 3 8 8 3 3 8
  8 8 2 8 3 3 3 8 8 3 8 3 8 8 8 3 3 8 8 8 8 8 3 8 8 8 8 8 8 8 8 8 8 9 8 9
  8 9 9 8 8 8 8 8 8 8 8 9 8 8 8 8 8 8 8 8 3 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
  8 8 8 8 8 8 8 8 8 0 8 8 9 9 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 8 8 8
  8 0 0 0 0 3 8 8 8 3 3 8 8 8 3 8 8 8 

View Confusion Matrix

In [28]:
iyer_labels_true = np.array(iyer_y)
iyer_kmCluster_labels_pred = np.array(iyer_bestkmCluster).reshape((-1,))
iyer_kmCluster_contingency = contingency_matrix(iyer_labels_true, iyer_kmCluster_labels_pred, sparse=True)
print(iyer_kmCluster_contingency)

  (0, 3)	25
  (0, 6)	7
  (0, 7)	1
  (1, 3)	100
  (2, 3)	144
  (2, 6)	1
  (3, 3)	32
  (3, 4)	2
  (4, 3)	11
  (4, 6)	12
  (4, 7)	20
  (5, 1)	5
  (5, 3)	1
  (5, 5)	1
  (6, 2)	1
  (6, 3)	10
  (6, 6)	23
  (7, 6)	10
  (7, 7)	4
  (8, 0)	5
  (8, 3)	1
  (8, 6)	53
  (8, 7)	4
  (9, 0)	1
  (9, 2)	1
  (9, 3)	4
  (9, 6)	13
  (10, 1)	5
  (10, 3)	6
  (10, 5)	13
  (10, 6)	1


Calculate external index: Adjusted Rand Score

In [29]:
adjusted_rand_score(iyer_labels_true, iyer_kmCluster_labels_pred)

0.2671347011479459

View sklearn's K-means clustering results of Iyer dataset

In [30]:
from sklearn.cluster import KMeans
iyer_kmClf = KMeans(n_clusters=10)
iyer_kmClf.fit(iyer_data_norm)
iyer_kmClf.labels_

array([6, 0, 0, 6, 0, 0, 0, 0, 0, 6, 0, 0, 0, 6, 6, 6, 6, 0, 6, 6, 6, 6,
       6, 6, 6, 0, 6, 0, 6, 6, 6, 6, 6, 6, 0, 6, 0, 6, 0, 0, 6, 0, 6, 6,
       6, 6, 0, 6, 6, 6, 6, 6, 0, 6, 6, 6, 6, 0, 0, 6, 6, 6, 0, 0, 0, 0,
       6, 0, 0, 6, 6, 6, 0, 0, 0, 0, 0, 6, 6, 0, 6, 0, 0, 0, 6, 6, 0, 0,
       0, 0, 0, 0, 6, 0, 0, 0, 6, 6, 6, 6, 0, 0, 0, 0, 6, 6, 6, 6, 0, 6,
       6, 6, 6, 6, 6, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 6, 0,
       6, 6, 0, 0, 0, 6, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 6, 0, 0, 6, 0, 6,
       6, 0, 0, 6, 6, 6, 6, 6, 6, 0, 6, 0, 6, 0, 6, 6, 6, 6, 0, 0, 0, 0,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 6, 6, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 0, 6, 0, 6, 0, 6, 6, 6, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0,
       0, 6, 6, 0, 0, 6, 0, 0, 0, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6,
       6, 0, 6, 6, 6, 6, 0, 6, 0, 6, 6, 0, 6, 6, 3, 0, 6, 6, 0, 6, 2, 2,
       6, 6, 0, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 0, 0, 0, 0, 0,
       6, 0, 0, 0, 0, 0, 9, 0, 0, 0, 3, 6, 0, 0, 3,

View adjusted rand score for sklearn's K-means clustering results of Iyer dataset

In [31]:
iyer_kmClf_labels = iyer_kmClf.labels_
adjusted_rand_score(iyer_labels_true, iyer_kmClf_labels)

0.22383873517794003

Define the calculation of distance matrix

In [32]:
def distanceMatrix(X):
    X = np.array(X)
    disMatrix = np.zeros((len(X), len(X)))
    for i in range(len(X)):
        for j in range(i+1, len(X)):
            disMatrix[i][j] = dist(X[i], X[j])
            disMatrix[j][i] = disMatrix[i][j]
    return disMatrix

In [33]:
cho_disM = distanceMatrix(cho_data_norm)
iyer_disM = distanceMatrix(iyer_data_norm)

Define the calculation of similarity matrix

In [34]:
def knnMatrix(distanceMatrix, k, sigma=1.0):
    N = len(distanceMatrix)
    knnMatrix = np.zeros((N,N))

    for i in range(N):
        distAndIndex1 = zip(distanceMatrix[i], range(N))
        distAndIndex2 = sorted(distAndIndex1, key=lambda x:x[0]) # Sort by first element

        neighbours_id = [distAndIndex2[m][1] for m in range(k+1)] # xi's k nearest neighbours

        for j in neighbours_id: # xj is xi's neighbour
            knnMatrix[i][j] = np.exp(-distanceMatrix[i][j] / (2 * sigma**2))
            knnMatrix[j][i] = knnMatrix[i][j]
    
    # print(distAndIndex2)
    
    return knnMatrix

In [35]:
cho_knnM = knnMatrix(cho_disM, 50)
iyer_knnM = knnMatrix(iyer_disM, 50)

Define the calculation of Laplacian matrix

In [36]:
def laplacianMatrix(knnMatrix):

    # compute the Degree Matrix: D=sum(A)
    degreeMatrix = np.sum(knnMatrix, axis=1)

    # compute the Laplacian Matrix: L=D-A
    laplacianMatrix = np.diag(degreeMatrix) - knnMatrix

    # normalize: D^(-1/2) L D^(-1/2)
    sqrtDegreeMatrix = np.diag(1.0 / (degreeMatrix ** (0.5)))
    return np.dot(np.dot(sqrtDegreeMatrix, laplacianMatrix), sqrtDegreeMatrix)
    # return laplacianMatrix

In [37]:
cho_laM = laplacianMatrix(cho_knnM)
iyer_laM = laplacianMatrix(iyer_knnM)

Get eigenvalues and eigenvectors

In [38]:
def getEigVec(laM,cluster_num):
    eigValue,eigVector = np.linalg.eig(laM)
    # print(eigValue)
    # print(eigVector)
    dictEigval = dict(zip(eigValue,range(len(eigValue))))
    # print(dictEigval)
    kEig = np.sort(eigValue)[0:cluster_num]
    # print(kEig)
    index = [dictEigval[k] for k in kEig]
    # print(index)
    return eigValue[index],eigVector[:,index]

In [39]:
cho_eigValue,cho_eigVector = getEigVec(cho_laM, 5)
iyer_eigValue,iyer_eigVector = getEigVec(iyer_laM, 10)
# getEigVec(laM, 5)

Perform 100 spectral clustering for Cho dataset, and select the one with the lowest SSE as the final result

In [40]:
cho_bestspCluster = []
cho_bestspSSE = np.inf
for i in range(100):
    cho_spCluster, cho_spSSE = kMeans(cho_eigVector, 5)
    print(cho_spSSE)
    if cho_spSSE < cho_bestspSSE:
        cho_bestspSSE = cho_spSSE
        cho_bestspCluster = cho_spCluster

0.8175190944499277
1.5298035242364207
0.8169329593742042
0.8169329593742034
1.5375917014047507
0.8175190944499279
0.8169329593742037
0.816932959374204
0.8175190944499283
0.816932959374204
0.8171592486571642
0.8171592486571636
0.8169329593742038
1.541710857293372
0.817159248657164
0.816932959374204
0.8175190944499273


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


1.6514326240712167
0.8169329593742036
1.5417108572933718
0.817519094449928
0.8175190944499283
1.5485409293702275
0.817519094449928
1.644308803426477
1.5576648230285421
0.8169329593742038
0.8171592486571635
0.8169329593742037
0.8169329593742035
1.621786565428736
1.538063163530972
0.8169329593742038
0.8169329593742037
0.8169329593742037
1.6513525601930579
0.8169329593742037
1.6577138648305942
0.8175190944499279
0.8169329593742038
1.4666321511080838
1.5491897382766617
0.8175190944499274
0.8175190944499282
0.8169329593742034
0.8175190944499279
0.8169329593742033
0.8171592486571642
0.8169329593742038
0.8169329593742038
0.8175190944499282
0.8169329593742041
0.8171592486571634
0.8171592486571635
1.4666321511080846
1.5483818094675006
2.4265390899100576
0.8169329593742035
0.816932959374204
0.8169329593742035
1.6446686492192402
0.8169329593742035
0.8169329593742044
0.8175190944499279
0.8169329593742041
0.8175190944499279
0.8171592486571639
0.817159248657164
0.8171592486571635
0.8169329593742038


View the final results of spectral clustering for Cho dataset

In [41]:
print(cho_bestspCluster)
print(" ")
print(cho_bestspSSE)

[[2 4 2 0 0 0 0 2 2 0 2 2 2 2 2 2 2 2 4 2 2 2 2 2 4 4 2 4 4 0 2 4 2 2 2 2
  2 4 2 2 2 0 0 0 0 2 2 2 2 2 2 2 2 2 2 0 2 2 0 2 2 4 2 4 2 0 0 3 3 3 3 3
  3 4 3 3 3 3 4 3 3 4 4 3 3 4 3 3 4 2 3 4 3 3 3 4 4 3 3 4 4 3 3 3 4 3 3 3
  4 4 3 2 3 4 4 3 4 3 4 3 3 3 3 3 1 3 2 1 2 2 4 4 2 2 3 4 1 4 3 3 4 3 4 4
  4 2 4 3 3 4 3 3 3 3 4 3 3 3 4 3 3 4 4 3 3 3 3 4 3 4 4 3 2 4 3 4 3 2 3 2
  2 3 4 4 4 4 2 2 2 2 2 3 4 4 3 4 3 4 2 4 4 4 1 1 1 4 1 1 4 1 1 1 4 1 3 1
  1 1 4 1 2 1 4 4 4 1 1 3 1 4 3 4 4 1 1 1 1 2 1 4 4 4 1 4 4 4 3 1 1 1 1 4
  4 1 1 1 1 1 1 1 4 4 1 4 4 4 4 0 3 1 4 4 3 4 4 1 4 1 1 0 1 1 0 0 1 1 1 1
  1 0 0 1 0 0 1 1 0 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0
  1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
 
0.8169329593742033


View Confusion Matrix

In [42]:
cho_labels_true = np.array(cho_y)
cho_spCluster_labels_pred = np.array(cho_bestspCluster).reshape((-1,))
cho_spCluster_contingency = contingency_matrix(cho_labels_true, cho_spCluster_labels_pred, sparse=True)
print(cho_spCluster_contingency)

  (0, 0)	14
  (0, 2)	43
  (0, 4)	10
  (1, 1)	3
  (1, 2)	18
  (1, 3)	66
  (1, 4)	48
  (2, 0)	1
  (2, 1)	37
  (2, 2)	2
  (2, 3)	6
  (2, 4)	29
  (3, 0)	13
  (3, 1)	40
  (3, 4)	1
  (4, 0)	52
  (4, 1)	2
  (4, 2)	1


Calculate external index: Adjusted Rand Score

In [43]:
adjusted_rand_score(cho_labels_true, cho_spCluster_labels_pred)

0.347206481819754

View sklearn's spectral clustering results of Cho dataset

In [44]:
from sklearn.cluster import SpectralClustering
cho_spClf = SpectralClustering(n_clusters=5)
cho_spClf.fit(cho_data_norm)
cho_spClf.labels_

array([1, 2, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 3, 2, 1, 2, 3, 0, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 2, 1, 1, 1, 0,
       3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 3, 1, 1, 2, 2,
       1, 1, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 1, 3,
       1, 3, 2, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3,
       3, 3, 2, 2, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
       2, 3, 2, 3, 2, 3, 3, 2, 2, 2, 3, 3, 2, 3, 3, 0, 3, 3, 0, 0, 3, 3,
       3, 3, 3, 3, 0, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3,

View adjusted rand score for sklearn's spectral clustering results of Cho dataset

In [45]:
cho_spClf_labels = cho_spClf.labels_
adjusted_rand_score(cho_labels_true, cho_spClf_labels)

0.44129189070888103

Perform 100 spectral clustering for Iyer dataset, and select the one with the lowest SSE as the final result

In [46]:
iyer_bestspCluster = []
iyer_bestspSSE = np.inf
for i in range(100):
    iyer_spCluster, iyer_spSSE = kMeans(iyer_eigVector, 10)
    print(iyer_spSSE)
    if iyer_spSSE < iyer_bestspSSE:
        iyer_bestspSSE = iyer_spSSE
        iyer_bestspCluster = iyer_spCluster

2.7594249643615356


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


3.164378692825339
3.7236879528235716
3.214745615537465
3.1640211046367
3.772946258992515
3.1978385752627423
3.881844457644699
3.3006531275111892
2.760845558748697
2.7597414329135943
2.7614071227202057
3.0106183045284194
2.7607490871358316
2.759128149785421
2.7579104629803615
3.069386651008042
3.344111448177349
3.0506935709620335
3.555148852113517
3.2876022888010343
3.4618901914487603
3.4135272157931085
3.514845082434839
3.1084991842371066
3.3613573980943783
2.7597414329135943
3.516009695582969
3.245431813693014
3.72356185988857
3.4306324225279172
3.7323068019062857
3.194481189436504
3.322745465823036
3.419883227642005
3.1643786928253386
2.7596897137569316
3.138970533276104
3.3644021559376016
3.0528551892508338
3.5332096185114996
2.996770092314188
3.40595144556571
3.230498934325742
2.761096202466826
3.015295083253895
2.7581084094391843
2.929406322822905
3.1628377651164046
3.206694634196074
3.6516030200346568
2.9302751482334264
3.24168577591191
2.7693256904110366
2.7648863568617226
3.759

View the final results of spectral clustering for Iyer dataset

In [47]:
print(iyer_bestspCluster)
print(" ")
print(iyer_bestspSSE)

[[5 7 2 6 2 2 2 2 2 6 2 2 0 5 5 6 6 2 6 5 5 6 6 6 6 7 6 7 5 0 8 5 8 8 7 6
  7 6 7 6 5 0 5 6 5 5 2 0 5 0 5 6 2 6 6 0 6 6 6 6 6 5 2 0 0 6 6 6 6 6 6 6
  6 6 0 2 7 6 6 7 6 7 2 2 6 6 6 2 2 2 2 2 0 2 2 7 6 0 5 6 2 7 7 7 8 8 8 8
  8 8 8 8 8 5 5 7 7 8 8 8 8 8 8 8 5 5 8 8 6 6 6 7 0 0 7 7 7 0 7 7 7 2 6 6
  0 7 2 0 5 7 0 8 7 8 5 7 7 5 6 6 8 8 8 7 6 7 5 7 6 5 5 8 7 7 7 7 8 8 5 5
  5 5 5 5 5 8 8 8 8 7 8 5 5 5 5 8 0 0 5 0 0 7 8 7 0 8 8 7 8 8 5 5 5 8 5 5
  5 8 7 8 0 0 5 7 7 5 2 7 7 8 8 5 5 7 7 7 7 0 0 0 0 5 5 7 0 0 5 5 7 5 7 5
  0 0 5 8 8 8 8 8 8 8 4 4 0 0 0 0 5 0 2 2 2 2 2 2 2 2 2 6 0 2 6 6 2 2 6 2
  2 6 6 0 2 2 2 2 9 6 7 6 3 3 3 8 7 8 7 3 3 8 3 3 3 9 7 8 0 8 7 3 3 3 3 3
  3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 0 4 4 4 4 0 4 4 4 7 9 9 9 4
  4 9 1 4 4 4 9 9 9 9 4 4 9 9 9 9 9 9 1 9 9 9 9 9 9 9 1 4 4 0 9 3 1 3 3 3
  3 3 1 3 3 3 3 3 1 3 9 3 9 9 1 1 9 1 9 9 5 9 9 3 9 3 3 9 1 9 1 1 9 9 9 9
  9 9 1 1 9 1 1 1 1 1 9 3 1 1 1 9 3 1 9 9 9 9 9 1 1 1 9 9 9 9 9 1 1 9 9 1
  1 1 1 1 1 9 9 4 4 0 9 4 1 4 4 4 9 9 

View Confusion Matrix

In [48]:
iyer_labels_true = np.array(iyer_y)
iyer_spCluster_labels_pred = np.array(iyer_bestspCluster).reshape((-1,))
iyer_spCluster_contingency = contingency_matrix(iyer_labels_true, iyer_spCluster_labels_pred, sparse=True)
print(iyer_spCluster_contingency)

  (0, 0)	4
  (0, 1)	1
  (0, 3)	2
  (0, 4)	4
  (0, 5)	6
  (0, 6)	2
  (0, 7)	3
  (0, 8)	7
  (0, 9)	4
  (1, 0)	11
  (1, 2)	23
  (1, 5)	14
  (1, 6)	39
  (1, 7)	10
  (1, 8)	3
  (2, 0)	19
  (2, 2)	3
  (2, 5)	34
  (2, 6)	9
  (2, 7)	38
  (2, 8)	42
  (3, 0)	7
  (3, 2)	18
  (3, 4)	2
  (3, 5)	1
  :	:
  (4, 0)	1
  (4, 3)	32
  (4, 7)	4
  (4, 8)	5
  (4, 9)	1
  (5, 0)	1
  (5, 4)	6
  (6, 1)	3
  (6, 4)	9
  (6, 7)	1
  (6, 9)	21
  (7, 1)	2
  (7, 3)	11
  (7, 9)	1
  (8, 1)	28
  (8, 3)	5
  (8, 5)	1
  (8, 9)	29
  (9, 0)	1
  (9, 1)	3
  (9, 4)	10
  (9, 9)	5
  (10, 0)	3
  (10, 2)	1
  (10, 4)	21


Calculate external index: Adjusted Rand Score

In [49]:
adjusted_rand_score(iyer_labels_true, iyer_spCluster_labels_pred)

0.243739928799015

View sklearn's spectral clustering results of Iyer dataset

In [50]:
from sklearn.cluster import SpectralClustering
iyer_spClf = SpectralClustering(n_clusters=10)
iyer_spClf.fit(iyer_data_norm)
iyer_spClf.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0,

View adjusted rand score for sklearn's spectral clustering results of Iyer dataset

In [51]:
iyer_spClf_labels = iyer_spClf.labels_
adjusted_rand_score(iyer_labels_true, iyer_spClf_labels)

0.22409326018025702