Import data processing package

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics.cluster.supervised import contingency_matrix
from sklearn.metrics.cluster import adjusted_rand_score

Import data

In [2]:
def loadDataSet(fileName): 
    data = [] 
    with open(fileName) as f:
        for line in f.readlines():
            line = line.split('\t')
            data.append(line)
        data = np.array(data).astype(np.float32)
    return data

In [3]:
cho_data = loadDataSet('C:/Users/dingl/Desktop/Data Mining/Project1/cho.txt')
iyer_data = loadDataSet('C:/Users/dingl/Desktop/Data Mining/Project1/iyer.txt')

View data

In [4]:
print(cho_data)
print(" ")
print(iyer_data)

[[ 1.00e+00  1.00e+00 -6.90e-01 ... -4.00e-02  1.90e-01  8.20e-01]
 [ 2.00e+00  1.00e+00 -2.10e-01 ... -1.23e+00 -3.25e-01  0.00e+00]
 [ 3.00e+00  1.00e+00 -3.00e-01 ... -1.20e-01 -1.60e-01  6.70e-01]
 ...
 [ 3.84e+02  5.00e+00 -3.12e+00 ...  1.48e+00  2.06e+00  2.36e+00]
 [ 3.85e+02  5.00e+00 -7.90e-01 ...  4.90e-01  8.00e-02  1.50e-01]
 [ 3.86e+02  5.00e+00 -1.16e+00 ...  4.93e-01  1.27e+00  8.70e-01]]
 
[[ 1.00e+00 -1.00e+00  1.00e+00 ...  5.00e-01  6.60e-01  5.20e-01]
 [ 2.00e+00  1.00e+00  1.00e+00 ...  9.00e-01  7.30e-01  7.50e-01]
 [ 3.00e+00  1.00e+00  1.00e+00 ...  7.10e-01  5.70e-01  4.60e-01]
 ...
 [ 5.15e+02  1.00e+01  1.00e+00 ...  2.15e+00  2.05e+00  2.25e+00]
 [ 5.16e+02  1.00e+01  1.00e+00 ...  1.97e+00  2.21e+00  1.79e+00]
 [ 5.17e+02  1.00e+01  1.00e+00 ...  1.86e+00  1.57e+00  1.26e+00]]


View labels for data

In [5]:
cho_y = cho_data[:,1]
iyer_y = iyer_data[:,1]

In [6]:
cho_y = np.array(cho_y.reshape((-1,))).astype(np.int32)
iyer_y = np.array(iyer_y.reshape((-1,))).astype(np.int32)

In [7]:
print(cho_y)
print(" ")
print(iyer_y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
 
[-1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  

Remove invalid data from dataset

In [8]:
cho_data_x = np.delete(cho_data, [0,1], axis=1)
iyer_data_x = np.delete(iyer_data, [0,1,2], axis=1)

In [9]:
print(cho_data_x)
print(" ")
print(iyer_data_x)

[[-0.69  -0.96  -1.16  ... -0.04   0.19   0.82 ]
 [-0.21   0.19   0.86  ... -1.23  -0.325  0.   ]
 [-0.3   -0.56  -0.29  ... -0.12  -0.16   0.67 ]
 ...
 [-3.12  -4.12  -3.54  ...  1.48   2.06   2.36 ]
 [-0.79  -0.56  -0.79  ...  0.49   0.08   0.15 ]
 [-1.16  -1.39  -0.96  ...  0.493  1.27   0.87 ]]
 
[[0.72 0.1  0.57 ... 0.5  0.66 0.52]
 [1.58 1.05 1.15 ... 0.9  0.73 0.75]
 [1.1  0.97 1.   ... 0.71 0.57 0.46]
 ...
 [2.09 3.37 5.52 ... 2.15 2.05 2.25]
 [1.52 4.39 7.03 ... 1.97 2.21 1.79]
 [2.25 4.67 7.94 ... 1.86 1.57 1.26]]


In [10]:
print(cho_data_x.shape)
print(iyer_data_x.shape)

(386, 16)
(517, 11)


Define Euclidean distance function

In [11]:
def dist(A,B):
    return np.sqrt(np.sum(np.power((A-B),2)))

Define randomized center point function

In [12]:
def randCenter(dataset,k):
    m = np.shape(dataset)[1]
    center = np.mat(np.ones((k,m)))
    for i in range(m):
        centmin = min(dataset[:,i])
        centmax = max(dataset[:,i])
        center[:,i] = centmin + (centmax - centmin) * np.random.rand(k,1)
    return center

In [13]:
randCenter(cho_data_x, 5)

matrix([[-1.67789652, -1.78547654,  1.01962182,  0.92940257,  0.50852839,
          0.15581613, -0.16039744,  2.11925695, -1.58219623,  0.08689678,
         -0.30403266,  1.07182241, -1.48147946,  1.05767999,  1.42533583,
          0.13641391],
        [-2.24103125,  0.72234381, -1.06028723,  1.04717796, -0.31678293,
         -2.26645054, -1.61028618, -0.54056276, -0.56627912, -0.5228394 ,
         -1.50182005,  0.74037016,  0.96969923, -0.97455978,  0.53738741,
         -1.26378022],
        [-0.97937752, -0.88000619, -1.73076059, -1.12774947, -0.48265659,
         -0.48055737,  1.59553774,  1.90215872, -0.33605973,  2.12606754,
         -1.24573145, -0.12444411, -0.83534609, -0.70855247,  1.98905816,
          1.96607636],
        [ 1.210236  , -3.39270771, -0.37691085,  0.49234772, -0.88409641,
         -1.09397443, -0.79255918, -0.77736909,  0.67325115, -0.34641134,
         -1.98934471,  0.78049446,  0.37141908,  0.83075257,  1.94146243,
          2.29524648],
        [ 1.16473632

Define K-means function

In [14]:
def kMeans(dataset,k):
    m = np.shape(dataset)[0]
    clusterAssment = np.mat(np.zeros((m,1)))
    centroids = randCenter(dataset,k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = dist(dataset[i],centroids[j])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i] != minIndex:
                clusterChanged = True
            clusterAssment[i] = minIndex
        for center in range(k):
            print("Index:\n", np.nonzero(clusterAssment.A == center))
            ptsIndex = np.nonzero(clusterAssment.A == center)[0]
            ptsInClust = dataset[ptsIndex]
            centroids[center,:] = np.mean(ptsInClust,axis = 0)
    
    # Calculate the internal index SSE
    SSE = 0
    for center in range(k):
        ptsIndex = np.nonzero(clusterAssment.A == center)[0]
        ptsInClust = dataset[ptsIndex]
        for pt in ptsInClust:
            dist2 = dist(pt, centroids[center]) ** 2
            SSE = SSE + dist2
    # print(SSE)
    return clusterAssment.astype(np.int32).reshape((-1,)), SSE

View the minimum and maximum values of each column of data

In [15]:
cho_data_x.min(axis=0)

array([-3.12, -4.12, -3.54, -1.88, -1.89, -2.56, -1.82, -1.86, -1.81,
       -2.19, -2.11, -1.29, -1.5 , -1.63, -1.68, -1.52], dtype=float32)

In [16]:
cho_data_x.max(axis=0)

array([1.95, 1.43, 2.04, 1.61, 1.58, 1.48, 2.45, 2.77, 1.81, 2.16, 1.75,
       1.34, 1.14, 1.48, 2.06, 2.36], dtype=float32)

Define data normalization function

In [17]:
def normalization(data):
    minVals = data.min(axis=0)
    maxVals = data.max(axis=0)
    ranges = maxVals - minVals
    normData = np.zeros(np.shape(data))
    m = data.shape[0]
    normData = data - np.tile(minVals, (m, 1))
    normData = normData/np.tile(ranges, (m, 1))
    return normData

In [18]:
cho_data_norm = normalization(cho_data_x)
iyer_data_norm = normalization(iyer_data_x)

View standardized data

In [19]:
print(cho_data_norm)
print(" ")
print(iyer_data_norm)

[[0.47928995 0.5693694  0.42652333 ... 0.511254   0.5        0.6030928 ]
 [0.5739645  0.7765766  0.78853047 ... 0.12861735 0.3622995  0.39175257]
 [0.556213   0.64144146 0.5824373  ... 0.48553053 0.40641713 0.56443304]
 ...
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.4595661  0.64144146 0.49283156 ... 0.681672   0.47058827 0.43041238]
 [0.38658777 0.49189192 0.4623656  ... 0.6826366  0.7887701  0.6159794 ]]
 
[[0.13486843 0.         0.02021019 ... 0.02259136 0.03891337 0.03669724]
 [0.41776314 0.1181592  0.06709781 ... 0.04916944 0.04405287 0.05779816]
 [0.25986844 0.10820895 0.05497171 ... 0.03654485 0.03230543 0.03119266]
 ...
 [0.5855263  0.4067164  0.42037186 ... 0.13222592 0.14096916 0.19541284]
 [0.39802635 0.5335821  0.5424414  ... 0.12026578 0.15271659 0.153211  ]
 [0.6381579  0.568408   0.61600643 ... 0.11295681 0.10572688 0.10458715]]


Perform K-means clustering for Cho dataset

In [20]:
cho_kmCluster, cho_kmSSE = kMeans(cho_data_norm, 5)

Index:
 (array([ 14, 198, 369], dtype=int64), array([0, 0, 0], dtype=int64))
Index:
 (array([  2,   7,  13,  15,  17,  20,  22,  23,  30,  31,  38,  39,  45,
        46,  48,  49,  50,  56,  57,  60,  62,  63,  66,  67,  68,  70,
        76,  82,  86,  87,  88,  89,  93, 111, 115, 119, 120, 122, 123,
       125, 128, 133, 138, 145, 148, 150, 153, 160, 163, 166, 168, 171,
       177, 179, 183, 185, 186, 187, 190, 205, 208, 211, 217, 219, 227,
       237, 238, 246, 248, 254, 262, 276, 284, 286, 291, 293, 296, 297,
       300, 301, 302, 305, 310, 311, 314, 328, 331, 354, 358, 373, 376],
      dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int64))
Index:
 (array([124, 139, 147, 180, 202, 203, 204, 206, 209, 215, 

      dtype=int64))
Index:
 (array([124, 127, 136, 202, 203, 204, 206, 207, 208, 209, 210, 211, 213,
       215, 216, 217, 219, 221, 226, 228, 234, 235, 236, 238, 241, 242,
       247, 248, 249, 250, 253, 254, 255, 256, 257, 258, 259, 260, 261,
       262, 265, 267, 269, 270, 275, 277, 278, 280, 281, 284, 285, 286,
       287, 288, 291, 294, 295, 297, 299, 300, 301, 302, 303, 304, 305,
       306, 307, 308, 309, 310, 311, 313, 314, 315, 316, 317, 320, 321,
       322, 324, 325, 326, 327, 328, 329, 330, 341, 360], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64))
Index:
 (array([  3,   4,   5,   6,   9,  29,  41,  42,  44,  55,  58,  65, 279,
       282, 283, 290, 293, 296, 312, 318, 319, 323, 332, 333, 334, 335,
 

Index:
 (array([  0,   2,   8,  10,  11,  12,  13,  14,  15,  17,  19,  20,  21,
        22,  23,  26,  30,  32,  33,  34,  35,  36,  38,  39,  40,  45,
        46,  48,  50,  51,  52,  53,  54,  56,  57,  59,  60,  64,  89,
       126, 128, 133, 180, 187, 189, 190, 198, 237, 358], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([  1,   6,   7,  16,  18,  24,  25,  28,  37,  43,  47,  49,  61,
        62,  63,  66,  78,  81,  85,  88,  95,  96, 104, 109, 111, 114,
       129, 130, 131, 132, 135, 137, 142, 143, 144, 145, 146, 172, 175,
       177, 179, 182, 183, 184, 186, 188, 193, 195, 197, 199, 200, 201,
       212, 218, 220, 229, 232, 240, 243, 251, 252, 263, 264, 266, 273,
       274, 289, 292, 293, 298, 331], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0

Index:
 (array([  0,   2,   8,  10,  11,  12,  13,  14,  15,  16,  17,  19,  20,
        21,  22,  23,  26,  30,  32,  33,  34,  35,  36,  38,  39,  40,
        45,  46,  48,  49,  50,  51,  52,  53,  54,  56,  57,  59,  60,
        62,  64,  89, 126, 128, 132, 133, 145, 172, 180, 186, 187, 189,
       190, 198, 237, 358], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([  1,   6,   7,  18,  24,  25,  28,  31,  37,  43,  47,  61,  63,
        66,  78,  81,  85,  88,  95,  96,  99, 104, 109, 111, 113, 114,
       129, 130, 131, 135, 137, 142, 143, 144, 146, 167, 175, 177, 179,
       182, 183, 184, 185, 188, 193, 195, 197, 199, 200, 201, 212, 218,
       220, 225, 229, 231, 232, 239, 240, 243, 251, 252, 263, 264, 266,
       273, 274, 289, 292, 293, 298, 331], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 

Index:
 (array([  0,   2,   7,   8,  10,  11,  12,  13,  14,  15,  16,  17,  19,
        20,  21,  22,  23,  26,  30,  32,  33,  34,  35,  36,  38,  39,
        40,  45,  46,  48,  49,  50,  51,  52,  53,  54,  56,  57,  59,
        60,  62,  64,  89, 111, 126, 128, 129, 132, 133, 145, 172, 180,
       186, 187, 188, 189, 190, 198, 220, 237, 358], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([  1,  18,  24,  25,  27,  28,  31,  37,  43,  47,  61,  63,  66,
        73,  78,  81,  85,  88,  91,  95,  96,  99, 100, 104, 109, 113,
       114, 118, 130, 131, 135, 137, 140, 142, 143, 144, 146, 149, 161,
       167, 170, 173, 175, 177, 179, 182, 183, 184, 185, 192, 193, 195,
       197, 199, 200, 201, 205, 212, 218, 222, 223, 224, 225, 229, 231,
       232, 239, 240, 241, 243, 244, 245, 251, 2

Index:
 (array([ 67,  68,  69,  70,  71,  72,  74,  75,  76,  77,  79,  80,  83,
        84,  86,  87,  90,  92,  93,  94,  97,  98, 101, 102, 103, 105,
       106, 107, 108, 110, 112, 115, 117, 119, 120, 121, 122, 123, 125,
       134, 138, 139, 147, 148, 150, 152, 155, 156, 157, 158, 159, 160,
       163, 164, 165, 166, 168, 171, 174, 176, 178, 181, 191, 194, 196,
       214, 227, 230, 246, 268, 272], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=int64))


View the final results of K-means clustering for Cho dataset

In [21]:
print(cho_kmCluster)
print(" ")
print(cho_kmSSE)

[[0 1 0 3 3 3 3 0 0 3 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 3 0 1 0 0 0 0
  0 1 0 0 0 3 3 3 3 0 0 1 0 0 0 0 0 0 0 3 0 0 3 0 0 1 0 1 0 3 3 4 4 4 4 4
  4 1 4 4 4 4 1 4 4 1 1 4 4 1 4 4 1 0 4 1 4 4 4 1 1 4 4 1 1 4 4 4 1 4 4 4
  4 1 4 0 4 1 1 4 1 4 1 4 4 4 4 4 2 4 0 2 0 0 1 1 0 0 4 1 2 1 4 4 1 1 1 1
  1 0 1 4 4 1 4 1 4 1 1 4 4 4 4 4 4 1 1 4 4 4 4 1 4 1 1 4 0 1 4 1 4 0 4 1
  0 4 1 1 1 1 0 0 0 0 0 4 1 1 4 1 4 1 0 1 1 1 2 2 2 1 2 2 2 2 2 2 1 2 4 2
  2 2 1 2 0 2 1 1 1 2 2 4 2 1 4 1 1 1 2 2 2 0 2 1 1 1 2 1 1 1 4 2 2 2 2 1
  1 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 4 2 1 1 4 1 1 2 1 2 2 3 2 2 3 3 2 2 2 2
  2 2 3 2 3 3 2 2 3 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 3 2 2 2 3
  2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3
  2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]]
 
61.07198573393445


View Confusion Matrix

In [22]:
cho_labels_true = np.array(cho_y)
cho_kmCluster_labels_pred = np.array(cho_kmCluster).reshape((-1,))
cho_kmCluster_contingency = contingency_matrix(cho_labels_true, cho_kmCluster_labels_pred, sparse=True)
print(cho_kmCluster_contingency)

  (0, 0)	42
  (0, 1)	11
  (0, 3)	14
  (1, 0)	17
  (1, 1)	50
  (1, 2)	3
  (1, 4)	65
  (2, 0)	2
  (2, 1)	26
  (2, 2)	41
  (2, 4)	6
  (3, 1)	1
  (3, 2)	42
  (3, 3)	11
  (4, 0)	1
  (4, 2)	2
  (4, 3)	52


Calculate external index: Adjusted Rand Score

In [23]:
adjusted_rand_score(cho_labels_true, cho_kmCluster_labels_pred)

0.3543166467476026

View sklearn's K-means clustering results of Cho dataset

In [24]:
from sklearn.cluster import KMeans
cho_kmClf = KMeans(n_clusters=5)
cho_kmClf.fit(cho_data_norm)
cho_kmClf.labels_

array([1, 4, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 0, 2, 1, 4, 0, 3, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 4, 1, 2,
       2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4,
       4, 1, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0,
       4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 0, 1, 1, 4, 4,
       1, 1, 4, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4,
       4, 1, 4, 1, 1, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4,
       1, 4, 4, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 4, 0,
       1, 0, 4, 4, 4, 0, 0, 4, 0, 4, 4, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 4, 4, 4, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,
       4, 0, 4, 0, 4, 0, 0, 4, 4, 4, 0, 0, 4, 0, 0, 3, 0, 0, 3, 3, 0, 0,
       0, 0, 0, 0, 3, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0,

View adjusted rand score for sklearn's K-means clustering results of Cho dataset

In [25]:
cho_kmClf_labels = cho_kmClf.labels_
adjusted_rand_score(cho_labels_true, cho_kmClf_labels)

0.44895658730716054

Perform K-means clustering for Iyer dataset

In [26]:
iyer_kmCluster, iyer_kmSSE = kMeans(iyer_data_norm, 10)

Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([362], dtype=int64), array([0], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 12

  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([362], dtype=int64), array([0], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,

 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([362], dtype=int64), array([0], dtype=int64))
Index:
 (array([], dtype=int64), array([], dtype=int64))
Index:
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,

View the final results of K-means clustering for Iyer dataset

In [27]:
print(iyer_kmCluster)
print(" ")
print(iyer_kmSSE)

[[5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 3 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 5 5 5 5 5 5 5 5 8 5 5 5 8 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
  5 8 8 8 8 5 5 5 5 5 5 5 5 5 5 5 5 5 

View Confusion Matrix

In [28]:
iyer_labels_true = np.array(iyer_y)
iyer_kmCluster_labels_pred = np.array(iyer_kmCluster).reshape((-1,))
iyer_kmCluster_contingency = contingency_matrix(iyer_labels_true, iyer_kmCluster_labels_pred, sparse=True)
print(iyer_kmCluster_contingency)

  (0, 1)	33
  (1, 1)	100
  (2, 1)	145
  (3, 1)	34
  (4, 1)	43
  (5, 1)	7
  (6, 0)	1
  (6, 1)	33
  (7, 1)	14
  (8, 1)	57
  (8, 2)	6
  (9, 1)	17
  (9, 2)	2
  (10, 1)	25


Calculate external index: Adjusted Rand Score

In [29]:
adjusted_rand_score(iyer_labels_true, iyer_kmCluster_labels_pred)

0.005361649684800414

View sklearn's K-means clustering results of Iyer dataset

In [30]:
from sklearn.cluster import KMeans
iyer_kmClf = KMeans(n_clusters=10)
iyer_kmClf.fit(iyer_data_norm)
iyer_kmClf.labels_

array([0, 5, 5, 0, 5, 5, 5, 5, 5, 0, 5, 5, 5, 0, 0, 0, 0, 5, 0, 0, 0, 0,
       0, 0, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 5, 5, 0, 5, 0, 0,
       0, 0, 5, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 5, 0, 0, 0, 5, 5, 5, 5,
       0, 5, 5, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0, 5, 0, 5, 5, 5, 0, 0, 5, 5,
       5, 5, 5, 5, 0, 5, 5, 5, 0, 0, 0, 0, 5, 5, 5, 5, 0, 0, 0, 0, 5, 0,
       0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 5,
       0, 0, 5, 5, 5, 0, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 0, 5, 5, 0, 5, 0,
       0, 5, 5, 0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 5, 0, 0, 0, 0, 5, 5, 5, 5,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 5, 0, 5, 0, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5,
       5, 0, 0, 5, 5, 0, 5, 5, 5, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 0,
       0, 5, 0, 0, 0, 0, 5, 0, 5, 0, 0, 5, 0, 0, 0, 5, 0, 0, 5, 0, 1, 1,
       0, 0, 5, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, 5, 5,
       0, 5, 5, 5, 5, 5, 9, 5, 5, 5, 8, 0, 5, 5, 2,

View adjusted rand score for sklearn's K-means clustering results of Iyer dataset

In [31]:
iyer_kmClf_labels = iyer_kmClf.labels_
adjusted_rand_score(iyer_labels_true, iyer_kmClf_labels)

0.2539361815866091

Define the calculation of distance matrix

In [32]:
def distanceMatrix(X):
    X = np.array(X)
    disMatrix = np.zeros((len(X), len(X)))
    for i in range(len(X)):
        for j in range(i+1, len(X)):
            disMatrix[i][j] = dist(X[i], X[j])
            disMatrix[j][i] = disMatrix[i][j]
    return disMatrix

In [33]:
cho_disM = distanceMatrix(cho_data_norm)
iyer_disM = distanceMatrix(iyer_data_norm)

Define the calculation of similarity matrix

In [34]:
def knnMatrix(distanceMatrix, k, sigma=1.0):
    N = len(distanceMatrix)
    knnMatrix = np.zeros((N,N))

    for i in range(N):
        distAndIndex1 = zip(distanceMatrix[i], range(N))
        distAndIndex2 = sorted(distAndIndex1, key=lambda x:x[0]) # Sort by first element

        neighbours_id = [distAndIndex2[m][1] for m in range(k+1)] # xi's k nearest neighbours

        for j in neighbours_id: # xj is xi's neighbour
            knnMatrix[i][j] = np.exp(-distanceMatrix[i][j] / (2 * sigma**2))
            knnMatrix[j][i] = knnMatrix[i][j]
    
    print("distAndIndex:\n", distAndIndex2)
    
    return knnMatrix

In [35]:
cho_knnM = knnMatrix(cho_disM, 50)
iyer_knnM = knnMatrix(iyer_disM, 50)

distAndIndex:
 [(0.0, 385), (0.2592616379261017, 381), (0.33098694682121277, 367), (0.35315659642219543, 347), (0.35429927706718445, 363), (0.36762332916259766, 337), (0.3968319892883301, 65), (0.39744433760643005, 336), (0.4151173532009125, 364), (0.42536500096321106, 377), (0.4296271800994873, 379), (0.434520959854126, 366), (0.43904078006744385, 382), (0.44663289189338684, 371), (0.46008428931236267, 5), (0.4633939266204834, 59), (0.4701485335826874, 332), (0.4739886224269867, 344), (0.47652068734169006, 41), (0.48354944586753845, 356), (0.49092739820480347, 279), (0.4940636456012726, 345), (0.4945293068885803, 343), (0.495764821767807, 282), (0.49690625071525574, 333), (0.5023133158683777, 19), (0.502435564994812, 351), (0.5041884183883667, 51), (0.5192199945449829, 11), (0.5215776562690735, 348), (0.5216768980026245, 15), (0.5222569704055786, 384), (0.5266531705856323, 335), (0.529018223285675, 44), (0.5295048952102661, 362), (0.5339887738227844, 53), (0.5358448028564453, 33), (0.

distAndIndex:
 [(0.0, 516), (0.21466068923473358, 513), (0.2671765983104706, 515), (0.2948733866214752, 514), (0.3737364411354065, 503), (0.5055197477340698, 344), (0.5327000021934509, 512), (0.5395280718803406, 510), (0.5464602112770081, 345), (0.5625216960906982, 349), (0.5658907890319824, 505), (0.566342830657959, 263), (0.584435760974884, 262), (0.5869863629341125, 497), (0.5936562418937683, 348), (0.594650387763977, 507), (0.6003750562667847, 343), (0.6621884703636169, 500), (0.6740216016769409, 511), (0.6791495084762573, 509), (0.7087994813919067, 508), (0.7199485898017883, 502), (0.7335199117660522, 504), (0.7346075773239136, 481), (0.7359363436698914, 492), (0.7391204237937927, 506), (0.7394900918006897, 292), (0.7419753670692444, 347), (0.7569035887718201, 493), (0.7578974366188049, 350), (0.7630080580711365, 285), (0.8020944595336914, 407), (0.8074976801872253, 346), (0.8092098236083984, 489), (0.8106572031974792, 475), (0.8138717412948608, 483), (0.8158871531486511, 457), (0

Define the calculation of Laplacian matrix

In [36]:
def laplacianMatrix(knnMatrix):

    # compute the Degree Matrix: D=sum(A)
    degreeMatrix = np.sum(knnMatrix, axis=1)

    # compute the Laplacian Matrix: L=D-A
    laplacianMatrix = np.diag(degreeMatrix) - knnMatrix

    # normalize: D^(-1/2) L D^(-1/2)
    sqrtDegreeMatrix = np.diag(1.0 / (degreeMatrix ** (0.5)))
    return np.dot(np.dot(sqrtDegreeMatrix, laplacianMatrix), sqrtDegreeMatrix)
    # return laplacianMatrix

In [37]:
cho_laM = laplacianMatrix(cho_knnM)
iyer_laM = laplacianMatrix(iyer_knnM)

Get eigenvalues and eigenvectors

In [38]:
def getEigVec(laM,cluster_num):
    eigValue,eigVector = np.linalg.eig(laM)
    print("eigValue:\n", eigValue)
    print("eigVector:\n", eigVector)
    dictEigval = dict(zip(eigValue,range(len(eigValue))))
    print("dictEigval:\n", dictEigval)
    kEig = np.sort(eigValue)[0:cluster_num]
    print("kEig:\n", kEig)
    index = [dictEigval[k] for k in kEig]
    print("index:\n", index)
    return eigValue[index],eigVector[:,index]

In [39]:
cho_eigValue,cho_eigVector = getEigVec(cho_laM, 5)
iyer_eigValue,iyer_eigVector = getEigVec(iyer_laM, 10)
# getEigVec(laM, 5)

eigValue:
 [8.32667268e-17 9.21935200e-02 1.79737018e-01 2.94835290e-01
 4.35388344e-01 5.28478611e-01 5.54015865e-01 6.31214989e-01
 6.66543444e-01 6.94579855e-01 7.35919929e-01 7.54364492e-01
 7.67132592e-01 7.80558098e-01 7.97848308e-01 8.08045624e-01
 8.16343695e-01 8.31760684e-01 8.38680952e-01 8.48087251e-01
 8.57803962e-01 8.63449775e-01 8.67511663e-01 1.12955837e+00
 8.71469911e-01 8.76737830e-01 8.75812628e-01 8.81149572e-01
 1.12272682e+00 1.11965609e+00 8.89112935e-01 8.90592085e-01
 8.93181448e-01 8.94538221e-01 1.11771648e+00 1.11430709e+00
 9.01025630e-01 9.03039389e-01 9.05038569e-01 9.09033908e-01
 9.09294106e-01 9.11580254e-01 1.10905545e+00 1.10581325e+00
 1.10295354e+00 1.10026522e+00 1.09913821e+00 1.09797182e+00
 1.09757550e+00 1.09470453e+00 1.09439354e+00 1.09336160e+00
 1.09181936e+00 1.09058631e+00 1.08865804e+00 1.08813208e+00
 1.08716870e+00 1.08585790e+00 1.08395061e+00 9.14487615e-01
 9.16644692e-01 9.17616187e-01 9.18903388e-01 9.19260899e-01
 9.20767403e-

eigValue:
 [6.93889390e-17 8.45407397e-02 1.47181328e-01 2.79583173e-01
 3.25506005e-01 3.73245834e-01 4.47048940e-01 4.73066156e-01
 5.70039469e-01 5.83712664e-01 6.03376129e-01 6.42324038e-01
 6.74385468e-01 7.06519107e-01 7.29965247e-01 7.43153503e-01
 7.61092515e-01 7.78440397e-01 7.92771775e-01 8.06671757e-01
 8.31068022e-01 8.32439648e-01 8.41531292e-01 8.45133315e-01
 8.55327009e-01 8.59209232e-01 8.61793439e-01 8.70677388e-01
 1.14575594e+00 8.76342565e-01 8.82865482e-01 8.86030897e-01
 8.88693963e-01 8.89979822e-01 8.92648988e-01 1.13866096e+00
 8.98742184e-01 1.13508932e+00 9.04119412e-01 9.07069488e-01
 1.13161197e+00 1.12871358e+00 1.12409333e+00 1.12084493e+00
 1.11974224e+00 1.11774129e+00 1.11715621e+00 1.11396144e+00
 1.11190116e+00 1.11102211e+00 1.11034893e+00 1.10925646e+00
 1.10664574e+00 1.10436637e+00 1.10326123e+00 1.10170639e+00
 1.09978759e+00 1.09939101e+00 1.09716850e+00 1.09624747e+00
 1.09628797e+00 1.09394577e+00 1.09353924e+00 1.09234761e+00
 1.09103840e+

Perform spectral clustering for Cho dataset

In [40]:
cho_spCluster, cho_spSSE = kMeans(cho_eigVector, 5)

Index:
 (array([  3,   4,   5,   6,   8,   9,  10,  11,  16,  21,  29,  40,  41,
        42,  43,  44,  54,  55,  58,  65, 279, 282, 293, 323, 331, 332,
       333, 334, 335, 336, 337, 339, 340, 344, 346, 347, 349, 350, 351,
       353, 354, 356, 357, 358, 359, 361, 362, 363, 364, 365, 366, 367,
       368, 370, 371, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382,
       383, 384, 385], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int64))
Index:
 (array([ 24,  31,  85,  88,  95,  96, 100, 109, 114, 130, 135, 142, 143,
       167, 170, 184, 185, 193, 195, 197, 199, 200, 201, 212, 222, 232,
       263, 264, 266, 273], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([ 99, 104, 144], d

Index:
 (array([  3,   4,   5,   6,   7,   8,   9,  10,  29,  41,  42,  43,  44,
        54,  55,  58,  65,  66, 267, 279, 282, 283, 289, 290, 292, 293,
       296, 312, 317, 318, 319, 323, 331, 332, 333, 334, 335, 336, 337,
       338, 339, 340, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351,
       352, 353, 354, 355, 356, 357, 359, 361, 362, 363, 364, 365, 366,
       367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379,
       380, 381, 382, 383, 384, 385], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([  1,  18,  24,  25,  27,  28,  31,  37,  61,  63,  81,  85,  88,
        95,  96,  99, 100, 109, 114, 130, 135, 137, 142, 143, 144, 146,
       167, 170, 175, 182, 184, 185, 188, 193, 195, 197, 19

Index:
 (array([124, 127, 136, 202, 203, 204, 206, 207, 209, 210, 211, 213, 215,
       216, 217, 219, 221, 225, 226, 228, 233, 234, 235, 236, 238, 242,
       247, 248, 249, 250, 253, 254, 255, 256, 257, 258, 259, 262, 269,
       275, 277, 278, 280, 281, 284, 285, 286, 287, 288, 291, 294, 295,
       297, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310,
       311, 313, 314, 315, 316, 320, 321, 322, 324, 325, 326, 327, 328,
       329, 330, 341, 360], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([  3,   4,   5,   6,   9,  29,  41,  42,  43,  44,  55,  58,  65,
        66, 267, 279, 282, 283, 289, 290, 292, 293, 296, 312, 317, 318,
       319, 323, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 342,
   

View the final results of spectral clustering for Cho dataset

In [41]:
print(cho_spCluster)
print(" ")
print(cho_spSSE)

[[3 1 3 0 0 0 0 3 3 0 3 3 3 3 3 3 3 3 1 3 3 3 3 3 1 1 3 1 1 0 3 1 3 3 3 3
  3 1 3 3 3 0 0 0 0 3 3 3 3 3 3 3 3 3 3 0 3 3 0 3 3 1 3 1 3 0 0 2 2 2 2 2
  2 1 2 2 2 2 1 2 2 1 1 2 2 1 2 2 1 3 2 1 2 2 2 1 1 2 2 1 1 2 2 2 1 2 2 2
  2 1 2 3 2 1 1 2 2 2 1 2 2 2 2 2 4 2 3 4 3 3 1 1 3 3 2 1 4 1 2 2 1 2 1 1
  1 3 1 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 1 2 3 1 2 1 2 3 2 3
  3 2 1 1 1 1 3 3 3 3 3 2 1 1 2 1 2 1 3 1 1 1 4 4 4 1 4 4 1 4 4 4 1 4 2 4
  4 4 1 4 3 4 1 1 1 4 4 2 4 1 2 1 1 4 4 4 4 3 4 1 1 1 4 1 1 1 2 4 4 4 4 1
  1 4 4 4 4 4 4 4 1 1 4 1 1 1 1 0 2 4 1 1 2 1 1 4 2 4 4 0 4 4 0 0 4 4 4 4
  4 0 0 4 0 0 4 4 0 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 0 4 4 4 4 0 0 0 4 4 4 0
  4 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0
  4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
 
0.817519094449928


View Confusion Matrix

In [42]:
cho_labels_true = np.array(cho_y)
cho_spCluster_labels_pred = np.array(cho_spCluster).reshape((-1,))
cho_spCluster_contingency = contingency_matrix(cho_labels_true, cho_spCluster_labels_pred, sparse=True)
print(cho_spCluster_contingency)

  (0, 0)	14
  (0, 1)	10
  (0, 3)	43
  (1, 1)	45
  (1, 2)	69
  (1, 3)	18
  (1, 4)	3
  (2, 0)	1
  (2, 1)	28
  (2, 2)	7
  (2, 3)	2
  (2, 4)	37
  (3, 0)	13
  (3, 1)	1
  (3, 4)	40
  (4, 0)	52
  (4, 3)	1
  (4, 4)	2


Calculate external index: Adjusted Rand Score

In [43]:
adjusted_rand_score(cho_labels_true, cho_spCluster_labels_pred)

0.35172231241434415

View sklearn's spectral clustering results of Cho dataset

In [44]:
from sklearn.cluster import SpectralClustering
cho_spClf = SpectralClustering(n_clusters=5)
cho_spClf.fit(cho_data_norm)
cho_spClf.labels_

array([4, 0, 4, 1, 1, 1, 1, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4,
       4, 4, 3, 0, 4, 0, 3, 1, 4, 0, 4, 4, 4, 4, 4, 0, 4, 4, 4, 1, 1, 1,
       1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 1, 4, 4, 0, 4, 4, 4, 1,
       3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
       0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
       0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 3, 4, 4, 0, 0,
       4, 4, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0,
       0, 4, 0, 4, 4, 0, 0, 4, 0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0,
       4, 0, 0, 0, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 3, 4, 3,
       4, 3, 0, 0, 0, 3, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3,
       3, 3, 0, 0, 0, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0,
       0, 3, 0, 3, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 1, 3, 3, 1, 1, 3, 3,
       3, 3, 3, 3, 1, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3,

View adjusted rand score for sklearn's spectral clustering results of Cho dataset

In [45]:
cho_spClf_labels = cho_spClf.labels_
adjusted_rand_score(cho_labels_true, cho_spClf_labels)

0.44129189070888103

Perform spectral clustering for Iyer dataset

In [46]:
iyer_spCluster, iyer_spSSE = kMeans(iyer_eigVector, 10)

Index:
 (array([372, 373, 374, 377, 408, 409, 412, 420, 423, 425, 428, 430, 433,
       436, 442, 450, 451, 459, 460, 461], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64))
Index:
 (array([  9,  13,  19,  21,  28,  29,  42,  44,  45,  47,  49,  51,  53,
        55,  59,  60,  61,  84,  85,  92,  97, 128, 133, 137, 148, 166,
       198, 244, 245, 253, 256, 264, 267, 268, 269, 297, 300, 301, 302,
       306, 307, 308, 310, 311, 312, 318, 319, 320, 321, 322, 323, 324,
       325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
       338, 339, 340, 341, 342, 351, 352, 354, 355, 356, 357, 366, 369,
       375, 376, 382, 383, 384, 389, 390, 391, 393, 394, 396, 397, 399,
       400, 401, 402, 403, 405, 406, 417, 418, 419, 421, 422, 443, 452,
       453, 454, 462, 473, 484, 492, 498, 501], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Index:
 (array([357, 372, 373, 374, 375, 376, 377, 379, 382, 383, 384, 385, 390,
       406, 408, 409, 412, 414, 415, 417, 418, 420, 423, 425, 428, 429,
       430, 431, 432, 433, 436, 442, 447, 450, 451, 452, 453, 454, 458,
       459, 460, 461, 462, 465, 466, 473, 485], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int64))
Index:
 (array([256, 300, 301, 302, 306, 307, 308, 310, 311, 312, 313, 318, 319,
       320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332,
       333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 391, 393, 394,
       395, 396, 397, 399, 400, 401, 402, 403, 405, 419, 421, 422, 443,
       448], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([ 

Index:
 (array([357, 372, 373, 374, 375, 376, 377, 379, 382, 383, 384, 385, 390,
       406, 408, 409, 412, 414, 415, 417, 418, 420, 423, 425, 428, 429,
       430, 431, 432, 433, 436, 442, 447, 450, 451, 452, 453, 454, 458,
       459, 460, 461, 462, 465, 466, 473, 485], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int64))
Index:
 (array([300, 301, 302, 306, 307, 308, 310, 311, 312, 313, 318, 319, 320,
       321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333,
       334, 335, 336, 337, 338, 339, 340, 341, 342, 391, 393, 394, 395,
       396, 397, 399, 400, 401, 402, 403, 405, 419, 421, 422, 443, 448],
      dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([  3,   9, 

Index:
 (array([  8,  12,  41,  46,  47,  49,  55,  63,  64,  74,  87,  89,  91,
        92,  97, 100, 132, 133, 137, 144, 146, 147, 150, 196, 197, 199,
       200, 204, 220, 221, 237, 238, 239, 240, 244, 245, 252, 264, 265,
       266, 267, 269, 270, 271, 272, 273, 274, 277, 278, 280, 281, 287,
       288, 291, 293, 294, 316, 346, 351, 389, 496, 499], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([253, 296, 299, 352, 353, 354, 355, 356, 358, 360, 361, 363, 364,
       365, 366, 367, 368, 369, 370, 371, 380, 381, 386, 387, 388, 474,
       477, 478, 479, 482, 483, 484, 486, 487, 488, 489, 493, 494, 495,
       501], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 

Index:
 (array([  8,  12,  41,  46,  47,  49,  55,  63,  64,  74,  87,  89,  91,
        92,  97, 100, 132, 133, 137, 144, 146, 147, 150, 196, 197, 199,
       200, 204, 220, 221, 237, 238, 239, 240, 244, 245, 252, 264, 265,
       266, 267, 269, 270, 272, 273, 274, 277, 278, 280, 281, 288, 291,
       294, 316, 346, 351, 389, 496, 499], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
Index:
 (array([253, 296, 299, 352, 353, 354, 355, 356, 358, 360, 361, 363, 364,
       365, 366, 367, 368, 369, 370, 371, 380, 381, 386, 387, 388, 474,
       477, 478, 479, 482, 483, 484, 486, 487, 488, 489, 493, 494, 495,
       501, 502], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64))
Index:
 (array([ 3

View the final results of spectral clustering for Iyer dataset

In [47]:
print(iyer_spCluster)
print(" ")
print(iyer_spSSE)

[[4 6 6 2 6 6 6 6 7 2 6 6 7 4 4 2 2 6 2 4 4 2 2 2 2 6 2 6 4 2 9 4 9 9 6 2
  6 2 6 2 4 7 4 2 4 4 7 7 4 7 4 2 6 2 2 7 2 2 2 2 2 4 6 7 7 2 2 2 2 2 2 2
  2 2 7 5 6 2 2 6 2 6 6 6 2 2 2 7 6 7 6 7 7 6 6 6 2 7 4 2 7 6 6 6 9 9 9 9
  9 9 9 9 9 4 4 9 6 9 9 9 9 9 9 9 4 4 9 9 2 2 2 6 7 7 6 6 6 7 6 6 6 6 2 2
  7 6 7 7 4 6 7 9 6 9 4 6 6 4 2 2 9 9 9 6 2 6 4 2 2 4 4 9 6 6 6 6 9 9 4 9
  4 4 4 4 4 9 9 9 9 6 9 4 4 4 4 9 7 7 4 7 7 6 9 9 7 9 9 6 9 9 4 4 9 9 4 4
  4 9 6 9 7 7 4 6 6 4 6 6 6 9 9 4 4 9 6 6 6 7 7 7 7 4 4 6 7 7 4 4 6 4 6 4
  7 8 4 9 9 9 9 9 9 9 5 5 7 7 7 7 4 7 7 5 7 7 7 5 5 7 7 2 7 7 2 2 6 5 2 5
  7 2 2 7 5 5 7 5 8 2 6 8 1 1 1 9 9 9 1 1 1 9 1 1 1 1 6 9 7 9 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 5 5 7 5 5 5 5 7 8 8 8 8 8 0 8 5
  8 8 3 8 8 8 8 8 8 8 8 8 0 0 0 0 0 0 3 0 8 8 0 0 0 0 8 8 8 7 0 1 3 1 1 1
  1 1 3 1 1 1 1 1 3 1 0 3 0 0 3 3 0 3 0 0 4 0 0 1 0 1 1 0 3 0 3 3 0 0 0 0
  0 0 3 3 0 3 3 3 3 3 0 1 3 3 3 0 1 3 0 0 0 0 0 3 3 3 0 0 0 0 0 3 3 0 0 3
  3 3 3 3 3 0 8 5 5 8 8 8 3 5 8 8 8 0 

View Confusion Matrix

In [48]:
iyer_labels_true = np.array(iyer_y)
iyer_spCluster_labels_pred = np.array(iyer_spCluster).reshape((-1,))
iyer_spCluster_contingency = contingency_matrix(iyer_labels_true, iyer_spCluster_labels_pred, sparse=True)
print(iyer_spCluster_contingency)

  (0, 0)	3
  (0, 1)	1
  (0, 2)	1
  (0, 3)	2
  (0, 4)	6
  (0, 5)	1
  (0, 6)	3
  (0, 7)	3
  (0, 8)	6
  (0, 9)	7
  (1, 2)	40
  (1, 4)	14
  (1, 5)	1
  (1, 6)	26
  (1, 7)	16
  (1, 9)	3
  (2, 2)	10
  (2, 4)	32
  (2, 6)	36
  (2, 7)	20
  (2, 9)	47
  (3, 2)	6
  (3, 4)	1
  (3, 5)	10
  (3, 6)	1
  :	:
  (4, 1)	35
  (4, 6)	1
  (4, 7)	1
  (4, 9)	6
  (5, 5)	6
  (5, 7)	1
  (6, 0)	12
  (6, 3)	2
  (6, 5)	1
  (6, 8)	19
  (7, 0)	1
  (7, 1)	11
  (7, 3)	2
  (8, 0)	29
  (8, 1)	5
  (8, 3)	28
  (8, 4)	1
  (9, 0)	2
  (9, 3)	3
  (9, 5)	3
  (9, 8)	11
  (10, 4)	1
  (10, 5)	17
  (10, 7)	2
  (10, 8)	5


Calculate external index: Adjusted Rand Score

In [49]:
adjusted_rand_score(iyer_labels_true, iyer_spCluster_labels_pred)

0.25672341147258215

View sklearn's spectral clustering results of Iyer dataset

In [50]:
from sklearn.cluster import SpectralClustering
iyer_spClf = SpectralClustering(n_clusters=10)
iyer_spClf.fit(iyer_data_norm)
iyer_spClf.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 8,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

View adjusted rand score for sklearn's spectral clustering results of Iyer dataset

In [51]:
iyer_spClf_labels = iyer_spClf.labels_
adjusted_rand_score(iyer_labels_true, iyer_spClf_labels)

0.2171215123632506