In [2]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import simpsom as sps
from sklearn import metrics #for evaluation




In [None]:
# Load the Iris dataset from a CSV file addressed relative to the notebook,
# then print the column names to confirm what was read.
# NOTE(review): the K-Means section further down reads 'Iris.csv' from the
# working directory instead — confirm both paths point to the same file.
Iris = pd.read_csv('../week08/Iris.csv')
print(Iris.columns)

In [90]:
# Take the four measurement columns as the feature table and keep the
# species column as the reference labels used later for evaluation.
X = Iris.loc[:, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = Iris.loc[:, 'Species']

# Standardise the feature values; the fitted scaler stays available as
# `scaler` for any later reuse.
scaler = StandardScaler()
X_scale = scaler.fit_transform(X.values)



In [108]:
# Show the (rows, columns) shape of the feature table.
X.shape

(150, 4)

### SOM

In [91]:
# Create a SOMNet from the standardised features with size arguments 10 and 10
# and periodic boundary conditions switched on (PBC=True), then train it
# using the library's default training arguments.
# NOTE(review): no random seed is passed here, so results may differ between
# runs — confirm whether the installed simpsom version accepts one.
net = sps.SOMNet(10, 10, X_scale, PBC=True) 
net.train()


Periodic Boundary Conditions active.
The weights will be initialized with PCA.
The map will be trained with the batch algorithm.
Training SOM... done!


In [92]:
# Cluster the standardised samples via the trained map using the 'KMeans'
# clustering type. The loop that follows treats `cls` as one collection of
# sample indices per cluster.
cls = net.cluster(X_scale, clus_type='KMeans')  # alternative tried here: 'MeanShift'



<Figure size 432x288 with 0 Axes>

In [93]:
#net.nodes_graph(colnum=1)
#net.diff_graph()

In [94]:
# One cluster label per sample. Labels are stored as integers (they are
# category ids, not measurements), and every slot starts at -1 so that a
# sample which never gets assigned to a cluster stays visible instead of
# silently being counted as a member of cluster 0.
cls_id = np.full(len(X_scale), -1, dtype=int)

In [95]:
# Flatten the per-cluster index collections into the label vector: every
# sample index listed under cluster k receives label k.
for cluster_label, member_idx in enumerate(cls):
    cls_id[member_idx] = cluster_label
    

In [96]:
# Check that the label vector has one entry per sample.
cls_id.shape

(150,)

## Rand Index/Score

In [97]:

# Rand index between the true species labels and the SOM cluster labels.
# The result is deliberately stored under a name that differs from the
# sklearn function `rand_score`: binding a float to that name would shadow a
# directly imported `rand_score` function and make later calls to it fail.
som_rand = metrics.rand_score(y, cls_id)
print('Rand Score >> ', som_rand)

Rand Score >>  0.7365548098434005


In [111]:
# Display the cluster label assigned to every sample.
cls_id

array([7., 0., 0., 0., 7., 2., 7., 7., 0., 0., 2., 7., 0., 0., 2., 2., 2.,
       7., 2., 2., 7., 7., 7., 7., 7., 0., 7., 7., 7., 0., 0., 7., 2., 2.,
       0., 0., 7., 0., 0., 7., 7., 0., 0., 7., 2., 0., 2., 0., 7., 7., 5.,
       3., 5., 6., 4., 1., 3., 6., 5., 6., 6., 1., 6., 1., 1., 5., 1., 6.,
       4., 6., 3., 1., 4., 1., 1., 5., 5., 5., 1., 6., 6., 6., 6., 4., 1.,
       3., 5., 4., 1., 6., 6., 1., 6., 6., 6., 1., 1., 1., 6., 1., 3., 4.,
       5., 4., 3., 5., 6., 5., 4., 5., 3., 4., 5., 4., 4., 3., 5., 5., 5.,
       4., 5., 4., 5., 4., 3., 5., 4., 1., 4., 5., 5., 5., 4., 4., 4., 5.,
       3., 3., 1., 5., 3., 5., 4., 5., 3., 5., 4., 5., 3., 1.])

In [113]:
# Display the true species labels for comparison with the cluster labels.
y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object

### Entropy

In [99]:
# Cross-tabulate the true species labels against the assigned cluster labels.
contingency_matrix = metrics.cluster.contingency_matrix(labels_true=y, labels_pred=cls_id)

In [117]:
# Contingency table entered by hand from the in-class example: 3 rows by
# 8 columns. The cells that follow sum it column-wise.
# NOTE(review): this assignment replaces the table computed from `y` and
# `cls_id` in the previous cell — confirm that using the in-class numbers
# instead of this run's clustering is intended.
contingency_matrix = np.array((
    (0, 18, 0, 0, 0, 18, 0, 14),
    (8, 0, 17, 1, 5, 0, 16, 3),
    (13, 0, 1, 17, 16, 0, 3, 0),
))
print(contingency_matrix)

[[ 0 18  0  0  0 18  0 14]
 [ 8  0 17  1  5  0 16  3]
 [13  0  1 17 16  0  3  0]]


In [124]:

# Entropy of the clustering, computed from the contingency table.
# mij: count in each (row, column) cell; mi: column totals.
mij = contingency_matrix
mi = mij.sum(axis=0)

# pij: each cell divided by its column total.
pij = mij / mi

# log2 of pij, leaving 0 wherever pij is 0 so those cells contribute nothing.
log2pij = np.zeros_like(pij)
np.log2(pij, out=log2pij, where=pij != 0)
print(pij.round(2))
print(log2pij.round(2))

# Per-column entropy: e_i = -sum over rows of pij * log2(pij).
ei = -1 * (pij * log2pij).sum(axis=0)
print('e_i \n', ei.round(2))

# Overall entropy: per-column entropies weighted by column total / grand total.
m = mij.sum()
entropy = np.sum(ei * (mi / m))
print('entropy =', entropy )


[[0.   1.   0.   0.   0.   1.   0.   0.82]
 [0.38 0.   0.94 0.06 0.24 0.   0.84 0.18]
 [0.62 0.   0.06 0.94 0.76 0.   0.16 0.  ]]
[[ 0.    0.    0.    0.    0.    0.    0.   -0.28]
 [-1.39  0.   -0.08 -4.17 -2.07  0.   -0.25 -2.5 ]
 [-0.69  0.   -4.17 -0.08 -0.39  0.   -2.66  0.  ]]
e_i 
 [ 0.96 -0.    0.31  0.31  0.79 -0.    0.63  0.67]
entropy = 0.4752685702626609


### Purity

In [34]:
# Purity: the largest count in each column, summed over all columns and
# divided by the total number of counted samples.
contingency_matrix.max(axis=0).sum() / contingency_matrix.sum()

0.86

In [33]:
# Largest count found in each column of the contingency table.
contingency_matrix.max(axis=0)

array([16, 21, 17, 17, 16, 18, 11, 13], dtype=int64)

### Kmeans

In [126]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import simpsom as sps
from sklearn import metrics #for evaluation
from sklearn.metrics.cluster import rand_score


In [127]:
# Load the Iris CSV again for the K-Means section, this time from the
# current working directory, and print the column names.
# NOTE(review): the SOM section reads '../week08/Iris.csv' — confirm both
# paths refer to the same data.
df = pd.read_csv('Iris.csv')
print(df.columns)

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [128]:
# Feature table for K-Means: the four measurement columns, left unscaled.
# This rebinds `X` and `y` from the SOM section.
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
# `y` is a one-column DataFrame here (double brackets), which is why the
# evaluation cells below index it as y['Species'].
y = df[['Species']]

In [129]:

# Fit K-Means with 5 clusters on the four unscaled feature columns.
# random_state=0 fixes the centroid initialisation. n_init is pinned to 10
# explicitly because scikit-learn changed its default from 10 to 'auto' in
# newer releases; leaving it implicit makes the fitted labels (and the Rand
# score below) depend on the installed version.
model = KMeans(n_clusters=5, random_state=0, n_init=10)

model.fit(X)


In [130]:
# Cluster label assigned to each row by the fitted K-Means model.
# This rebinds `cls_id`, which held the SOM cluster labels earlier in the
# notebook.
cls_id = model.labels_

### Evaluate by Rand Index/Score

In [140]:
# Method 1: call rand_score through the metrics.cluster namespace, comparing
# the true species labels with the K-Means cluster labels.
rand = metrics.cluster.rand_score(labels_true=y['Species'], labels_pred=cls_id)
print(rand)

0.8432214765100671


In [141]:
# Method 2: call the directly imported rand_score function.
# In class, a variable was named rand_score, the same as the function; running
# that cell overrides the function that was imported earlier.
# On the next run rand_score is then no longer seen as a function, so it has to
# be imported again before the rand_score function can be called.
# The best approach is to never give a variable the same name as a function.
#from sklearn.metrics.cluster import rand_score

rand = rand_score(y['Species'],cls_id)  # do not give a variable the same name as a function
print(rand)

0.8432214765100671


### Hierarchical Clustering

In [27]:
# Check the length of the current label vector (one entry per sample).
cls_id.shape

(150,)

In [31]:
# Rand index again, this time passing the species labels as a plain array
# (.values) rather than a pandas Series.
# NOTE(review): `cls_id` here is whatever was assigned last (the K-Means
# labels in the cells above); no hierarchical clustering is fitted in the
# visible cells — confirm this is the intended comparison.
metrics.cluster.rand_score(y['Species'].values,cls_id)

0.8432214765100671