In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('seed.txt', sep='\t', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [4]:
x = data.drop([7], axis=1)
x = x.values
x = normalize(x)

In [18]:
from pso import ParticleSwarmOptimizedClustering
from particle import quantization_error, calc_sse
from utils import normalize
from kmeans import KMeans
from sklearn.metrics import silhouette_score

# K-Means

In [5]:
kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
kmeans.fit(x)

In [6]:
predicted_kmeans = kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans.SSE)
print('Quantization:', quantization_error(centroids=kmeans.centroid, data=x, labels=predicted_kmeans))

Silhouette: 0.42212676242
SSE: 22.0264512266
Quantization: 2.70759080945


In [13]:
kmeans2 = KMeans(n_cluster=3, init_pp=True, seed=2018)
kmeans2.fit(x)
predicted_kmeans2 = kmeans2.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans2.SSE)
print('Quantization:', quantization_error(centroids=kmeans2.centroid, data=x, labels=predicted_kmeans2))

Silhouette: 0.42212676242
SSE: 22.0243630757
Quantization: 2.70829286554


# PSO

In [23]:
pso = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=50)

In [24]:
hist = pso.run()

Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Iteration 0051/2000 current gbest score 2.708292865537892169
Iteration 0101/2000 current gbest score 2.707835657925311335
Iteration 0151/2000 current gbest score 2.707781936967318170
Iteration 0201/2000 current gbest score 2.707778992929485096
Iteration 0251/2000 current gbest score 2.707778882641030282
Iteration 0301/2000 current gbest score 2.707778867595734607
Iteration 0351/2000 current gbest score 2.707778433441446087
Iteration 0401/2000 current gbest score 2.707778035112180959
Iteration 0451/2000 current gbest score 2.707776269309019934
Iteration 0501/2000 current gbest score 2.707759852336307560
Iteration 0551/2000 current gbest score 2.707748920943973570
Iteration 0601/2000 current gbest score 2.707748856720776143
Iteration 0651/2000 current gbest score 2.707748856720600283
Iteration 0701/2000 current gbest score 2.707748856720600283
Iteration 0751/2000 current gbest score 2.707

In [25]:
pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)

In [26]:
pso_kmeans.centroid = pso.gbest_centroids.copy()
pso_kmeans.centroid

array([[ 0.12244661,  0.1742572 ,  0.37766728,  0.18591758,  0.16186663,
         0.50075285,  0.27955135],
       [ 0.38426695,  0.42116004,  0.66906823,  0.36636775,  0.46882598,
         0.26583124,  0.31975311],
       [ 0.76065794,  0.79703446,  0.6949549 ,  0.73270224,  0.77247813,
         0.36896239,  0.760346  ]])

In [28]:
predicted_pso = pso_kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_pso))
print('SSE:', calc_sse(centroids=pso.gbest_centroids, data=x, labels=predicted_pso))
print('Quantization:', pso.gbest_score)

Silhouette: 0.42212676242
SSE: 22.0290053407
Quantization: 2.70774885672


# Repeated Test

### K-Means++

In [20]:
kmeanspp = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    kmean_rep = KMeans(n_cluster=3, init_pp=True)
    kmean_rep.fit(x)
    predicted_kmean_rep = kmean_rep.predict(x)
    silhouette = silhouette_score(x, predicted_kmean_rep)
    sse = kmean_rep.SSE
    quantization = quantization_error(centroids=kmean_rep.centroid, data=x, labels=predicted_kmean_rep)
    kmeanspp['silhouette'].append(silhouette)
    kmeanspp['sse'].append(sse)
    kmeanspp['quantization'].append(quantization)

### PSO 

In [29]:
%%time
pso_plain = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=False, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_plain['silhouette'].append(silhouette)
    pso_plain['sse'].append(sse)
    pso_plain['quantization'].append(quantization)

Initial global best score 3.37155063658
Iteration 0001/2000 current gbest score 3.371550636577500981
Finish with gbest score 3.101407650631264001
Initial global best score 3.07005197242
Iteration 0001/2000 current gbest score 3.070051972422974007
Finish with gbest score 3.054827626973843024
Initial global best score 3.35830263074
Iteration 0001/2000 current gbest score 3.358302630741595518
Finish with gbest score 2.907495311773472313
Initial global best score 3.60295586272
Iteration 0001/2000 current gbest score 3.263453281827276786
Finish with gbest score 2.939111114265818170
Initial global best score 3.19082682979
Iteration 0001/2000 current gbest score 3.190826829785005092
Finish with gbest score 2.758444669332564825
Initial global best score 3.73038722154
Iteration 0001/2000 current gbest score 3.266340100811584879
Finish with gbest score 2.801590956493404949
Initial global best score 3.35783035244
Iteration 0001/2000 current gbest score 3.098218479789883073
Finish with gbest score

### PSO Hybrid

In [30]:
%%time
pso_hybrid = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_hybrid['silhouette'].append(silhouette)
    pso_hybrid['sse'].append(sse)
    pso_hybrid['quantization'].append(quantization)

Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Finish with gbest score 2.707647699944975894
Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Finish with gbest score 2.707904781184771181
Initial global best score 2.70759080945
Iteration 0001/2000 current gbest score 2.707590809454386349
Finish with gbest score 2.707590809454386349
Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Finish with gbest score 2.707619636849610156
Initial global best score 2.70759080945
Iteration 0001/2000 current gbest score 2.707590809454386349
Finish with gbest score 2.707590809454386349
Initial global best score 2.70759080945
Iteration 0001/2000 current gbest score 2.707590809454386349
Finish with gbest score 2.707590809454386349
Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Finish with gbest score

# Comparison

In [33]:
benchmark = {
    'method' : ['K-Means++', 'PSO', 'PSO Hybrid'],
    'sse_mean' : [
        np.around(np.mean(kmeanspp['sse']), decimals=10),
        np.around(np.mean(pso_plain['sse']), decimals=10),
        np.around(np.mean(pso_hybrid['sse']), decimals=10),
    ],
    'sse_stdev' : [
        np.around(np.std(kmeanspp['sse']), decimals=10),
        np.around(np.std(pso_plain['sse']), decimals=10),
        np.around(np.std(pso_hybrid['sse']), decimals=10),
    ],
    'silhouette_mean' : [
        np.around(np.mean(kmeanspp['silhouette']), decimals=10),
        np.around(np.mean(pso_plain['silhouette']), decimals=10),
        np.around(np.mean(pso_hybrid['silhouette']), decimals=10),
    ],
    'silhouette_stdev' : [
        np.around(np.std(kmeanspp['silhouette']), decimals=10),
        np.around(np.std(pso_plain['silhouette']), decimals=10),
        np.around(np.std(pso_hybrid['silhouette']), decimals=10),
    ],
    'quantization_mean' : [
        np.around(np.mean(kmeanspp['quantization']), decimals=10),
        np.around(np.mean(pso_plain['quantization']), decimals=10),
        np.around(np.mean(pso_hybrid['quantization']), decimals=10),
    ],
    'quantization_stdev' : [
        np.around(np.std(kmeanspp['quantization']), decimals=10),
        np.around(np.std(pso_plain['quantization']), decimals=10),
        np.around(np.std(pso_hybrid['quantization']), decimals=10),
    ],
}

In [34]:
benchmark

{'method': ['K-Means++', 'PSO', 'PSO Hybrid'],
 'quantization_mean': [2.7082928654999998,
  2.8964455008000001,
  2.7076448866999998],
 'quantization_stdev': [0.0, 0.1117984798, 8.5579899999999997e-05],
 'silhouette_mean': [0.42210525679999999, 0.3781104596, 0.42212676240000002],
 'silhouette_stdev': [0.0, 0.063243413799999995, 0.0],
 'sse_mean': [22.024363075699998, 28.695969913300001, 22.027339630299998],
 'sse_stdev': [0.0, 5.6574814616999998, 0.0014085896]}

In [35]:
benchmark_df = pd.DataFrame.from_dict(benchmark)
benchmark_df

Unnamed: 0,method,quantization_mean,quantization_stdev,silhouette_mean,silhouette_stdev,sse_mean,sse_stdev
0,K-Means++,2.708293,0.0,0.422105,0.0,22.024363,0.0
1,PSO,2.896446,0.111798,0.37811,0.063243,28.69597,5.657481
2,PSO Hybrid,2.707645,8.6e-05,0.422127,0.0,22.02734,0.001409


In [37]:
benchmark_df.to_excel('benchmark_res.xlsx', index=False)

In [38]:
benchmark_df.to_csv('benchmark_res.csv', index=False)