In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

import warnings
warnings.filterwarnings("ignore")

## The filter above silences every warning, not just one category.
## It was added to hide this scikit-learn FutureWarning:
## /anaconda3/lib/python3.7/site-packages/sklearn/metrics/cluster/supervised.py:732: FutureWarning: The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.
## FutureWarning)

In [3]:
# Load the iris measurements. skiprows=1 drops the first line of the file
# (presumably its own header row) so the hyphenated names below are used
# as the column labels instead.
iris_df = pd.read_csv(
    'datasets/iris.csv',
    names=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'],
    skiprows=1,
)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Shuffle all rows (frac=1 keeps every row) and renumber the index from 0.
# The fixed random_state makes the permutation identical on every
# "Restart & Run All", so order-sensitive results further down are repeatable.
# Re-running only this cell reshuffles the already shuffled frame.
iris_df = iris_df.sample(frac=1, random_state=42).reset_index(drop=True)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.3,2.8,5.1,1.5,Iris-virginica
1,4.9,3.1,1.5,0.1,Iris-setosa
2,6.0,2.9,4.5,1.5,Iris-versicolor
3,5.6,2.5,3.9,1.1,Iris-versicolor
4,5.9,3.0,4.2,1.5,Iris-versicolor


In [5]:
# (rows, columns) of the frame: four measurement columns plus 'class'.
iris_df.shape

(150, 5)

In [6]:
from sklearn import preprocessing

# Replace the textual class names with integer codes so they can be passed to
# the clustering metrics. The column is cast to str before encoding.
label_encoding = preprocessing.LabelEncoder()
class_names = iris_df['class'].astype(str)
iris_df['class'] = label_encoding.fit_transform(class_names)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,6.3,2.8,5.1,1.5,2
1,4.9,3.1,1.5,0.1,0
2,6.0,2.9,4.5,1.5,1
3,5.6,2.5,3.9,1.1,1
4,5.9,3.0,4.2,1.5,1


In [7]:
# Feature matrix for clustering: every column except the encoded 'class'.
iris_features = iris_df.drop(columns='class')

iris_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,6.3,2.8,5.1,1.5
1,4.9,3.1,1.5,0.1
2,6.0,2.9,4.5,1.5
3,5.6,2.5,3.9,1.1
4,5.9,3.0,4.2,1.5


In [8]:
# Reference labels: the encoded 'class' column, passed to build_model to score
# each clustering.
iris_labels = iris_df['class']

# Random peek at five labels; the rows shown differ from run to run.
iris_labels.sample(5)

101    2
112    1
61     0
133    2
118    2
Name: class, dtype: int64

In [9]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model and print a one-row table of quality scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return an object that
        exposes the predicted cluster assignments as ``labels_``.
    data : array-like
        Feature matrix handed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Reference labels the predicted assignments are compared against.

    Prints homogeneity, completeness, V-measure, adjusted Rand index,
    adjusted mutual information and the silhouette score, tab separated and
    rounded to three decimals. Returns nothing.
    """
    model = clustering_model(data)
    predicted = model.labels_

    print('homo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(50 * '-')

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise (e.g. a model that puts every point in one
    # cluster, or DBSCAN marking every point as noise). Report NaN in that case
    # so the remaining scores are still shown.
    n_found = len(set(predicted))
    if 2 <= n_found <= len(predicted) - 1:
        silhouette = metrics.silhouette_score(data, predicted)
    else:
        silhouette = float('nan')

    scores = (
        metrics.homogeneity_score(labels, predicted),
        metrics.completeness_score(labels, predicted),
        metrics.v_measure_score(labels, predicted),
        metrics.adjusted_rand_score(labels, predicted),
        metrics.adjusted_mutual_info_score(labels, predicted),
        silhouette,
    )
    print('\t'.join('%.3f' % score for score in scores))

In [10]:
def k_means(data, n_clusters=3, max_iter=1000, random_state=None):
    """Fit K-means on ``data`` and return the fitted estimator.

    ``n_clusters`` and ``max_iter`` are forwarded to ``KMeans`` unchanged.
    ``random_state`` is forwarded as well: the default ``None`` keeps the
    previous unseeded behaviour, while an integer makes the centroid
    initialisation (and therefore the reported scores) repeatable.
    """
    estimator = KMeans(n_clusters=n_clusters,
                       max_iter=max_iter,
                       random_state=random_state)
    return estimator.fit(data)

In [11]:
# Score K-means (wrapper defaults) against the encoded iris classes.
build_model(clustering_model=k_means, data=iris_features, labels=iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.748	0.553


In [12]:
def agglomerative_fn(data, n_clusters=3):
    """Fit agglomerative clustering on ``data`` and return the fit result.

    ``n_clusters`` is forwarded to ``AgglomerativeClustering``; every other
    setting is left at the estimator's default.
    """
    return AgglomerativeClustering(n_clusters=n_clusters).fit(data)

In [13]:
# Score agglomerative clustering (wrapper defaults) against the encoded iris classes.
build_model(clustering_model=agglomerative_fn, data=iris_features, labels=iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.758	0.554


In [14]:
def dbscan_fn(data, eps=0.45, min_samples=4):
    """Fit DBSCAN on ``data`` and return the fit result.

    ``eps`` and ``min_samples`` are forwarded to ``DBSCAN`` unchanged.
    """
    return DBSCAN(min_samples=min_samples, eps=eps).fit(data)

In [15]:
# Score DBSCAN (wrapper defaults) against the encoded iris classes.
build_model(clustering_model=dbscan_fn, data=iris_features, labels=iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.577	0.609	0.593	0.508	0.569	0.372


In [16]:
def mean_shift_fn(data, bandwidth=0.85):
    """Fit mean shift on ``data`` and return the fit result.

    ``bandwidth`` is forwarded to ``MeanShift`` unchanged.
    """
    return MeanShift(bandwidth=bandwidth).fit(data)

In [17]:
# Score mean shift (wrapper defaults) against the encoded iris classes.
build_model(clustering_model=mean_shift_fn, data=iris_features, labels=iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.760	0.772	0.766	0.744	0.757	0.551


In [18]:
def birch_fn(data, n_clusters=3):
    """Fit Birch on ``data`` and return the fit result.

    ``n_clusters`` is forwarded to ``Birch`` unchanged.
    """
    return Birch(n_clusters=n_clusters).fit(data)

In [19]:
# Score Birch (wrapper defaults) against the encoded iris classes.
build_model(clustering_model=birch_fn, data=iris_features, labels=iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.787	0.809	0.798	0.746	0.784	0.555


In [20]:
def affinity_propagation_fn(data, damping=0.6, max_iter=1000):
    """Fit affinity propagation on ``data`` and return the fit result.

    ``damping`` and ``max_iter`` are forwarded to ``AffinityPropagation``
    unchanged.
    """
    return AffinityPropagation(max_iter=max_iter, damping=damping).fit(data)

In [21]:
# Score affinity propagation (wrapper defaults) against the encoded iris classes.
build_model(clustering_model=affinity_propagation_fn, data=iris_features, labels=iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.851	0.492	0.623	0.437	0.480	0.349


In [22]:
def mini_batch_kmeans_fn(data, n_clusters=3, max_iter=1000, batch_size=20,
                         random_state=None):
    """Fit mini-batch K-means on ``data`` and return the fitted estimator.

    ``n_clusters``, ``max_iter`` and ``batch_size`` are forwarded to
    ``MiniBatchKMeans``. ``batch_size`` defaults to 20, the value that was
    previously hard-coded. ``random_state`` defaults to ``None`` (unseeded, as
    before); pass an integer to make the fit repeatable.
    """
    estimator = MiniBatchKMeans(n_clusters=n_clusters,
                                max_iter=max_iter,
                                batch_size=batch_size,
                                random_state=random_state)
    return estimator.fit(data)

In [23]:
# Score mini-batch K-means (wrapper defaults) against the encoded iris classes.
build_model(clustering_model=mini_batch_kmeans_fn, data=iris_features, labels=iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.740	0.746	0.743	0.729	0.737	0.544


### Spectral Clustering

In [24]:
from sklearn.cluster import SpectralClustering

In [25]:
# Value placed on the diagonal of similarity_mat, i.e. the similarity of a
# point with itself (presumably "self similarity").
SS = 1000 

In [26]:
# Value used in similarity_mat between two different points that fall in the
# same 3 x 3 diagonal block (presumably "intra-cluster similarity").
IS = 10 

In [27]:
# Value used in similarity_mat for every pair outside the 3 x 3 diagonal
# blocks (presumably "low similarity").
LS = 0.01

In [28]:
# 9 x 9 similarity matrix for nine points arranged as three groups of three
# consecutive indices (0-2, 3-5, 6-8): SS on the diagonal, IS between distinct
# points of the same group, LS between points of different groups.
similarity_mat = [
    [SS if row == col else IS if row // 3 == col // 3 else LS
     for col in range(9)]
    for row in range(9)
]

In [29]:
# affinity='precomputed' means similarity_mat is used directly as the pairwise
# affinity matrix instead of being treated as feature vectors.
spectral_model = SpectralClustering(
    affinity='precomputed',
    n_clusters=3,
).fit(similarity_mat)

In [30]:
# Cluster index assigned to each of the nine rows of similarity_mat.
spectral_model.labels_

array([1, 1, 1, 0, 0, 0, 2, 2, 2], dtype=int32)