In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
from sklearn import metrics

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MeanShift, Birch, AffinityPropagation, MiniBatchKMeans

import warnings
# NOTE(review): this hides every warning category for the rest of the session,
# which may include scikit-learn deprecation/convergence/input-validation
# warnings relevant to the clustering runs below — consider narrowing it.
warnings.filterwarnings("ignore")

In [3]:
# Load the iris measurements. The first line of the file is skipped and the
# column names listed here are used instead.
iris_df = pd.read_csv(
    'datasets/iris.csv',
    names=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'],
    skiprows=1,
)
iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [4]:
# Shuffle the rows and renumber the index from 0. A fixed random_state makes
# the row order (and everything derived from it) identical on every
# Restart-and-Run-All; without it each run produced a different permutation.
iris_df = iris_df.sample(frac=1, random_state=42).reset_index(drop=True)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,4.7,3.2,1.6,0.2,Setosa
1,5.0,3.5,1.6,0.6,Setosa
2,5.1,3.3,1.7,0.5,Setosa
3,5.4,3.9,1.3,0.4,Setosa
4,6.3,2.7,4.9,1.8,Virginica


In [5]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
iris_df.shape

(150, 5)

In [6]:
# Distinct values in the 'class' column, i.e. the species to be recovered.
iris_df['class'].unique()

array(['Setosa', 'Virginica', 'Versicolor'], dtype=object)

In [7]:
# Replace the species names in 'class' with integer codes so the clustering
# metrics can compare them against cluster assignments.
# Note: this overwrites the 'class' column of iris_df in place.
from sklearn import preprocessing

label_encoding = preprocessing.LabelEncoder()

iris_df['class'] = label_encoding.fit_transform(
    iris_df['class'].astype(str)
)

iris_df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,4.7,3.2,1.6,0.2,0
1,5.0,3.5,1.6,0.6,0
2,5.1,3.3,1.7,0.5,0
3,5.4,3.9,1.3,0.4,0
4,6.3,2.7,4.9,1.8,2


In [8]:
# Feature matrix: every column except the encoded target.
iris_features = iris_df.drop(columns=['class'])

iris_features.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,4.7,3.2,1.6,0.2
1,5.0,3.5,1.6,0.6
2,5.1,3.3,1.7,0.5
3,5.4,3.9,1.3,0.4
4,6.3,2.7,4.9,1.8


In [9]:
# Ground-truth labels (the encoded 'class' column), kept separate from the
# features so they are only used for scoring the clusterings.
iris_labels = iris_df['class']

# Peek at five randomly chosen labels.
iris_labels.sample(5)

48     2
137    1
88     2
84     2
23     0
Name: class, dtype: int32

In [10]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model and print how well it matches the true labels.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return a fitted object that
        exposes its cluster assignments as ``labels_``.
    data : array-like
        Feature matrix passed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Ground-truth class labels, used by every score except the silhouette.

    Returns
    -------
    None
        The six scores are printed, not returned.
    """
    model = clustering_model(data)
    # Read the assignments once; every metric below scores the same array.
    predicted = model.labels_

    print("Homogenity: ", metrics.homogeneity_score(labels, predicted))
    print("Completeness: ", metrics.completeness_score(labels, predicted))
    print("v_measure: ", metrics.v_measure_score(labels, predicted))
    print("adjusted_random: ", metrics.adjusted_rand_score(labels, predicted))
    print("adjusted_mutual_info: ", metrics.adjusted_mutual_info_score(labels, predicted))

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1.
    # Models that choose their own cluster count can violate that (e.g. put
    # every point in one cluster), so report NaN instead of aborting after the
    # other scores have already been printed.
    n_found = len(set(predicted))
    if 2 <= n_found <= len(predicted) - 1:
        silhouette = metrics.silhouette_score(data, predicted)
    else:
        silhouette = float("nan")
    print("Silhouette : ", silhouette)

In [11]:
def k_means(data, n_clusters=3, max_iter=1000):
    """Fit K-Means on ``data`` and return the fitted estimator.

    ``n_clusters`` and ``max_iter`` are forwarded unchanged to ``KMeans``.
    """
    estimator = KMeans(n_clusters=n_clusters, max_iter=max_iter)
    return estimator.fit(data)

In [12]:
# Score K-Means (wrapper defaults) against the true species labels.
build_model(k_means, iris_features, iris_labels)

Homogenity:  0.7514854021988339
Completeness:  0.7649861514489816
v_measure:  0.7581756800057786
adjusted_random:  0.7302382722834697
adjusted_mutual_info:  0.7551191675800484
Silhouette :  0.5528190123564103


In [13]:
def agglomerative_fn(data, n_clusters=3):
    """Fit agglomerative (hierarchical) clustering and return the estimator.

    No linkage is passed, so the library default applies: Ward linkage, which
    merges the pair of clusters that minimizes the variance of the clusters
    being merged.
    """
    estimator = AgglomerativeClustering(n_clusters=n_clusters)
    return estimator.fit(data)

In [14]:
# Score agglomerative clustering (wrapper defaults) against the true labels.
build_model(agglomerative_fn, iris_features, iris_labels)

Homogenity:  0.7608008469718723
Completeness:  0.7795958005591144
v_measure:  0.7700836616487869
adjusted_random:  0.7311985567707745
adjusted_mutual_info:  0.7671669615713111
Silhouette :  0.5543236611296426


In [15]:
def dbscan_fn(data, eps=0.45, min_samples=4):
    """Fit DBSCAN on ``data`` and return the fitted estimator.

    eps: neighbourhood radius. Too low and many points are left out as
        outliers; too high and everything can end up in a single cluster.
    min_samples: how many points must lie within ``eps`` of a point for it to
        seed a cluster (roughly, the minimum size of a dense region).
    """
    estimator = DBSCAN(eps=eps, min_samples=min_samples)
    return estimator.fit(data)

In [16]:
# Score DBSCAN (wrapper defaults) against the true labels.
build_model(dbscan_fn, iris_features, iris_labels)

Homogenity:  0.5773205947971476
Completeness:  0.6093983666695363
v_measure:  0.5929259393972258
adjusted_random:  0.5084974632998323
adjusted_mutual_info:  0.5842965531192635
Silhouette :  0.37244398989696226


In [17]:
def mean_shift_fn(data, bandwidth=0.85):
    """Fit MeanShift on ``data`` and return the fitted estimator.

    Mean shift tries to discover blobs (dense clumps) in a smooth density of
    data points; ``bandwidth`` is forwarded unchanged to ``MeanShift``.
    """
    estimator = MeanShift(bandwidth=bandwidth)
    return estimator.fit(data)

In [18]:
# Score MeanShift (wrapper defaults) against the true labels.
build_model(mean_shift_fn, iris_features, iris_labels)

Homogenity:  0.7603645798041669
Completeness:  0.7717917344958113
v_measure:  0.7660355440487252
adjusted_random:  0.7436826319432357
adjusted_mutual_info:  0.763083127524581
Silhouette :  0.5511573791952865


In [19]:
def birch_fn(data, n_clusters=3):
    """Fit Birch on ``data`` and return the fitted estimator.

    Birch is used here as a time- and memory-efficient clustering option;
    ``n_clusters`` is forwarded unchanged to ``Birch``.
    """
    estimator = Birch(n_clusters=n_clusters)
    return estimator.fit(data)

In [20]:
# Score Birch (wrapper defaults) against the true labels.
build_model(birch_fn, iris_features, iris_labels)

Homogenity:  0.7362143719137068
Completeness:  0.7635364986411701
v_measure:  0.7496265610732065
adjusted_random:  0.6941455626558043
adjusted_mutual_info:  0.746428957650968
Silhouette :  0.5173787231350643


In [21]:
def affinity_propagation_fn(data, damping=0.6, max_iter=1000):
    """Fit AffinityPropagation on ``data`` and return the fitted estimator.

    The number of clusters is not specified up front; the algorithm picks it.

    damping: the extent to which the current value is maintained relative to
        the incoming value on each update (acts like a learning rate).
    max_iter: forwarded unchanged to ``AffinityPropagation``.
    """
    estimator = AffinityPropagation(damping=damping, max_iter=max_iter)
    return estimator.fit(data)

In [22]:
# Score AffinityPropagation (wrapper defaults) against the true labels.
build_model(affinity_propagation_fn, iris_features, iris_labels)

Homogenity:  0.8512533506223855
Completeness:  0.4920470508767308
v_measure:  0.6236232792832318
adjusted_random:  0.4387747515158993
adjusted_mutual_info:  0.6127727485238739
Silhouette :  0.3454618638923714


In [23]:
def mini_batch_kmeans_fn(data, n_clusters=3, max_iter=1000, batch_size=20):
    """Fit MiniBatchKMeans on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 3
        Forwarded to ``MiniBatchKMeans``.
    max_iter : int, default 1000
        Forwarded to ``MiniBatchKMeans``.
    batch_size : int, default 20
        Forwarded to ``MiniBatchKMeans``. Previously fixed at 20 inside the
        function; the default keeps that behaviour for existing callers.
    """
    model = MiniBatchKMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        batch_size=batch_size,
    ).fit(data)
    return model

In [25]:
# Score MiniBatchKMeans (wrapper defaults) against the true labels.
build_model(mini_batch_kmeans_fn, iris_features, iris_labels) 

Homogenity:  0.7869234996582516
Completeness:  0.8093691546872487
v_measure:  0.797988521701332
adjusted_random:  0.745503868180448
adjusted_mutual_info:  0.7954205025674187
Silhouette :  0.5553062646081601


In [27]:
from sklearn.cluster import SpectralClustering

In [28]:
SS=1000 # self-similarity: the similarity of a data point with itself

In [29]:
IS=10 # intra-cluster similarity: between two points in the same cluster

In [30]:
LS = 0.01 # low similarity: between points that belong to different clusters

In [31]:
# Hand-built affinity matrix for nine points in three groups of three
# (indices 0-2, 3-5, 6-8): SS on the diagonal, IS inside a group, LS across
# groups. A precomputed affinity must be symmetric (entry [i][j] equals
# entry [j][i]), so row 1 / column 2 is IS to match row 2 / column 1.
similarity_mat = [[SS,IS,IS,LS,LS,LS,LS,LS,LS],
                  [IS,SS,IS,LS,LS,LS,LS,LS,LS],
                  [IS,IS,SS,LS,LS,LS,LS,LS,LS],
                  [LS,LS,LS,SS,IS,IS,LS,LS,LS],
                  [LS,LS,LS,IS,SS,IS,LS,LS,LS],
                  [LS,LS,LS,IS,IS,SS,LS,LS,LS],
                  [LS,LS,LS,LS,LS,LS,SS,IS,IS],
                  [LS,LS,LS,LS,LS,LS,IS,SS,IS],
                  [LS,LS,LS,LS,LS,LS,IS,IS,SS]]

In [33]:
# affinity='precomputed' makes the estimator treat the input as a ready-made
# similarity matrix rather than as feature vectors.
spectral_model = SpectralClustering(
    n_clusters=3,
    affinity='precomputed',
).fit(similarity_mat)

In [34]:
# Cluster label assigned to each of the nine rows of similarity_mat.
spectral_model.labels_

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [None]:
# The first group of three points is labelled 0, the second group 1, and the third group 2.