In [2]:
import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
import os
# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)
        

import matplotlib.pyplot as plt
from itertools import cycle

## Data Preprocessing

In [12]:
# Choose the dataset to cluster: Iris is loaded by default; swap in the
# commented-out read below to work on the Spiral data instead.

iris_csv_path = "../input/iris-flower-dataset/IRIS.csv"
df = pd.read_csv(iris_csv_path)
# df = pd.read_csv("../input/spiral/Spiral.csv") # Uncomment for spiral dataset
df.shape

In [13]:
# Feature matrix: a new frame holding every column of df except the class
# column (drop returns a new object, so df itself is left untouched).
X = df.drop(columns='species')
# X = X.drop('label', axis=1) # Uncomment for spiral dataset
X.describe()

In [14]:
# Min-max scale the features; the scaler is fitted on X and applied to the
# same X in one step. Xnorm is what the clustering cells below consume.
mms = MinMaxScaler()
Xnorm = mms.fit_transform(X)
Xnorm.shape

# Mean Shift

### Clustering on Iris dataset

In [10]:
# Run MeanShift on the normalized data once per bandwidth quantile. For every
# run: map each cluster to its majority species, plot the clusters on the
# first two normalized features, and print external/internal quality metrics.
quantiles = [0.25, 0.2, 0.15]

for quantile in quantiles:

    print("QUANTILE: ", quantile)
    print('-'*60)

    # The bandwidth is estimated from the data for the given quantile rather
    # than being hand-picked.
    bandwidth = estimate_bandwidth(Xnorm, quantile=quantile)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(Xnorm)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    n_clusters_ = len(np.unique(labels))

    # Pair every sample's true species with the cluster it was assigned to.
    result = pd.DataFrame({
        'species': df['species'],
        'Cluster': pd.Series(labels, index=df.index),
    })

    print("Quantile: ", quantile)
    print("Bandwidth: ", bandwidth)
    print("Num clusters: ", n_clusters_)
    print("max_iter: ", 300)  # MeanShift's default; not overridden above
    print()

    # Translate each cluster id into the species that dominates it. groupby
    # sorts the species names and idxmax returns the first maximum, so a tie
    # resolves to the first species in sorted order.
    majority_species = {
        ClusterNum: members.groupby('species').size().idxmax()
        for ClusterNum, members in result.groupby('Cluster')
    }
    result['TransLabel'] = result['Cluster'].map(majority_species)

    print()

    plt.figure(figsize=(6, 4))

    colors = cycle("bgrcmykbgrcmykbgrcmykbgrcmyk")
    for k, col in zip(sorted(majority_species), colors):
        my_members = labels == k
        # Label each series with what it really is (cluster id + majority
        # species) so the legend stays correct for any number of clusters.
        plt.plot(
            Xnorm[my_members, 0],
            Xnorm[my_members, 1],
            col + ".",
            label="Cluster {}: {}".format(k, majority_species[k]),
        )
    plt.title("Mean Shift Clustering")
    plt.xlabel("Normalized feature 0")
    plt.ylabel("Normalized feature 1")
    plt.legend(loc='upper right')
    # NOTE: the same file name is reused for every quantile, so only the
    # figure of the last quantile remains on disk.
    plt.savefig('foo2.jpg')
    plt.show()

    # Compare the majority-species labelling against the true species.
    print('Mean shift performance')
    print('-'*60)

    Correct = (df['species'] == result['TransLabel']).sum()
    Accuracy = round(Correct/df.shape[0],3)
    print('Accuracy: ', Accuracy)

    # METRICS for clustering algorithms
    # All scores use TransLabel, so clusters that share a majority species are
    # scored as one group. The internal scores (silhouette, Davies-Bouldin,
    # Calinski-Harabasz) need between 2 and n_samples - 1 distinct labels and
    # raise ValueError otherwise; report NaN instead of aborting the loop.
    n_groups = result['TransLabel'].nunique()
    if 2 <= n_groups <= len(result) - 1:
        silhouette = round(metrics.silhouette_score(Xnorm, result['TransLabel'],metric='sqeuclidean'),3)
        davies_bouldin = davies_bouldin_score(Xnorm, result['TransLabel'])
        calinski_harabasz = calinski_harabasz_score(Xnorm, result['TransLabel'])
    else:
        silhouette = davies_bouldin = calinski_harabasz = float('nan')

    print('silhouette: ', silhouette)
    print('homogeneity_score: ', round(metrics.homogeneity_score(df['species'], result['TransLabel']),3))
    print('completeness_score: ', round(metrics.completeness_score(df['species'], result['TransLabel']),3))
    print('v_measure_score: ', round(metrics.v_measure_score(df['species'], result['TransLabel']),3))
    print('adjusted_rand_score: ', round(metrics.adjusted_rand_score(df['species'], result['TransLabel']),3))
    print('adjusted_mutual_info_score: ', round(metrics.adjusted_mutual_info_score(df['species'], result['TransLabel']),3))
    print('davies_bouldin_score(X, labels): ', davies_bouldin)
    print('calinski_harabasz_score(X, labels): ', calinski_harabasz)

    print('-'*60)


### Clustering for spiral dataset

In [16]:
# Run MeanShift on the normalized data once per bandwidth quantile. For every
# run: map each cluster to its majority ground-truth label, plot the clusters
# on the first two normalized features, and print quality metrics.
# NOTE: this cell reads df['label'], so df must have been loaded from the
# spiral CSV (and presumably Xnorm rebuilt from it) before running it.
quantiles = [0.105,0.110, 0.1]

for quantile in quantiles:

    print("QUANTILE: ", quantile)
    print('-'*60)

    # The bandwidth is estimated from the data for the given quantile rather
    # than being hand-picked.
    bandwidth = estimate_bandwidth(Xnorm, quantile=quantile)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(Xnorm)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    n_clusters_ = len(np.unique(labels))

    # Pair every sample's true label with the cluster it was assigned to.
    result = pd.DataFrame({
        'label': df['label'],
        'Cluster': pd.Series(labels, index=df.index),
    })

    print("Quantile: ", quantile)
    print("Bandwidth: ", bandwidth)
    print("Num clusters: ", n_clusters_)
    print("max_iter: ", 300)  # MeanShift's default; not overridden above
    print()

    # Translate each cluster id into the ground-truth label that dominates it.
    # groupby sorts the labels and idxmax returns the first maximum, so a tie
    # resolves to the smallest label.
    majority_label = {
        ClusterNum: members.groupby('label').size().idxmax()
        for ClusterNum, members in result.groupby('Cluster')
    }
    result['TransLabel'] = result['Cluster'].map(majority_label)

    print()

    plt.figure(figsize=(6, 6))

    colors = cycle("bgrcmykbgrcmykbgrcmykbgrcmyk")
    for k, col in zip(sorted(majority_label), colors):
        my_members = labels == k
        # Label each series with what it really is (cluster id + majority
        # label) so the legend stays correct for any number of clusters.
        plt.plot(
            Xnorm[my_members, 0],
            Xnorm[my_members, 1],
            col + ".",
            label="Cluster {}: label {}".format(k, majority_label[k]),
        )

    plt.title("Mean Shift Clustering")
    plt.xlabel("Normalized feature 0")
    plt.ylabel("Normalized feature 1")
    plt.legend(loc='upper right')
    # NOTE: the same file name is reused for every quantile, so only the
    # figure of the last quantile remains on disk.
    plt.savefig('foo.jpg')
    plt.show()

    # Compare the majority-label assignment against the true labels.
    print('Mean shift performance')
    print('-'*60)

    Correct = (df['label'] == result['TransLabel']).sum()
    Accuracy = round(Correct/df.shape[0],3)
    print('Accuracy: ', Accuracy)

    # METRICS for clustering algorithms
    # Scores are rounded to 3 decimals; the number of clusters must not be
    # used as the rounding precision. All scores use TransLabel, so clusters
    # that share a majority label are scored as one group. The internal scores
    # (silhouette, Davies-Bouldin, Calinski-Harabasz) need between 2 and
    # n_samples - 1 distinct labels and raise ValueError otherwise; report NaN
    # instead of aborting the loop.
    n_groups = result['TransLabel'].nunique()
    if 2 <= n_groups <= len(result) - 1:
        silhouette = round(metrics.silhouette_score(Xnorm, result['TransLabel'],metric='sqeuclidean'),3)
        davies_bouldin = davies_bouldin_score(Xnorm, result['TransLabel'])
        calinski_harabasz = calinski_harabasz_score(Xnorm, result['TransLabel'])
    else:
        silhouette = davies_bouldin = calinski_harabasz = float('nan')

    print('silhouette: ', silhouette)
    print('homogeneity_score: ', round(metrics.homogeneity_score(df['label'], result['TransLabel']),3))
    print('completeness_score: ', round(metrics.completeness_score(df['label'], result['TransLabel']),3))
    print('v_measure_score: ', round(metrics.v_measure_score(df['label'], result['TransLabel']),3))
    print('adjusted_rand_score: ', round(metrics.adjusted_rand_score(df['label'], result['TransLabel']),3))
    print('adjusted_mutual_info_score: ', round(metrics.adjusted_mutual_info_score(df['label'], result['TransLabel']),3))
    print('davies_bouldin_score(X, labels): ', davies_bouldin)
    print('calinski_harabasz_score(X, labels): ', calinski_harabasz)
    print('-'*60)
    
    