<a href="https://colab.research.google.com/github/aryanbakshi04/Clustering/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


# Load the Iris dataset and wrap its feature matrix in a DataFrame whose
# columns are labelled with the dataset's feature names.
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)


def preprocess(X, method):
    """Apply the named preprocessing pipeline to a feature matrix.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix with one row per sample.
    method : str
        Pipeline to apply. One of ``'none'``, ``'normalize'``,
        ``'transform'``, ``'pca'``, ``'normalize+transform'`` or
        ``'normalize+transform+pca'``.

    Returns
    -------
    array-like
        The raw values for ``'none'``; otherwise the output of the last
        ``fit_transform`` call in the chosen pipeline.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Previously an
        unknown name silently returned ``None``.
    """
    if method == 'none':
        # np.asarray accepts both DataFrames and plain arrays, matching the
        # input types the estimator-based branches already accept.
        return np.asarray(X)
    elif method == 'normalize':
        return MinMaxScaler().fit_transform(X)
    elif method == 'transform':
        return PowerTransformer().fit_transform(X)
    elif method == 'pca':
        return PCA(n_components=2).fit_transform(X)
    elif method == 'normalize+transform':
        scaled = MinMaxScaler().fit_transform(X)
        return PowerTransformer().fit_transform(scaled)
    elif method == 'normalize+transform+pca':
        scaled = MinMaxScaler().fit_transform(X)
        transformed = PowerTransformer().fit_transform(scaled)
        return PCA(n_components=2).fit_transform(transformed)
    # Fail loudly instead of handing None to the clustering code.
    raise ValueError(f"Unknown preprocessing method: {method!r}")


def evaluate(X, labels):
    """Score a clustering with three internal validity metrics.

    Parameters
    ----------
    X : array-like
        The data that was clustered, one row per sample.
    labels : sequence
        Cluster label assigned to each sample.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz, davies_bouldin)``. All three are
        ``np.nan`` when the metrics are undefined for the labelling: fewer
        than two clusters, or as many clusters as samples.
    """
    n_clusters = len(set(labels))
    n_samples = len(labels)
    # The scikit-learn metrics require 2 <= n_clusters <= n_samples - 1 and
    # raise ValueError outside that range, so return NaNs for both extremes.
    if n_clusters < 2 or n_clusters >= n_samples:
        return np.nan, np.nan, np.nan
    return (
        silhouette_score(X, labels),
        calinski_harabasz_score(X, labels),
        davies_bouldin_score(X, labels)
    )


# Display label for each preprocessing pipeline, listed in evaluation order.
method_names = {
    'none': 'No Data Processing',
    'normalize': 'Using Normalization',
    'transform': 'Using Transform',
    'pca': 'Using PCA',
    'normalize+transform': 'Using T+N',
    'normalize+transform+pca': 'T+N+PCA',
}

# Pipeline keys in the order they are evaluated (dicts keep insertion order).
methods = list(method_names)

# Cluster counts tried for the algorithms that take a fixed number of clusters.
cluster_counts = list(range(3, 6))

# One result container per algorithm: metric name -> list with one entry per
# preprocessing method. Each container gets its own independent lists.
results_kmeans, results_hier, results_meanshift = (
    {metric: [] for metric in ('Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin')}
    for _ in range(3)
)


# Run every clustering algorithm on every preprocessing variant and collect
# the rounded scores, one list per method, into the result containers.
for method in methods:
    X_proc = preprocess(X, method)

    # Per-method score lists, one entry per cluster count.
    sil_k, cal_k, dav_k = [], [], []
    sil_h, cal_h, dav_h = [], [], []

    for c in cluster_counts:

        # KMeans with a fixed seed so repeated runs give the same labels.
        kmeans = KMeans(n_clusters=c, random_state=0)
        labels_k = kmeans.fit_predict(X_proc)
        s, c_, d = evaluate(X_proc, labels_k)
        sil_k.append(round(s, 2))
        cal_k.append(round(c_, 2))
        dav_k.append(round(d, 2))

        # Agglomerative (hierarchical) clustering with the same cluster count.
        hier = AgglomerativeClustering(n_clusters=c)
        labels_h = hier.fit_predict(X_proc)
        s, c_, d = evaluate(X_proc, labels_h)
        sil_h.append(round(s, 2))
        cal_h.append(round(c_, 2))
        dav_h.append(round(d, 2))

    results_kmeans['Silhouette'].append(sil_k)
    results_kmeans['Calinski-Harabasz'].append(cal_k)
    results_kmeans['Davies-Bouldin'].append(dav_k)

    results_hier['Silhouette'].append(sil_h)
    results_hier['Calinski-Harabasz'].append(cal_h)
    results_hier['Davies-Bouldin'].append(dav_h)

    # MeanShift is not given a cluster count, so it runs once per method and
    # its scores are repeated under every c= column to fit the shared table
    # layout. The repeat count follows cluster_counts instead of a literal 3,
    # so the tables stay aligned if the list of cluster counts changes.
    ms = MeanShift()
    labels_ms = ms.fit_predict(X_proc)
    s, c_, d = evaluate(X_proc, labels_ms)
    n_columns = len(cluster_counts)
    results_meanshift['Silhouette'].append([round(s, 2)] * n_columns)
    results_meanshift['Calinski-Harabasz'].append([round(c_, 2)] * n_columns)
    results_meanshift['Davies-Bouldin'].append([round(d, 2)] * n_columns)


def make_table(metric_dict, metric, method_names):
    """Build a methods-by-cluster-count table for one metric.

    Parameters
    ----------
    metric_dict : dict
        Maps a metric name to a list of per-method score lists, ordered like
        the module-level ``methods``.
    metric : str
        Key of the metric to tabulate.
    method_names : dict
        Maps each method key to the row label shown in the table.

    Returns
    -------
    pandas.DataFrame
        One row per method label and one ``c=<n>`` column per entry of the
        module-level ``cluster_counts``; the index is named ``'<metric> →'``.
    """
    column_labels = [f'c={c}' for c in cluster_counts]

    scores_by_label = {}
    for key, scores in zip(methods, metric_dict[metric]):
        scores_by_label[method_names[key]] = scores

    # Built with methods as columns first, then transposed so that each
    # method becomes a row.
    wide = pd.DataFrame(scores_by_label, index=column_labels)
    table = wide.T
    table.index.name = f'{metric} →'
    return table


# Print one section per algorithm; each section lists the three metric tables
# in the same order. The first metric heading has no leading blank line.
for heading, results in (
    ("KMeans Clustering\n", results_kmeans),
    ("\n\n Hierarchical Clustering\n", results_hier),
    ("\n\nMeanShift Clustering\n", results_meanshift),
):
    print(heading)
    for position, metric in enumerate(('Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin')):
        print(f"{metric}:" if position == 0 else f"\n{metric}:")
        print(make_table(results, metric, method_names))

from tabulate import tabulate

def make_kmeans_format_table():
    """Print all KMeans scores as a single grid, one row per metric.

    The header has a ``"Parameters"`` column followed by one column per
    (preprocessing method, cluster count) pair; the method label is shown
    above the first cluster count of each group. Headers are derived from the
    module-level ``methods`` and ``cluster_counts`` -- the same sequences the
    scores in ``results_kmeans`` were collected over -- so header and data
    columns stay aligned when either list changes.

    Returns
    -------
    None
        The table is written to standard output via ``tabulate``.
    """
    headers = ["Parameters"]
    for m in methods:
        for position, c in enumerate(cluster_counts):
            label = f"c={c}"
            # Only the first column of each group carries the method label.
            headers.append(f"{method_names[m]}\n{label}" if position == 0 else label)

    table = []
    for metric in ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']:
        row = [metric]
        # Flatten the per-method score lists into one long row.
        for scores in results_kmeans[metric]:
            row.extend(scores)
        table.append(row)

    print(tabulate(table, headers=headers, tablefmt="grid"))

# Print the combined KMeans grid built from results_kmeans.
make_kmeans_format_table()

KMeans Clustering

Silhouette:
                      c=3   c=4   c=5
Silhouette →                         
No Data Processing   0.55  0.50  0.46
Using Normalization  0.48  0.45  0.44
Using Transform      0.46  0.43  0.40
Using PCA            0.60  0.56  0.55
Using T+N            0.46  0.38  0.35
T+N+PCA              0.50  0.44  0.43

Calinski-Harabasz:
                        c=3     c=4     c=5
Calinski-Harabasz →                        
No Data Processing   561.59  530.77  459.45
Using Normalization  351.30  314.47  269.94
Using Transform      246.79  213.51  179.10
Using PCA            693.71  715.90  683.68
Using T+N            239.88  202.92  203.55
T+N+PCA              290.67  256.41  276.25

Davies-Bouldin:
                      c=3   c=4   c=5
Davies-Bouldin →                     
No Data Processing   0.67  0.78  0.92
Using Normalization  0.79  0.90  0.93
Using Transform      0.84  0.91  0.96
Using PCA            0.56  0.62  0.65
Using T+N            0.85  0.90  0.95
T+N+PCA   