In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
from pandas.plotting import table

# Fetch the bundled Iris dataset; only its feature matrix is clustered below.
iris = datasets.load_iris()
X = iris["data"]

# Preprocessing Functions
def normalize(data):
    """Return *data* rescaled by a freshly fitted ``MinMaxScaler`` (default settings)."""
    scaler = MinMaxScaler()
    return scaler.fit_transform(data)


def transform(data):
    """Return the element-wise ``log(1 + x)`` of *data*."""
    log_scaled = np.log1p(data)
    return log_scaled


def apply_pca(data, n=2):
    """Fit a PCA with *n* components on *data* and return the projected data."""
    reducer = PCA(n_components=n)
    return reducer.fit_transform(data)

# Evaluation Function
def evaluate_clustering(X, labels):
    """Score a clustering with three internal validity metrics.

    Args:
        X: Feature matrix the clustering was computed on.
        labels: One cluster label per row of ``X``.

    Returns:
        A three-element list ``[silhouette, calinski_harabasz, davies_bouldin]``
        rounded to 2, 0 and 2 decimals respectively, or
        ``[None, None, None]`` when the metrics are undefined for ``labels``.
    """
    n_labels = len(np.unique(labels))
    n_samples = len(labels)

    # scikit-learn's metrics raise ValueError unless
    # 2 <= n_labels <= n_samples - 1, so a single cluster *and* the
    # "every sample is its own cluster" case must both be skipped.
    if n_labels < 2 or n_labels >= n_samples:
        return [None, None, None]

    return [
        round(silhouette_score(X, labels), 2),
        round(calinski_harabasz_score(X, labels), 0),
        round(davies_bouldin_score(X, labels), 2),
    ]

# Named preprocessing pipelines. Every value is a callable that takes a
# feature matrix and returns its preprocessed form; the keys are the labels
# shown in the 'Preprocessing' column of the result tables.
preprocessing_techniques = {
    'No Processing': lambda features: features,
    'Normalization': normalize,
    'Transform': transform,
    'PCA': lambda features: apply_pca(features),
    'T+N': lambda features: normalize(transform(features)),
    'T+N+PCA': lambda features: apply_pca(normalize(transform(features))),
}

# Numbers of clusters tried for the algorithms that need k up front
# (KMeans and Hierarchical).
cluster_counts = list(range(3, 6))

def perform_clustering_analysis(random_state=42):
    """Run KMeans, Hierarchical and MeanShift clustering on every pipeline.

    Every entry of ``preprocessing_techniques`` is applied to the module-level
    ``X``. KMeans and agglomerative clustering are then fitted once per value
    in ``cluster_counts``; MeanShift is fitted once per pipeline with a
    bandwidth estimated from the preprocessed data.

    Args:
        random_state: Seed forwarded to ``KMeans`` so repeated runs produce
            the same table. Pass ``None`` to get the previous unseeded
            behaviour.

    Returns:
        A dict with the keys ``'KMeans'``, ``'Hierarchical'`` and
        ``'MeanShift'``, each mapping to a ``pandas.DataFrame`` with one row
        per configuration and the columns ``Preprocessing``, ``Clusters``,
        ``Silhouette``, ``Calinski-Harabasz`` and ``Davies-Bouldin``
        (MeanShift additionally has ``Bandwidth``).
    """
    # Preprocessing does not depend on the clustering algorithm, so run each
    # pipeline once and share the result instead of recomputing it per algorithm.
    prepared = {
        method_name: prep_func(X)
        for method_name, prep_func in preprocessing_techniques.items()
    }

    # Algorithms that take the number of clusters as an input.
    model_factories = {
        'KMeans': lambda k: KMeans(
            n_clusters=k, n_init=10, random_state=random_state
        ),
        'Hierarchical': lambda k: AgglomerativeClustering(n_clusters=k),
    }

    all_results = {}

    # KMeans and Hierarchical Clustering
    for algo, make_model in model_factories.items():
        algo_results = []
        for method_name, X_prep in prepared.items():
            for k in cluster_counts:
                labels = make_model(k).fit_predict(X_prep)
                silhouette, calinski, davies = evaluate_clustering(X_prep, labels)
                algo_results.append({
                    'Preprocessing': method_name,
                    'Clusters': k,
                    'Silhouette': silhouette,
                    'Calinski-Harabasz': calinski,
                    'Davies-Bouldin': davies,
                })
        all_results[algo] = pd.DataFrame(algo_results)

    # MeanShift Clustering: the number of clusters is an output, not an input,
    # so it is reported from the labels the model produced.
    ms_results = []
    for method_name, X_prep in prepared.items():
        bandwidth = estimate_bandwidth(X_prep, quantile=0.2)
        labels = MeanShift(bandwidth=bandwidth).fit_predict(X_prep)
        silhouette, calinski, davies = evaluate_clustering(X_prep, labels)
        ms_results.append({
            'Preprocessing': method_name,
            'Clusters': len(np.unique(labels)),
            'Bandwidth': round(bandwidth, 4),
            'Silhouette': silhouette,
            'Calinski-Harabasz': calinski,
            'Davies-Bouldin': davies,
        })

    all_results['MeanShift'] = pd.DataFrame(ms_results)
    return all_results

def save_table_as_png(df, filename, extra_cols=None):
    """Render *df* as a matplotlib table and write it to *filename*.

    Args:
        df: DataFrame to render; its row count determines the figure height
            (half an inch per row, never less than 4 inches).
        filename: Output path passed to ``Figure.savefig``.
        extra_cols: Unused. Accepted only so existing callers that pass it
            keep working.
    """
    fig, ax = plt.subplots(figsize=(12, max(4, len(df) / 2)))
    try:
        ax.axis('off')

        # Create table and style it
        tbl = table(ax, df, loc='center', cellLoc='center')
        tbl.auto_set_font_size(False)
        tbl.set_fontsize(10)
        tbl.scale(1.2, 1.2)

        # Adjust layout and write the image, addressing this figure explicitly
        # rather than relying on pyplot's implicit "current figure".
        fig.tight_layout()
        fig.savefig(filename, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when rendering or saving fails;
        # otherwise it stays registered with pyplot and leaks.
        plt.close(fig)
    print(f"Table saved as '{filename}'")

# Run every clustering configuration and collect one table per algorithm.
results = perform_clustering_analysis()

# Report each table on stdout, then persist it as both CSV and PNG.
for algo in results:
    df = results[algo]
    file_stem = algo.lower()
    csv_file = f"{file_stem}_results.csv"
    png_file = f"{file_stem}_results.png"

    print(f"\n{algo} Clustering Results:")
    print(df.to_string(index=False))

    # CSV export
    df.to_csv(csv_file, index=False)
    print(f"Saved to {csv_file}")

    # PNG export
    save_table_as_png(df, png_file)


KMeans Clustering Results:
Preprocessing  Clusters  Silhouette  Calinski-Harabasz  Davies-Bouldin
No Processing         3        0.55              562.0            0.66
No Processing         4        0.50              531.0            0.78
No Processing         5        0.49              496.0            0.81
Normalization         3        0.50              360.0            0.76
Normalization         4        0.45              314.0            0.90
Normalization         5        0.35              289.0            0.96
    Transform         3        0.57              974.0            0.63
    Transform         4        0.50              839.0            0.77
    Transform         5        0.34              722.0            1.08
          PCA         3        0.60              694.0            0.56
          PCA         4        0.56              718.0            0.60
          PCA         5        0.55              685.0            0.63
          T+N         3        0.49              