In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (
    silhouette_score, davies_bouldin_score, calinski_harabasz_score,
    adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score
)

# Fetch the Wine dataset from the UCI repository. The file ships without a
# header row, so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
column_names = [
    'Class',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df = pd.read_csv(url, names=column_names, header=None)

# The 'Class' column holds the reference labels; every other column is a feature.
y_true = df['Class'].to_numpy()
X = df.drop(columns='Class').to_numpy()

# Preprocessing techniques: each entry maps a name to a callable that takes the
# feature matrix and returns the transformed matrix.
def _identity(x):
    """Return the features unchanged (the 'raw' baseline)."""
    return x


def _standardize_then_pca(x):
    """Standardize the features, then project them onto 2 principal components."""
    standardized = StandardScaler().fit_transform(x)
    return PCA(n_components=2).fit_transform(standardized)


preprocessing_methods = {
    'raw': _identity,
    'standard': StandardScaler().fit_transform,
    'minmax': MinMaxScaler().fit_transform,
    'pca_standard': _standardize_then_pca,
}

# Define clustering algorithms
def clustering_algorithms(n_clusters):
    """Build a fresh set of unfitted clustering estimators.

    Args:
        n_clusters: Number of clusters (mixture components for the GMM)
            each estimator is configured with.

    Returns:
        A dict mapping an algorithm name to a new estimator instance.
    """
    seed = 42  # shared seed for the estimators that accept random_state
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
    gmm = GaussianMixture(n_components=n_clusters, random_state=seed)
    return {'KMeans': kmeans, 'Agglomerative': agglomerative, 'GMM': gmm}

# Function to evaluate clustering
def evaluate_clustering(X, labels, y_true):
    """Score one clustering with internal and external validity metrics.

    Args:
        X: Feature matrix the clustering was computed on.
        labels: Cluster label assigned to each row of ``X``.
        y_true: Reference class label for each row of ``X``.

    Returns:
        A dict mapping metric name to its score, internal metrics first.
    """
    # Internal metrics are computed from the data and the assigned labels only.
    internal_metrics = {
        'Silhouette': silhouette_score,
        'Davies-Bouldin': davies_bouldin_score,
        'Calinski-Harabasz': calinski_harabasz_score,
    }
    # External metrics compare the assigned labels with the reference labels.
    external_metrics = {
        'ARI': adjusted_rand_score,
        'NMI': normalized_mutual_info_score,
        'FMI': fowlkes_mallows_score,
    }

    scores = {name: score(X, labels) for name, score in internal_metrics.items()}
    scores.update(
        (name, score(y_true, labels)) for name, score in external_metrics.items()
    )
    return scores

# Run experiments: every preprocessing x cluster count x algorithm combination
# is fitted once and contributes one record to `results`.
results = []
cluster_counts = [2, 3, 4, 5]

for prep_name, prep_func in preprocessing_methods.items():
    X_prep = prep_func(X)
    for n_clusters in cluster_counts:
        for algo_name, algo in clustering_algorithms(n_clusters).items():
            # The GMM is fitted and then asked to predict; the other
            # estimators do both in a single fit_predict call.
            labels = (
                algo.fit(X_prep).predict(X_prep)
                if algo_name == 'GMM'
                else algo.fit_predict(X_prep)
            )
            metrics = evaluate_clustering(X_prep, labels, y_true)

            record = {
                'Preprocessing': prep_name,
                'Algorithm': algo_name,
                'Clusters': n_clusters,
            }
            record.update(metrics)
            results.append(record)

# Run DBSCAN separately: it is not configured with a cluster count, so it is
# evaluated once on the standardized features instead of in the grid above.
dbscan = DBSCAN(eps=1.5, min_samples=5)
X_std = StandardScaler().fit_transform(X)
labels = dbscan.fit_predict(X_std)

# DBSCAN marks noise points with the label -1; exclude them from evaluation.
# NOTE: despite its name, this mask keeps every non-noise point (core and
# border samples alike), not only core samples.
core_samples_mask = labels != -1
n_dbscan_clusters = np.unique(labels[core_samples_mask]).size

# The internal metrics need at least two clusters to be defined.
if n_dbscan_clusters > 1:
    metrics = evaluate_clustering(
        X_std[core_samples_mask],
        labels[core_samples_mask],
        y_true[core_samples_mask],
    )
    results.append({
        'Preprocessing': 'standard',
        'Algorithm': 'DBSCAN',
        'Clusters': n_dbscan_clusters,
        **metrics
    })
else:
    # Report the skip instead of silently dropping DBSCAN from the results.
    print(
        f"DBSCAN (eps={dbscan.eps}, min_samples={dbscan.min_samples}) found "
        f"{n_dbscan_clusters} cluster(s) and "
        f"{int((~core_samples_mask).sum())} noise point(s); "
        "skipping evaluation (at least 2 clusters are required)."
    )

# Convert results to DataFrame: one row per result dict appended above, with
# the run identifiers ('Preprocessing', 'Algorithm', 'Clusters') and the
# metric names as columns
results_df = pd.DataFrame(results)

# Format results into a wide-style comparison table
def format_table(df, metrics=('Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin')):
    """Pivot long-form clustering results into a wide comparison table.

    Args:
        df: Long-form results with the columns 'Algorithm', 'Preprocessing',
            'Clusters' and one column per entry of ``metrics``.
        metrics: Names of the metric columns to include. Any iterable of
            column names works; the default is an immutable tuple so it
            cannot be mutated between calls.

    Returns:
        A DataFrame with one row per (Algorithm, Preprocessing) pair and one
        ``'<metric>_c=<k>'`` column per metric and cluster count, with values
        rounded to 2 decimals. Columns are ordered metric-major, with cluster
        counts ascending inside each metric.
    """
    formatted_rows = []
    for (algo, prep), group in df.groupby(['Algorithm', 'Preprocessing']):
        row = {'Algorithm': algo, 'Preprocessing': prep}
        # The cluster counts are the same for every metric, so compute them
        # once per group rather than once per metric.
        cluster_counts = sorted(group['Clusters'].unique())
        for metric in metrics:
            for k in cluster_counts:
                val = group.loc[group['Clusters'] == k, metric].values
                # If a (group, k) pair occurs more than once, the first wins.
                row[f'{metric}_c={k}'] = round(val[0], 2) if len(val) > 0 else np.nan
        formatted_rows.append(row)
    return pd.DataFrame(formatted_rows)

# Build the wide comparison table from the long-form results
formatted_df = format_table(results_df)

# Print the table with the row and column display limits lifted so nothing is elided
display_options = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*display_options):
    print(formatted_df)

# Persist the table as CSV and as a LaTeX longtable for reporting
latex_options = {
    'index': False,
    'multicolumn': True,
    'multirow': True,
    'longtable': True,
}
formatted_df.to_csv("formatted_clustering_results.csv", index=False)
formatted_df.to_latex("formatted_clustering_results.tex", **latex_options)


        Algorithm Preprocessing  Silhouette_c=2  Silhouette_c=3  \
0   Agglomerative        minmax            0.30            0.29   
1   Agglomerative  pca_standard            0.47            0.56   
2   Agglomerative           raw            0.66            0.56   
3   Agglomerative      standard            0.27            0.28   
4             GMM        minmax            0.30            0.29   
5             GMM  pca_standard            0.47            0.56   
6             GMM           raw            0.55            0.35   
7             GMM      standard            0.27            0.28   
8          KMeans        minmax            0.30            0.30   
9          KMeans  pca_standard            0.48            0.56   
10         KMeans           raw            0.66            0.56   
11         KMeans      standard            0.27            0.28   

    Silhouette_c=4  Silhouette_c=5  Calinski-Harabasz_c=2  \
0             0.25            0.20                  75.75   
1     