<a href="https://colab.research.google.com/github/chirag21120/Clustering/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [None]:
wine = load_wine()
X = wine.data

In [None]:
preprocessing_techniques = {
    'No Preprocessing': lambda x: x,
    'Normalization': MinMaxScaler().fit_transform,
    'Standardization': StandardScaler().fit_transform,
    'PCA': PCA(n_components=2).fit_transform,
    'Normalization + PCA': lambda x: PCA(n_components=2).fit_transform(MinMaxScaler().fit_transform(x)),
    'Standardization + PCA': lambda x: PCA(n_components=2).fit_transform(StandardScaler().fit_transform(x)),
    'Normalization + Standardization + PCA': lambda x: PCA(n_components=2).fit_transform(StandardScaler().fit_transform(MinMaxScaler().fit_transform(x)))
}


In [40]:
clustering_algorithms = {
    'KMeans': [KMeans(n_clusters=i) for i in range(3, 6)],
    'Hierarchical': [AgglomerativeClustering(n_clusters=i) for i in range(3, 6)],
    'MeanShift': [MeanShift()]
}



In [None]:
evaluation_metrics = {
    'Silhouette Score': silhouette_score,
    'Davies-Bouldin Index': davies_bouldin_score,
    'Calinski-Harabasz Index': calinski_harabasz_score
}

In [None]:
import os
if not os.path.exists('results'):
    os.makedirs('results')

In [44]:
best_algorithm = None
best_num_clusters = None
best_silhouette_score = float('-inf')


In [45]:
for algo_name, algorithms in clustering_algorithms.items():
    # Initialize a dictionary to store the results
    algo_results = {}
    for algorithm in algorithms:
        for preproc_name, preproc_func in preprocessing_techniques.items():
            X_preprocessed = preproc_func(X)
            algorithm.fit(X_preprocessed)

            # Check if the clustering produced valid results
            unique_labels = set(algorithm.labels_)
            if len(unique_labels) > 1 and -1 not in unique_labels:
                labels = algorithm.labels_

                # Compute evaluation metrics
                metrics = {}
                for metric_name, metric_func in evaluation_metrics.items():
                    score = metric_func(X_preprocessed, labels)
                    metrics[metric_name] = score

                # Store the results
                if preproc_name not in algo_results:
                    algo_results[preproc_name] = {}
                algo_results[preproc_name][f'{algorithm.__class__.__name__}_clusters_{algorithm.n_clusters if hasattr(algorithm, "n_clusters") else "MeanShift"}'] = metrics

    # Convert the final results to DataFrame
    final_results_df = pd.DataFrame(algo_results)

    # Save the final results to a CSV file
    final_results_df.to_csv(f"results/{algo_name}_results.csv")

    # Print the final results
    print(f"\n{algo_name} Results:")
    print(final_results_df)
    for preproc_name, preproc_results in algo_results.items():
        for num_clusters, metrics in preproc_results.items():
            if metrics['Silhouette Score'] > best_silhouette_score:
                best_algorithm = algo_name
                best_num_clusters = num_clusters
                best_silhouette_score = metrics['Silhouette Score']




KMeans Results:
                                                    No Preprocessing  \
KMeans_clusters_3  {'Silhouette Score': 0.5711381937868838, 'Davi...   
KMeans_clusters_4  {'Silhouette Score': 0.5611347173642887, 'Davi...   
KMeans_clusters_5  {'Silhouette Score': 0.5489993239795675, 'Davi...   

                                                       Normalization  \
KMeans_clusters_3  {'Silhouette Score': 0.3013463273503232, 'Davi...   
KMeans_clusters_4  {'Silhouette Score': 0.25713225377923027, 'Dav...   
KMeans_clusters_5  {'Silhouette Score': 0.242280555267062, 'Davie...   

                                                     Standardization  \
KMeans_clusters_3  {'Silhouette Score': 0.2848589191898987, 'Davi...   
KMeans_clusters_4  {'Silhouette Score': 0.2521848297078379, 'Davi...   
KMeans_clusters_5  {'Silhouette Score': 0.2295348007057559, 'Davi...   

                                                                 PCA  \
KMeans_clusters_3  {'Silhouette Score': 0.57

In [46]:
print("\nBest Clustering Algorithm:", best_algorithm)
print("Best Number of Clusters:", best_num_clusters)
print("Best Silhouette Score:", best_silhouette_score)



Best Clustering Algorithm: KMeans
Best Number of Clusters: KMeans_clusters_3
Best Silhouette Score: 0.5722554756855063
