In [3]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# Load datasets
soybean = fetch_ucirepo(id=91) 
zoo = fetch_ucirepo(id=111)
heart_disease = fetch_ucirepo(id=45)
dermatology = fetch_ucirepo(id=33)
breast_cancer = fetch_ucirepo(id=15)
mushroom = fetch_ucirepo(id=73)

# Convert datasets to DataFrames
# Replace 'features' and 'targets' with the actual attribute names
soybean_df = pd.DataFrame(data=soybean.data.features, columns=soybean.data.feature_names)
soybean_df['target'] = soybean.data.targets

zoo_df = pd.DataFrame(data=zoo.data.features, columns=zoo.data.feature_names)
zoo_df['target'] = zoo.data.targets

heart_disease_df = pd.DataFrame(data=heart_disease.data.features, columns=heart_disease.data.feature_names)
heart_disease_df['target'] = heart_disease.data.targets

dermatology_df = pd.DataFrame(data=dermatology.data.features, columns=dermatology.data.feature_names)
dermatology_df['target'] = dermatology.data.targets

breast_cancer_df = pd.DataFrame(data=breast_cancer.data.features, columns=breast_cancer.data.feature_names)
breast_cancer_df['target'] = breast_cancer.data.targets

mushroom_df = pd.DataFrame(data=mushroom.data.features, columns=mushroom.data.feature_names)
mushroom_df['target'] = mushroom.data.targets

# Ensure the datasets are properly defined before proceeding



In [5]:
import numpy as np
import itertools
import networkx as nx
from sklearn.manifold import SpectralEmbedding
from sklearn.cluster import KMeans

def jaccard_coefficient(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def ochiai_coefficient(set1, set2):
    intersection = len(set1.intersection(set2))
    denominator = np.sqrt(len(set1) * len(set2))
    return intersection / denominator

def overlap_coefficient(set1, set2):
    intersection = len(set1.intersection(set2))
    min_length = min(len(set1), len(set2))
    return intersection / min_length

def dice_coefficient(set1, set2):
    intersection = len(set1.intersection(set2))
    denominator = len(set1) + len(set2)
    return 2 * intersection / denominator

def graph_based_representation(data):
    num_samples, num_features = data.shape
    similarity_matrix = np.zeros((num_features, num_features))
    for i, j in itertools.combinations(range(num_features), 2):
        similarity_matrix[i, j] = jaccard_coefficient(set(data[:, i]), set(data[:, j]))
        similarity_matrix[j, i] = similarity_matrix[i, j]
    G = nx.from_numpy_array(similarity_matrix)
    embedding = SpectralEmbedding(n_components=p)
    representation_matrix = embedding.fit_transform(similarity_matrix)
    return representation_matrix

def joint_operation(data, representation_matrix):
    return np.dot(data, representation_matrix)

def mean_operation(data, representation_matrix):
    return np.mean(np.dot(data, representation_matrix), axis=1)

def perform_clustering(data, k):
    kmeans = KMeans(n_clusters=k)
    return kmeans.fit_predict(data)


In [6]:
# Import necessary libraries
import itertools
import networkx as nx
from sklearn.manifold import SpectralEmbedding
from sklearn.cluster import KMeans

# Define clustering functions
def create_graph_representation(data):
    num_samples, num_features = data.shape
    similarity_matrix = np.zeros((num_features, num_features))
    for i, j in itertools.combinations(range(num_features), 2):
        similarity_matrix[i, j] = jaccard_coefficient(set(data[:, i]), set(data[:, j]))
        similarity_matrix[j, i] = similarity_matrix[i, j]
    G = nx.from_numpy_array(similarity_matrix)
    embedding = SpectralEmbedding(n_components=num_components)
    representation_matrix = embedding.fit_transform(similarity_matrix)
    return representation_matrix

def apply_joint_operation(data, representation_matrix):
    return np.dot(data, representation_matrix)

def cluster_data(data, num_clusters):
    kmeans = KMeans(n_clusters=num_clusters)
    return kmeans.fit_predict(data)

# Define parameters
num_components = 10
num_clusters = 3

# Define a list to store results
performance_results = []

# Define datasets as a dictionary
datasets = {
    "soybean_df": soybean_df,
    "zoo_df": zoo_df,
    "heart_disease_df": heart_disease_df,
    "dermatology_df": dermatology_df,
    "breast_cancer_df": breast_cancer_df,
    "mushroom_df": mushroom_df
}

# Loop over dataset names
for dataset_name, dataset in datasets.items():
    # Suppress warnings
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        
        # Data preprocessing and clustering
        try:
            X = dataset.drop(columns=[dataset.columns[-1]])  # Assuming the last column is the target
            y = dataset[dataset.columns[-1]]  # Assuming the last column is the target
            encoder = OneHotEncoder()
            X_encoded = encoder.fit_transform(X)
            representation_matrix = create_graph_representation(X_encoded.toarray())
            integrated_data = apply_joint_operation(X_encoded.toarray(), representation_matrix)
            labels = cluster_data(integrated_data, num_clusters)

            ARI = adjusted_rand_score(y, labels)
            NMI = normalized_mutual_info_score(y, labels)
            FMI = fowlkes_mallows_score(y, labels)
            
            # Store results
            performance_results.append([dataset_name, ARI, NMI, FMI])
        except UserWarning as e:
            print(f"Warning: {e}")

# Convert results to DataFrame
results_df = pd.DataFrame(performance_results, columns=["Dataset", "ARI", "NMI", "FMI"])
print(results_df)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


            Dataset       ARI       NMI       FMI
0        soybean_df  0.654719  0.718860  0.758713
1            zoo_df  0.695380  0.698895  0.792488
2  heart_disease_df  0.092516  0.125321  0.403703
3    dermatology_df  0.291307  0.328613  0.485086
4  breast_cancer_df  0.694342  0.563490  0.853592
5       mushroom_df  0.270956  0.240795  0.591300


  super()._check_params_vs_input(X, default_n_init=10)


Compare and contrast each performance index, what are the advantages and disadvantages of ARI, NMI, and FMI, and when to use each?

These three tools: Adjusted Rand Index, Normalized Mutual Information, and Folkes-Mallows Index, are popular ways to measure how well algorithms group things together.

All three assess how well a predicted clustering matches the true labels (ground truth). They provide scores between -1 (complete disagreement) and 1 (perfect agreement). They're also useful for comparing different clustering algorithms or their settings.

However, they differ in their focus and sensitivity. ARI considers how often data points are grouped together (or apart) in both predicted and true labels. It accounts for random clustering but favors solutions with more clusters. NMI focuses on the information shared between the two clusterings and is less sensitive to cluster numbers, but doesn't directly address chance agreement. FMI is similar to NMI in terms of cluster sensitivity and focuses on correctly placed data points, also not directly addressing chance agreement.

Here's how to pick the best metric: Use ARI if the number of clusters varies significantly or chance agreement is a big concern. Use NMI if the number of clusters is less of an issue and you want to understand the information shared between the true and predicted clusterings. Finally, use FMI if the number of clusters is less of a concern and you want to focus on the number of correctly classified points.

Using Kmodes and Hierarchical Clustering, use the same dataset and perform categorical data clustering, use FMI, ARI, and NMI for the comparison of performance.

In [16]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score
from kmodes.kmodes import KModes
from sklearn.preprocessing import LabelEncoder

k = 3
def preprocess_and_encode_categorical_data(data):
    encoder = LabelEncoder()
    encoded_data = data.apply(encoder.fit_transform)
    return encoded_data

def perform_clustering_and_evaluate_performance(data, true_labels, num_clusters):
    km = KModes(n_clusters=num_clusters, init='Huang', n_init=5, verbose=0)
    km_labels = km.fit_predict(data)
    ARI_km = adjusted_rand_score(true_labels, km_labels)
    NMI_km = normalized_mutual_info_score(true_labels, km_labels)
    FMI_km = fowlkes_mallows_score(true_labels, km_labels)

    ac = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    ac_labels = ac.fit_predict(data)
    ARI_ac = adjusted_rand_score(true_labels, ac_labels)
    NMI_ac = normalized_mutual_info_score(true_labels, ac_labels)
    FMI_ac = fowlkes_mallows_score(true_labels, ac_labels)
    
    return [
        ["K-Modes", ARI_km, NMI_km, FMI_km],
        ["Hierarchical", ARI_ac, NMI_ac, FMI_ac]
    ]

results_categorical = []

for dataset_name in datasets:
    dataset = globals()[dataset_name]
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        try:
            X_cat = dataset.iloc[:, :-1]
            true_labels_cat = dataset.iloc[:, -1]

            X_cat_encoded = preprocess_and_encode_categorical_data(X_cat)
            clustering_results = perform_clustering_and_evaluate_performance(X_cat_encoded, true_labels_cat, k)  # Pass 'k' as parameter

            for method, ari, nmi, fmi in clustering_results:
                results_categorical.append([dataset_name + " (" + method + ")", ari, nmi, fmi])
        except UserWarning as e:
            print(f"Warning: {e}")

results_categorical_df = pd.DataFrame(results_categorical, columns=["Dataset", "ARI", "NMI", "FMI"])
print(results_categorical_df)



                            Dataset       ARI       NMI       FMI
0              soybean_df (K-Modes)  0.653689  0.837666  0.783908
1         soybean_df (Hierarchical)  0.653689  0.837666  0.783908
2                  zoo_df (K-Modes)  0.727582  0.729576  0.815845
3             zoo_df (Hierarchical)  0.461701  0.585056  0.645736
4        heart_disease_df (K-Modes)  0.094425  0.174192  0.410031
5   heart_disease_df (Hierarchical)  0.012003  0.013638  0.351372
6          dermatology_df (K-Modes)  0.523548  0.651525  0.681949
7     dermatology_df (Hierarchical)  0.031635  0.076679  0.291702
8        breast_cancer_df (K-Modes)  0.407178  0.436138  0.684170
9   breast_cancer_df (Hierarchical)  0.718035  0.630146  0.860889
10            mushroom_df (K-Modes)  0.169824  0.211118  0.531541
11       mushroom_df (Hierarchical)  0.448900  0.445047  0.711999


Write your report using Latex. Your report should be focused on the "why's and the what's" of each performance metrices (i.e. why is FMI always greater than ARI and NMI? What's the problem with ARI and NMI?).

In clustering analysis, various performance metrics are used to evaluate the quality of clustering results. Commonly used metrics include Adjusted Rand Index (ARI), Normalized Mutual Information (NMI), and Folkes-Mallows Index (FMI). In this report, we explore the characteristics and differences of these metrics, discussing their advantages and limitations.

ARI measures the similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned to the same or different clusters in the predicted and true clusterings. ARI produces a value between -1 and 1, where 1 indicates perfect similarity between the two clusterings.

NMI measures the mutual dependence between the predicted and true clusterings, normalized to produce a value between 0 and 1. NMI is based on information theory principles and is commonly used in clustering evaluation.

FMI measures the geometric mean of pairwise precision and recall, providing a single value that captures both clustering purity and homogeneity.

Each performance metric has its own strengths and weaknesses, and the choice of metric depends on the specific characteristics of the dataset and the goals of the clustering analysis. ARI is suitable for evaluating clustering algorithms when the ground truth labels are available, while NMI and FMI provide alternative perspectives on clustering quality that may be more robust to certain dataset properties.