# Mushroom dataset
Source: https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data

Compare AntClust against K-means on a categorical dataset.

In [71]:
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import LabelEncoder

# Fetch the raw Mushroom table; the file has no header row, so columns are numbered.
mushroom_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom_data = pd.read_csv(mushroom_url, header=None)

# Only the leading `subset` rows are used.
subset = 1000
mushroom_head = mushroom_data.iloc[:subset]

label_encoder = LabelEncoder()

# Columns 1.. are integer-encoded one column at a time; the same encoder
# object is refit on every column.
mushroom_encoded = mushroom_head.iloc[:, 1:].apply(label_encoder.fit_transform)

# Column 0 is encoded last, so `label_encoder` ends up fitted on it.
mushroom_label = mushroom_head.iloc[:, 0]
binary_labels = label_encoder.fit_transform(mushroom_label)

print(len(mushroom_encoded))

1000


K-Means

In [72]:
# Run K-means for k = 2, 3, 4 and keep each (k, labels) pair for later scoring.
cluster_results = []
for k in range(2, 5):
    # `fit` returns the estimator itself, so construction and fitting can be chained;
    # the fixed random_state keeps the initialisation repeatable.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(mushroom_encoded)
    cluster_results.append((k, kmeans.labels_))

# AntClust with Jaccard score as similarity

In [73]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

# make the AntClust directory importable
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
from importlib import reload
import distance_classes
reload(distance_classes)
# import the rule set
from rules import labroche_rules

# A single categorical similarity function drives the clustering.
f_sim = [distance_classes.similarity_categorical()]
ant_clust = AntClust(f_sim, labroche_rules())

# Each ant is a one-element list holding one encoded row
# (presumably one entry per similarity function in `f_sim` -- confirm against AntClust.fit).
ant = [
    [mushroom_encoded.iloc[row_idx]]
    for row_idx in range(len(mushroom_encoded))
]

ant_clust.fit(ant)
clusters_found = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 75000 / 75000
Meeting 67500 / 75000
Meeting 60000 / 75000
Meeting 52500 / 75000
Meeting 45000 / 75000
Meeting 37500 / 75000
Meeting 30000 / 75000
Meeting 22500 / 75000
Meeting 15000 / 75000
Meeting 7500 / 75000
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# Metrics

In [74]:
from sklearn import metrics
import pandas as pd
def _score_clustering(labels_true, labels_pred):
    """Score one clustering against the reference labels.

    Parameters
    ----------
    labels_true : array-like
        Reference class label for every sample.
    labels_pred : array-like
        Cluster id assigned to every sample, in the same order.

    Returns
    -------
    dict
        The four scores keyed by the column names used in the results table.
    """
    return {
        'Homogeneity': metrics.homogeneity_score(labels_true, labels_pred),
        'Completeness': metrics.completeness_score(labels_true, labels_pred),
        'V-measure': metrics.v_measure_score(labels_true, labels_pred),
        'Adjusted Rand-Index': metrics.adjusted_rand_score(labels_true, labels_pred),
    }


# (row label, cluster assignment) for every run to compare: AntClust first,
# then one entry per K-means setting, matching the original row order.
labelled_runs = [("AntClust (Jaccard similarity)", clusters_found)]
labelled_runs += [(f"K-means (k={k})", k_label) for k, k_label in cluster_results]

# Build the table in one go from a list of records instead of growing it
# row by row with pd.concat (which re-copies the frame on every iteration).
df = pd.DataFrame(
    [_score_clustering(binary_labels, run_labels) for _, run_labels in labelled_runs],
    index=[run_name for run_name, _ in labelled_runs],
)

df


Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand-Index
AntClust (Jaccard similarity),0.926437,0.114274,0.203452,0.046784
K-means (k=2),0.011993,0.00595,0.007954,0.023562
K-means (k=3),0.195499,0.058787,0.090393,0.033639
K-means (k=4),0.239852,0.058869,0.094535,0.059449
