# Titanic dataset
Source: https://github.com/mwaskom/seaborn-data/blob/master/titanic.csv

Test AntClust against K-means for a categorical dataset.

In [7]:
from sklearn.cluster import KMeans
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load Titanic dataset
titanic_data = sns.load_dataset('titanic')[['sex', 'alone', 'class', 'pclass', 'survived']]

label_encoder = LabelEncoder()
titanic_encoded = titanic_data.iloc[:,:-2].apply(label_encoder.fit_transform)
titanic_label = titanic_data.iloc[:, 0]
binary_labels = label_encoder.fit_transform(titanic_label)
print(len(binary_labels))
print(len(titanic_encoded))

891
891


K-Means

In [8]:
# Apply KMeans clustering
cluster_results = []
for k in range(2,5):
    # Initialize the KMeans model
    kmeans = KMeans(n_clusters=k, random_state=42)
    # Fit the model to the data
    kmeans.fit(titanic_encoded)
    cluster_results.append((k,kmeans.labels_))

# AntClust with Jaccard score as similarity

In [9]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
from importlib import reload
import distance_classes
reload(distance_classes)
# import the rule set
from rules import labroche_rules

f_sim = [distance_classes.similarity_categorical()]
ant_clust = AntClust(f_sim, labroche_rules())
ant = [[titanic_encoded.iloc[i]] for i in range(titanic_encoded.shape[0])]
ant_clust.fit(ant)
clusters_found = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 66825 / 66825
Meeting 53460 / 66825
Meeting 40095 / 66825
Meeting 26730 / 66825
Meeting 13365 / 66825
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# Metrics

In [10]:
from sklearn import metrics
import pandas as pd
df = pd.DataFrame()


homogeneity_score = metrics.homogeneity_score(binary_labels, clusters_found)
completeness_score = metrics.completeness_score(binary_labels, clusters_found)
v_score = metrics.v_measure_score(binary_labels, clusters_found)
ari_score = metrics.adjusted_rand_score(binary_labels, clusters_found)
pd_data = {
    'Homogeneity': homogeneity_score,
    'Completeness': completeness_score,
    'V-measure': v_score,
    'Adjusted Rand-Index': ari_score,
}

# Creating a new DataFrame with the data for the new row
new_row = pd.DataFrame(pd_data, index=["AntClust (Jaccard similarity)"])
df = pd.concat([df, new_row])

for k, k_label in cluster_results:
    homogeneity_score = metrics.homogeneity_score(binary_labels, k_label)
    completeness_score = metrics.completeness_score(binary_labels, k_label)
    v_score = metrics.v_measure_score(binary_labels, k_label)
    ari_score = metrics.adjusted_rand_score(binary_labels, k_label)
    pd_data = {
        'Homogeneity': homogeneity_score,
        'Completeness': completeness_score,
        'V-measure': v_score,
        'Adjusted Rand-Index': ari_score,
    }
    new_row = pd.DataFrame(pd_data, index=[f"K-means (k={k})"])
    df = pd.concat([df, new_row])

df


Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand-Index
AntClust (Jaccard similarity),1.0,0.234375,0.379747,0.199048
K-means (k=2),0.014472,0.013651,0.014049,0.02388
K-means (k=3),0.311123,0.1854,0.232344,0.165621
K-means (k=4),0.166684,0.079905,0.108025,0.109695
