# Digits clustering
Compare k-means with Antclust for comparing breast cancer data (digits dataset https://scikit-learn.org/stable/auto_examples/datasets/plot_digits_last_image.html#:~:text=This%20dataset%20is%20made%20up,feature%20vector%20with%20length%2064.)

In [6]:
from sklearn.cluster import KMeans
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits


# Load the digits dataset
digits = load_digits()
data = digits.data
labels = digits.target
num_cluster = len(np.unique(labels))
print(f"Number of features: {len(data[0])}\nNumber of data points: {len(data)}\nNumber of classes: {num_cluster}")


Number of features: 64
Number of data points: 1797
Number of classes: 10


# K-Means

In [13]:
#Finding the optimum number of clusters for k-means classification
from sklearn.cluster import KMeans

cluster_results = []
for k in range(8,13):
    # Initialize the KMeans model
    kmeans = KMeans(n_clusters=k, random_state=42)
    # Fit the model to the data
    kmeans.fit(data)
    cluster_results.append((k,kmeans.labels_))

# AntClust

In [3]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
from importlib import reload
import distance_classes
reload(distance_classes)
# import the rule set
from rules import labroche_rules

# Compute pairwise Euclidean distances
distances = pairwise_distances(data, metric='euclidean')

# Get the minimum and maximum distances
min_distance = np.min(distances)  # Exclude zeros on the diagonal
max_distance = np.max(distances)

f_sim = [distance_classes.similarity_euclid(min_distance, max_distance)]
ant_clust = AntClust(f_sim, labroche_rules())
ant_clust.fit([[d] for d in data])
clusters_found = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 134775 / 134775
Meeting 107820 / 134775
Meeting 80865 / 134775
Meeting 53910 / 134775
Meeting 26955 / 134775
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# AntClust Cosine Similarity

In [12]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
from importlib import reload
import distance_classes
reload(distance_classes)
# import the rule set
from rules import labroche_rules

f_sim = [distance_classes.image_cosine_similarity(img_tensor=data)]
ant_clust = AntClust(f_sim, labroche_rules())
ant_clust.fit([[d] for d in range(len(data))])
clusters_found_cos = ant_clust.get_clusters()

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

AntClust: phase 1 of 3 -> meeting ants
Meeting 134775 / 134775
Meeting 107820 / 134775
Meeting 80865 / 134775
Meeting 53910 / 134775
Meeting 26955 / 134775
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# Metrics

In [16]:
from sklearn import metrics
import pandas as pd
df = pd.DataFrame()


homogeneity_score = metrics.homogeneity_score(digits.target, clusters_found)
completeness_score = metrics.completeness_score(digits.target, clusters_found)
v_score = metrics.v_measure_score(digits.target, clusters_found)
ari_score = metrics.adjusted_rand_score(digits.target, clusters_found)
pd_data = {
    'Homogeneity': homogeneity_score,
    'Completeness': completeness_score,
    'V-measure': v_score,
    'Adjusted Rand-Index': ari_score,
}

# Creating a new DataFrame with the data for the new row
new_row = pd.DataFrame(pd_data, index=["AntClust (euclidean distance)"])
df = pd.concat([df, new_row])

for k, k_label in cluster_results:
    homogeneity_score = metrics.homogeneity_score(digits.target, k_label)
    completeness_score = metrics.completeness_score(digits.target, k_label)
    v_score = metrics.v_measure_score(digits.target, k_label)
    ari_score = metrics.adjusted_rand_score(digits.target, k_label)
    pd_data = {
        'Homogeneity': homogeneity_score,
        'Completeness': completeness_score,
        'V-measure': v_score,
        'Adjusted Rand-Index': ari_score,
    }
    new_row = pd.DataFrame(pd_data, index=[f"K-means (k={k})"])
    df = pd.concat([df, new_row])

df


Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand-Index
AntClust (euclidean distance),0.809031,0.6401,0.714719,0.519198
K-means (k=8),0.669316,0.764487,0.713743,0.57967
K-means (k=9),0.692027,0.75002,0.719858,0.597194
K-means (k=10),0.737745,0.745229,0.741468,0.664962
K-means (k=11),0.790131,0.763908,0.776799,0.73117
K-means (k=12),0.793983,0.744832,0.768623,0.703364
