# Breast Cancer clustering
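Compare k-means with AntClust for clustering breast cancer data (scikit-learn's `load_breast_cancer`, a copy of the UCI Breast Cancer Wisconsin (Diagnostic) dataset: https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic)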

In [5]:
from sklearn.cluster import KMeans
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer


# Pull the breast cancer dataset that ships with scikit-learn and split it
# into the feature matrix (`data`) and the ground-truth classes (`labels`).
breast_cancer = load_breast_cancer()
data, labels = breast_cancer.data, breast_cancer.target

# The number of distinct target values is the reference cluster count.
num_cluster = np.unique(labels).size
print(f"Number of features: {len(data[0])}\nNumber of data points: {len(data)}\nNumber of classes: {num_cluster}")


Number of features: 30
Number of data points: 569
Number of classes: 2


# K-Means

In [2]:
# Run k-means once for every candidate cluster count k = 2..8 and keep the
# resulting label assignment per k so the runs can be scored later on.
from sklearn.cluster import KMeans

cluster_results = []
for k in range(2, 9):
    # `fit` returns the estimator itself, so construction and fitting chain;
    # the fixed random_state keeps the centroid initialisation repeatable.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
    cluster_results.append((k, kmeans.labels_))

# AntClust

In [3]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
from importlib import reload
import distance_classes
reload(distance_classes)
# import the rule set
from rules import labroche_rules

# Full pairwise Euclidean distance matrix between all samples
distances = pairwise_distances(data, metric='euclidean')

# Bounds handed to the similarity function.
# NOTE(review): the minimum is taken over the whole matrix, so the
# self-distances on the diagonal are included and are NOT masked out here --
# confirm whether similarity_euclid expects the smallest off-diagonal distance.
min_distance = distances.min()
max_distance = distances.max()

# One similarity function per feature slot; every sample is wrapped in a
# single-element list so the whole feature vector is that one slot.
f_sim = [distance_classes.similarity_euclid(min_distance, max_distance)]
ant_clust = AntClust(f_sim, labroche_rules())
ant_clust.fit([[sample] for sample in data])
clusters_found = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 42675 / 42675
Meeting 34140 / 42675
Meeting 25605 / 42675
Meeting 17070 / 42675
Meeting 8535 / 42675
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# Metrics

In [4]:
from sklearn import metrics
import pandas as pd


def clustering_scores(true_labels, predicted_labels):
    """Score one clustering against the ground-truth labels.

    Parameters
    ----------
    true_labels : array-like
        Reference class label per sample.
    predicted_labels : array-like
        Cluster label per sample, in the same order as ``true_labels``.

    Returns
    -------
    dict
        Metric name -> value for homogeneity, completeness, V-measure and
        the adjusted Rand index; the keys become the table's column names.
    """
    return {
        'Homogeneity': metrics.homogeneity_score(true_labels, predicted_labels),
        'Completeness': metrics.completeness_score(true_labels, predicted_labels),
        'V-measure': metrics.v_measure_score(true_labels, predicted_labels),
        'Adjusted Rand-Index': metrics.adjusted_rand_score(true_labels, predicted_labels),
    }


# Collect one record per clustering and build the table in a single step
# instead of concatenating onto an empty DataFrame row by row.
# The records are deliberately NOT stored under the name `data`: that name
# holds the feature matrix used by the K-Means and AntClust cells, and
# rebinding it here would break those cells when they are re-run.
row_labels = ["AntClust (euclidean distance)"]
score_rows = [clustering_scores(breast_cancer.target, clusters_found)]

for k, k_label in cluster_results:
    row_labels.append(f"K-means (k={k})")
    score_rows.append(clustering_scores(breast_cancer.target, k_label))

df = pd.DataFrame(score_rows, index=row_labels)

df


Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand-Index
AntClust (euclidean distance),0.578839,0.273374,0.371361,0.503367
K-means (k=2),0.422291,0.516809,0.464793,0.491425
K-means (k=3),0.447857,0.451041,0.449444,0.501563
K-means (k=4),0.57505,0.333277,0.421986,0.412743
K-means (k=5),0.601928,0.297758,0.398425,0.34181
K-means (k=6),0.604317,0.274087,0.377129,0.31349
K-means (k=7),0.629624,0.245605,0.353368,0.233988
K-means (k=8),0.635252,0.248015,0.356748,0.237783
