# Wine clustering
Compare k-means with AntClust for clustering numerical data (Wine dataset: https://archive.ics.uci.edu/dataset/109/wine).

In [1]:
from sklearn.cluster import KMeans
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np

# Pull the Wine dataset that ships with scikit-learn
wine = datasets.load_wine()
data, labels = wine.data, wine.target
# The number of distinct ground-truth labels is the reference cluster count
num_cluster = np.unique(labels).size
print(f"Number of features: {data.shape[1]}\nNumber of data points: {data.shape[0]}")
print(num_cluster)

Number of features: 13
Number of data points: 178
3


# K-Means

In [10]:
# Fit k-means once for every candidate cluster count k = 2..8 and keep each
# run's label assignment as a (k, labels) pair in `cluster_results`.
from sklearn.cluster import KMeans

cluster_results = []
for k in range(2, 9):
    # Build the model with a fixed random_state and fit it on the feature matrix
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
    cluster_results += [(k, kmeans.labels_)]

# AntClust

In [11]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

# make the AntClust directory importable
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
from importlib import reload
import distance_classes
reload(distance_classes)
# import the rule set
from rules import labroche_rules

# Euclidean distance between every pair of samples in `data`
distances = pairwise_distances(data, metric="euclidean")

# Smallest and largest entries of the distance matrix.
# NOTE(review): the diagonal (each sample's distance to itself) is not masked
# out here, so the minimum is presumably 0.0 -- confirm whether the smallest
# non-zero distance was intended instead.
min_distance = distances.min()
max_distance = distances.max()

# A single similarity object, built from the distance range, is handed to
# AntClust together with the Labroche rule set. Each sample is wrapped in a
# one-element list before fitting (presumably one entry per similarity
# function -- confirm against the AntClust API).
# NOTE(review): no random seed is set before fitting; if AntClust's meeting
# phase is randomized, results may differ between runs -- confirm.
f_sim = [distance_classes.similarity_euclid(min_distance, max_distance)]
ant_clust = AntClust(f_sim, labroche_rules())
ant_clust.fit([[sample] for sample in data])
clusters_found = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 13350 / 13350
Meeting 12015 / 13350
Meeting 10680 / 13350
Meeting 9345 / 13350
Meeting 8010 / 13350
Meeting 6675 / 13350
Meeting 5340 / 13350
Meeting 4005 / 13350
Meeting 2670 / 13350
Meeting 1335 / 13350
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# Metrics

In [12]:
from sklearn import metrics
import pandas as pd
def _clustering_scores(true_labels, predicted_labels):
    """Score one clustering against the ground-truth labels.

    Parameters
    ----------
    true_labels : array-like
        Reference class label of every sample.
    predicted_labels : array-like
        Cluster label assigned to every sample, in the same order.

    Returns
    -------
    dict
        Maps each metric's display name to its value; the keys become the
        columns of the comparison table.
    """
    return {
        'Homogeneity': metrics.homogeneity_score(true_labels, predicted_labels),
        'Completeness': metrics.completeness_score(true_labels, predicted_labels),
        'V-measure': metrics.v_measure_score(true_labels, predicted_labels),
        'Adjusted Rand-Index': metrics.adjusted_rand_score(true_labels, predicted_labels),
    }


# Collect one (row label, scores) pair per clustering. The scores are kept in
# dedicated names so the global feature matrix `data` is NOT overwritten and
# the earlier cells stay re-runnable.
score_rows = [("AntClust (euclidean distance)", _clustering_scores(wine.target, clusters_found))]
for k, k_label in cluster_results:
    score_rows.append((f"K-means (k={k})", _clustering_scores(wine.target, k_label)))

# Build the table in one go instead of concatenating onto an empty frame
# row by row.
df = pd.DataFrame(
    [scores for _, scores in score_rows],
    index=[row_label for row_label, _ in score_rows],
)

df


Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand-Index
AntClust (euclidean distance),0.402736,0.404555,0.403644,0.404824
K-means (k=2),0.334199,0.587027,0.425919,0.369408
K-means (k=3),0.428812,0.428701,0.428757,0.371114
K-means (k=4),0.409747,0.336367,0.369449,0.288788
K-means (k=5),0.495139,0.351292,0.410993,0.311588
K-means (k=6),0.49576,0.334879,0.399739,0.290902
K-means (k=7),0.502329,0.299775,0.375477,0.22096
K-means (k=8),0.506082,0.275232,0.356553,0.197813
