# Clustering of MNIST 

In this example we try to cluster images from MNIST.  

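The idea is that AntClust can separate images of different digit classes by using the cosine similarity computed with a pretrained CLIP model. For comparison, the same subset is also clustered with AntClust on ORB feature similarity and with K-means.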

### Data functions
These functions load MNIST through Keras and extract a class-balanced subset of the training images.

In [1]:
import glob
import os 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import torch
from sklearn.utils import shuffle
import numpy as np

# Importing the dataset from keras
from keras.datasets import mnist

def extract_balanced_subset(data, targets, num_samples_per_class, num_classes=10):
    """Collect the first ``num_samples_per_class`` samples of every class.

    Samples are scanned in order. A sample is kept while its class is still
    below the quota, and the scan stops as soon as every class
    ``0 .. num_classes - 1`` has reached the quota.

    Parameters
    ----------
    data : sequence or array
        Samples, indexable by position.
    targets : sequence
        One label per sample. Each label may be a plain scalar or a
        one-element sequence/array such as ``(3,)``; only the first element
        is used.
    num_samples_per_class : int
        Maximum number of samples kept for each class.
    num_classes : int, optional
        Number of classes; labels must lie in ``range(num_classes)``.
        Defaults to 10.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The selected samples and their integer labels, in scan order.

    Raises
    ------
    KeyError
        If a label outside ``range(num_classes)`` is encountered.
    """
    subset_data = []
    subset_targets = []
    class_samples_count = {label: 0 for label in range(num_classes)}
    # Number of classes that have not reached their quota yet; tracking this
    # avoids re-checking every counter after each accepted sample.
    unfilled_classes = num_classes

    for i in range(len(data)):
        # Accept both scalar labels and one-element wrappers like (3,) or [3].
        label = int(np.asarray(targets[i]).reshape(-1)[0])
        if class_samples_count[label] < num_samples_per_class:
            subset_data.append(data[i])
            subset_targets.append(label)
            class_samples_count[label] += 1

            if class_samples_count[label] == num_samples_per_class:
                unfilled_classes -= 1
                if unfilled_classes == 0:
                    break

    return np.array(subset_data), np.array(subset_targets)

# Load MNIST; only the training split is sub-sampled below
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Add a trailing axis so that every label becomes its own one-element row
y_train = y_train[:, np.newaxis]

# Shuffle images and labels together with a fixed seed
x_train, y_train = shuffle(x_train, y_train, random_state=42)

# One 1-tuple per label; extract_balanced_subset reads element 0 of each
y_train_labels = list(map(tuple, y_train))

# Number of images to keep for each class
num_samples_per_class = 100

# Pick the balanced subset from the shuffled training data
balanced_subset_data, balanced_subset_targets = extract_balanced_subset(
    x_train, y_train_labels, num_samples_per_class
)

# Report the shapes of the balanced subset
print("Balanced Subset Data Shape:", balanced_subset_data.shape)
print("Balanced Subset Targets Shape:", balanced_subset_targets.shape)

# From here on x_train / y_train refer to the balanced subset only
x_train, y_train = balanced_subset_data, balanced_subset_targets
true_labels = y_train



Balanced Subset Data Shape: (1000, 28, 28)
Balanced Subset Targets Shape: (1000,)


# AntClust Cosine Similarity

In [8]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt
from importlib import reload
from sklearn.metrics import adjusted_rand_score
import numpy as np
# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
# import the precomputed distance matrix function for AntClust
import distance_classes
reload(distance_classes)

# import the rule set
from rules import labroche_rules

# ----------------------
#       data
# ----------------------
# No separate data preparation in this cell: the x_train images defined
# earlier in the notebook are handed directly to the similarity object below.

# ----------------------
#       AntClust
# ----------------------
# Similarity function for AntClust: an image cosine similarity object built
# from the x_train images (not a precomputed similarity matrix).
# NOTE(review): presumably the images are embedded once at construction time
# (the cell output shows a "Batches" progress bar) - confirm in distance_classes.
f_sim = [distance_classes.image_cosine_similarity(img_tensor=x_train)]


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Build AntClust from the cosine similarity function and the Labroche rule set
ant_clust = AntClust(f_sim, labroche_rules())

# Each sample is represented by its index, wrapped in a one-element list
sample_indices = [[idx] for idx in range(len(x_train))]
ant_clust.fit(sample_indices)

# Keep the clustering result both from the labels_ attribute and from get_clusters()
clusters_found = ant_clust.labels_
clusters_found_cos = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 75000 / 75000
Meeting 67500 / 75000
Meeting 60000 / 75000
Meeting 52500 / 75000
Meeting 45000 / 75000
Meeting 37500 / 75000
Meeting 30000 / 75000
Meeting 22500 / 75000
Meeting 15000 / 75000
Meeting 7500 / 75000
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# K-means 

In [10]:
# Data normalization for K-means.
# Convert to float32, divide by 255 and flatten every image into one feature
# row. The result is stored under a new name (X_train) and x_train itself is
# left untouched: rebinding x_train here would make the cell non-idempotent
# (a second run would divide by 255 again) and would hand float images to
# every later cell that still expects the original image array.
X_train = (x_train.astype('float32') / 255.0).reshape(len(x_train), -1)


In [11]:
import numpy as np
from sklearn.cluster import KMeans
# One cluster per class present in the subset that is actually clustered
# (true_labels), rather than in the unrelated MNIST test split.
total_clusters = len(np.unique(true_labels))
# Initialize the K-Means model; the fixed seed makes the reported ARI
# reproducible across runs (same seed as the dataset shuffle).
kmeans = KMeans(n_clusters=total_clusters, random_state=42)
# Fit the model to the flattened, normalized training subset
kmeans.fit(X_train)

KMeans(n_clusters=10)

# AntClust ORB Similarity

In [5]:
# ----------------------
#       imports
# ----------------------
# import sklearn distance function
from sklearn.metrics.pairwise import manhattan_distances
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt

# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
# import the precomputed distance matrix function for AntClust
from distance_classes import precomputed_similarity_matrix, opencv_orb_similarity
# import the rule set
from rules import labroche_rules

def compute_orb_image_features(images):
    """Extract OpenCV ORB feature descriptors for every image.

    Returns a pair ``(descriptors, not_index)``. ``descriptors`` holds one
    single-element list ``[des]`` for each image ORB produced descriptors for,
    and ``not_index`` lists the positions of the images that produced none.
    """
    # Configuration of the ORB detector used for feature extraction
    orb_params = dict(
        nfeatures=200,
        scaleFactor=1.2,
        nlevels=8,
        edgeThreshold=10,
        patchSize=16,
    )
    detector = cv.ORB_create(**orb_params)

    descriptors, not_index = [], []
    for position, img in enumerate(images):
        # Only the descriptors are needed; the key points are discarded
        _, des = detector.detectAndCompute(img, None)
        if des is None:
            # Nothing was extracted for this image; remember where it was
            not_index.append(position)
            continue
        descriptors.append([des])
    return descriptors, not_index

# ----------------------
#       data
# ----------------------
# Use the untouched balanced subset instead of x_train: the K-means cell
# earlier in the notebook rebinds x_train to float32 values, while the ORB
# detector is meant to run on the original 8-bit images. Reading
# balanced_subset_data keeps this cell independent of execution order.
image_data = balanced_subset_data
image_orbs, not_index = compute_orb_image_features(image_data)
# Every entry of image_orbs is [descriptor_array]. The plain list is kept
# as-is because the descriptor arrays can differ in length from image to
# image, so forcing them into one NumPy array is fragile.
data = image_orbs
# Labels aligned with `data`: images without ORB descriptors are dropped
missing_idx = set(not_index)
labels = np.array([label for i, label in enumerate(true_labels) if i not in missing_idx])
# Pairwise ORB similarity between all images that have descriptors
orb_sim = opencv_orb_similarity()
distance_matrix = [
    [orb_sim.similarity(row[0], col[0]) for col in data]
    for row in data
]
print(len(distance_matrix))
# sklearn needs it in the way that 0 means a==b
# ant clust needs it in the way 1 means a==b
# NOTE(review): the matrix is inverted here (1 - similarity) and afterwards
# passed to AntClust together with precomputed_similarity_matrix - confirm
# which convention that class expects for its rows.
distance_matrix = 1 - np.array(distance_matrix)
# AntClust needs every data tuple as an array.
# e.g. [1,2,3] needs to be [[1],[2],[3]]
distance_matrix = [[i] for i in distance_matrix]


931


In [6]:
# ----------------------
#       AntClust
# ----------------------
# The ORB values were computed up front, so AntClust gets the
# precomputed-matrix similarity object and the wrapped matrix rows as data.
orb_matrix_similarity = precomputed_similarity_matrix()
f_sim = [orb_matrix_similarity]
ant_clust = AntClust(f_sim, labroche_rules())

# Run the clustering on the rows of the precomputed matrix
ant_clust.fit(distance_matrix)

# Clustering result for the ORB run
clusters_found_orb = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 69825 / 69825
Meeting 55860 / 69825
Meeting 41895 / 69825
Meeting 27930 / 69825
Meeting 13965 / 69825
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# Metrics

In [12]:
from sklearn.metrics import adjusted_rand_score
# The cosine and K-means runs cover every image of the balanced subset
ari_ant_cos = adjusted_rand_score(true_labels, clusters_found)
ari_kmeans = adjusted_rand_score(true_labels, kmeans.labels_)
# The ORB run only covers images that produced descriptors, so the labels of
# the skipped images are dropped first. A set keeps the membership test cheap.
orb_skipped = set(not_index)
filtered_labels = [label for i, label in enumerate(true_labels) if i not in orb_skipped]
ari_ant_orb = adjusted_rand_score(filtered_labels, clusters_found_orb)

print(f"ARI for AntClust (Cosine Similarity) {ari_ant_cos}")
print(f"ARI for AntClust (ORB Similarity) {ari_ant_orb}")
print(f"ARI for K-means (k={total_clusters}) {ari_kmeans}")

ARI for AntClust (Cosine Similarity) 0.12179987677390727
ARI for AntClust (ORB Similarity) 0.059549661048787034
ARI for K-means (k=10) 0.34392891199081566
