# Clustering of Similar Images

In this example we cluster images of flowers from the 10 Category Flower Dataset (https://www.kaggle.com/datasets/olgabelitskaya/flower-color-images).

The idea is that AntClust can group images from different classes using the cosine similarity computed from a pretrained CLIP model.

### Data functions
The code below loads the image file names and their labels from the image folder.

In [3]:
import glob
import os 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import torch


path_img = "flower_images/"
df = pd.read_csv(f"{path_img}flower_labels.csv")

# glob returns matches in arbitrary, OS-dependent order; sort them so the
# image order is deterministic across machines.
# NOTE(review): this assumes the rows of flower_labels.csv are listed in
# sorted file-name order so that true_labels[i] belongs to image_names[i]
# - confirm against the CSV.
image_paths = sorted(glob.glob(f"{path_img}*.png"))

# Number of images to cluster. It currently covers the whole dataset; set a
# smaller value here to experiment on a subset first.
subset = len(image_paths)
true_labels = list(df["label"])[:subset]

# Keep only the file name. os.path.basename works with the path separator of
# the current OS, whereas splitting on "\\" only strips directories on Windows.
image_names = [os.path.basename(path) for path in image_paths[:subset]]

# AntClust Cosine Similarity

In [4]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt

# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
# import the precomputed distance matrix function for AntClust
from distance_classes import image_cosine_similarity
# import the rule set
from rules import labroche_rules

# ----------------------
#       data
# ----------------------


# ----------------------
#       AntClust
# ----------------------
# Build the list of similarity functions handed to AntClust. Here it holds a
# single image_cosine_similarity object created from the image folder and the
# list of image file names.
# NOTE(review): image_cosine_similarity is defined in distance_classes, not in
# this notebook; presumably it embeds the images with the pretrained CLIP model
# on construction (the cell output shows an encoding progress bar) - confirm.
# NOTE(review): this folder is spelled "Cosine similarity" here but
# "Cosine Similarity" in later cells; on a case-sensitive file system only one
# of the two can exist - confirm which is correct.
path_img = "../Cosine similarity/flower_images/"
f_sim = [image_cosine_similarity(path_img,image_names)]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
ant_clust = AntClust(f_sim, labroche_rules())

# Every data tuple handed to AntClust is a one-element list holding an image
# index in range(subset).
# NOTE(review): presumably image_cosine_similarity resolves these indices
# against the image list it was constructed with - confirm in distance_classes.
ant_clust.fit([[i] for i in range(subset)])

# get the clustering result
clusters_found = ant_clust.get_clusters()

print()
print()

# Fraction of positions where the cluster id equals the ground-truth label.
# NOTE(review): cluster ids are assigned by the clustering algorithm, so this
# number depends on how the ids happen to be numbered and is not a
# permutation-invariant quality measure; the adjusted Rand index computed in
# the "Metrics" section is the figure to rely on.
correct = sum(
    1 for truth, found in zip(true_labels, clusters_found) if truth == found
)
# Guard against an empty label list so the cell does not die on a division by zero.
accuracy = correct / len(true_labels) if true_labels else float("nan")
print(f"Accuracy {accuracy}")

AntClust: phase 1 of 3 -> meeting ants
Meeting 15750 / 15750
Meeting 14175 / 15750
Meeting 12600 / 15750
Meeting 11025 / 15750
Meeting 9450 / 15750
Meeting 7875 / 15750
Meeting 6300 / 15750
Meeting 4725 / 15750
Meeting 3150 / 15750
Meeting 1575 / 15750
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


Accuracy 0.12857142857142856


# K-means 

In [None]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import shutil

# Function to Extract features from the images
def image_feature(direc, image_dir="../Cosine Similarity/flower_images/", show_progress=False):
    """Extract one InceptionV3 feature vector per image.

    Parameters
    ----------
    direc : iterable of str
        Image file names, relative to ``image_dir``.
    image_dir : str, optional
        Folder that contains the images. Defaults to the folder that was
        previously hard-coded in this function, so existing calls behave the
        same.
    show_progress : bool, optional
        Show a tqdm progress bar while iterating. Off by default, matching the
        previous ``disable=True`` behaviour.

    Returns
    -------
    features : list of numpy.ndarray
        For each image, the flattened output of InceptionV3 (ImageNet weights,
        classification head removed).
    img_name : list of str
        The file names, in the same order as ``features``.
    """
    model = InceptionV3(weights='imagenet', include_top=False)
    features = []
    img_name = []

    for file_name in tqdm(direc, disable=not show_progress):
        fname = os.path.join(image_dir, file_name)
        img = image.load_img(fname, target_size=(224, 224))
        # The model expects a batch axis, so a single image becomes a batch of one.
        x = np.expand_dims(img_to_array(img), axis=0)
        x = preprocess_input(x)
        # verbose=0 keeps Keras from printing one progress line per image.
        feat = model.predict(x, verbose=0)
        features.append(feat.flatten())
        img_name.append(file_name)
    return features, img_name

img_features,img_name=image_feature(image_names)

In [8]:
# Group the InceptionV3 feature vectors with K-means.
k = 10  # number of classes in dataset
clusters = KMeans(n_clusters=k, random_state=40)
clusters.fit(img_features)

Accuracy 0.01904761904761905


# AntClust with ORB similarity


In [43]:
def compute_orb_image_features(images, image_resize_size):
    """Return the OpenCV ORB descriptors for every image.

    ``images`` is a sequence whose entries hold the image at index 0.
    ``image_resize_size`` is accepted but currently unused, because no
    resizing is performed.

    The result is a list with one entry per input image; each entry is a
    one-element list containing the descriptors returned by
    ``detectAndCompute`` for that image.
    """
    detector = cv.ORB_create()
    # detectAndCompute yields (keypoints, descriptors); only the descriptors
    # (index 1) are kept.
    return [[detector.detectAndCompute(entry[0], None)[1]] for entry in images]

In [56]:
# ----------------------
#       imports
# ----------------------
# import sklearn distance function
from sklearn.metrics.pairwise import manhattan_distances
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt

# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
# import the precomputed distance matrix function for AntClust
from distance_classes import precomputed_similarity_matrix, opencv_orb_similarity
# import the rule set
from rules import labroche_rules

# ----------------------
#       data
# ----------------------
# This cell uses numpy; import it here so the cell does not silently depend on
# the K-means cell having been executed first.
import numpy as np

image_resize_size = (553, 500)
true_labels = list(df["label"])[:subset]
# Reduce every entry to its bare file name. os.path.basename follows the path
# separator of the current OS and leaves names that are already bare unchanged.
image_names = [os.path.basename(name) for name in image_names]
path_img = "../Cosine Similarity/flower_images/"

# Load every image as grayscale, wrapped in a one-element list because
# compute_orb_image_features reads the image from index 0 of each entry.
image_data = []
for image_file in image_names:
    img = cv.imread(path_img + image_file, cv.IMREAD_GRAYSCALE)
    if img is None:
        # cv.imread returns None instead of raising when a file cannot be read;
        # fail here with the offending path rather than later inside OpenCV.
        raise FileNotFoundError(f"Could not read image: {path_img + image_file}")
    image_data.append([img])

image_orbs = compute_orb_image_features(image_data, image_resize_size)
data = np.array(image_orbs, dtype=list)
labels = np.array(true_labels)

# Pairwise ORB similarity between all images.
orb_sim = opencv_orb_similarity()
distance_matrix = [
    [orb_sim.similarity(data[i][0], data[n][0]) for n in range(len(data))]
    for i in range(len(data))
]

# sklearn needs it in the way that 0 means a==b
# ant clust needs it in the way 1 means a==b
# NOTE(review): the values are turned into distances (1 - similarity) here and
# later passed to AntClust through precomputed_similarity_matrix; presumably
# that class expects distance rows - confirm in distance_classes.
distance_matrix = 1 - np.array(distance_matrix)
# AntClust needs every data tuple as an array.
# e.g. [1,2,3] needs to be [[1],[2],[3]]
distance_matrix = [[i] for i in distance_matrix]

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

In [32]:
# ----------------------
#       AntClust
# ----------------------
# Run AntClust on the ORB-based matrix built in the previous cell.
# The only similarity function is precomputed_similarity_matrix, so the data
# tuples passed to fit() are the wrapped rows of distance_matrix.
# Note: f_sim and ant_clust are rebound here and no longer refer to the
# objects created for the cosine-similarity run.
f_sim = [precomputed_similarity_matrix()]
ant_clust = AntClust(f_sim, labroche_rules())

# find clusters by using the distance matrix of the data
ant_clust.fit(distance_matrix)

# get the clustering result (kept under its own name so the cosine result
# in clusters_found is not overwritten)
clusters_found_orb = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 15750 / 15750
Meeting 14175 / 15750
Meeting 12600 / 15750
Meeting 11025 / 15750
Meeting 9450 / 15750
Meeting 7875 / 15750
Meeting 6300 / 15750
Meeting 4725 / 15750
Meeting 3150 / 15750
Meeting 1575 / 15750
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# Metrics

In [34]:
from sklearn.metrics import adjusted_rand_score
# Adjusted Rand index of each clustering against the ground-truth labels.
ari_ant_cos, ari_ant_orb, ari_kmeans = (
    adjusted_rand_score(true_labels, predicted)
    for predicted in (clusters_found, clusters_found_orb, clusters.labels_)
)

print(f"ARI for AntClust (Cosine Similarity) {ari_ant_cos}")
print(f"ARI for AntClust (ORB Similarity) {ari_ant_orb}")
print(f"ARI for K-means (k={k}) {ari_kmeans}")

ARI for AntClust (Cosine Similarity) 0.6389888868288874
ARI for AntClust (ORB Similarity) 0.011252503833263665
ARI for K-means (k=10) 0.23139984255616106
