This notebook defines all functions needed to test the clustering performance of AntClust on images.
It will expect that the images for each cluster reside in the a foldes (called "data") with the following structure:
```
data
  folder_with_images_cluster_0
    image_0
    image_1
  folder_with_images_cluster_1
    image_0
    image_1
  folder_with_images_cluster_2
    ...
    ...
  
```

In [1]:
# import matplotlib
import math
import os
import random as rng
import sys
import time
from concurrent.futures import ProcessPoolExecutor

import cv2 as cv
import matplotlib.pyplot as plt
import numpy as np
# make AntClus dir known
import sys
sys.path.append("../AntClust")
from AntClust import AntClust
from distance_classes import (
    opencv_image_flann_similarity,
    opencv_image_orb_similarity,
    opencv_orb_similarity,
    precomputed_similarity_matrix
)
from matplotlib import cm
from matplotlib.ticker import LinearLocator
from rules import labroche_rules

# data functions

In [2]:
def data_cluster_images_static(
    data_folder, num_clusters, num_images_per_cluster, seed=3
):
    """
    Will generate num_clusters clusters with num_images_per_cluster pictures each
    """
    # get the cars
    car_dir_names = sorted(os.listdir(data_folder), key=lambda x: x)

    # remove ipycheckpoint
    if car_dir_names[0] == ".ipynb_checkpoints":
        car_dir_names = car_dir_names[1::]

    # make a random shuffle of the cars/folders to take
    rng.seed(seed)
    cars_to_take = rng.sample(car_dir_names, len(car_dir_names))
    cars_to_take = cars_to_take[0:num_clusters:]

    # make the cluster data and the labels
    # generate num_clusters and add num_images_per_cluster to each cluster
    cluster_image: list = []
    cluster_labels: list = []

    label_counter = 0
    for car_folder in cars_to_take:
        # take images and shuffle them
        imgs = sorted(os.listdir(data_folder + "/" + car_folder), key=lambda x: x)
        imgs = rng.sample(imgs, len(imgs))

        # make data and labels
        # add the respectiv car folder as path
        cluster_image = cluster_image + [
            str(car_folder) + "/" + image for image in imgs[0:num_images_per_cluster:]
        ]
        cluster_labels = cluster_labels + [label_counter] * num_images_per_cluster
        label_counter += 1

    # read the images as opencv images from disk
    image_data = []
    for image_file in cluster_image:
        image_data.append(
            [cv.imread(data_folder + "/" + image_file, cv.IMREAD_GRAYSCALE)]
        )

    return cluster_image, image_data, cluster_labels


def data_cluster_images_dynamic(
    data_folder,
    num_clusters,
    num_images_per_cluster_min,
    num_images_per_cluster_max,
    seed=3,
):
    """
    Will generate num_clusters clusters where a random ammount of images in
    the range [num_images_per_cluster_min, num_images_per_cluster_max]
    """
    # get the cars
    car_dir_names = sorted(os.listdir(data_folder), key=lambda x: x)

    # remove ipycheckpoint
    if car_dir_names[0] == ".ipynb_checkpoints":
        car_dir_names = car_dir_names[1::]

    # make a random shuffle of the cars/folders to take
    rng.seed(seed)
    cars_to_take = rng.sample(car_dir_names, len(car_dir_names))
    cars_to_take = cars_to_take[0:num_clusters:]

    # make the cluster data and the labels
    # generate num_clusters and add a random ammount of images (in min, max range) to each cluster
    cluster_image: list = []
    cluster_labels: list = []

    label_counter = 0
    for car_folder in cars_to_take:
        # take images and shuffle them
        imgs = sorted(os.listdir(data_folder + "/" + car_folder), key=lambda x: x)
        imgs = rng.sample(imgs, len(imgs))

        # make data and labels
        num_images = rng.randint(num_images_per_cluster_min, num_images_per_cluster_max)
        # add the respectiv car folder as path
        cluster_image = cluster_image + [
            str(car_folder) + "/" + image for image in imgs[0:num_images:]
        ]
        cluster_labels = cluster_labels + [label_counter] * num_images
        label_counter += 1

    # read the images as opencv images from disk
    # and put them into their own array as data tuple
    image_data = []
    for image_file in cluster_image:
        image_data.append(
            [cv.imread(data_folder + "/" + image_file, cv.IMREAD_GRAYSCALE)]
        )

    return cluster_image, image_data, cluster_labels


def compute_orb_image_features(images, image_resize_size):
    # Initiate ORB detector for feature extraction
    orb = cv.ORB_create()
    descriptors = []
    # compute key points and descriptors
    for image in images:
        img = image[0]
        img = cv.resize(img, image_resize_size)
        kp, des = orb.detectAndCompute(img, None)
        descriptors.append([des])

    return descriptors

# Ant Clust parameter search

In [3]:
# test variables
# --------------
num_clusters = 30
data_folder = "data"
values_per_cluster = 18
seed = 9
image_resize_size = (150, 172)

# parameters
alpha_ant_meeting_iterations = [150, 200, 250, 300, 400]
betta_template_init_meetings = [0.3, 0.5, 0.7]
nest_shrink_prop = [0.1, 0.2, 0.3, 0.4, 0.5]
nest_removal_prop = [0.1, 0.2, 0.3, 0.4, 0.5]

params = {}
i = 0
for alpha in alpha_ant_meeting_iterations:
    for betta in betta_template_init_meetings:
        for shrink in nest_shrink_prop:
            for removal in nest_removal_prop:
                params[i] = {
                    "alpha": alpha,
                    "betta": betta,
                    "shrink": shrink,
                    "removal": removal,
                }
                i += 1


# result arrays
ari_antclust = []


# test loop
t_0 = time.time()
i = 0
for key in list(params.keys()):
    alpha = params[key]["alpha"]
    betta = params[key]["betta"]
    shrink = params[key]["shrink"]
    removal = params[key]["removal"]
    t_1 = time.time()
    print(f"testing params {i} out of {len(list(params.keys()))}")
    i += 1
    print(params[key])
    # print(f"alpha {alpha}")
    # print(f"betta {betta}")
    # print(f"shrink {shrink}")
    # print(f"removal {removal}")
    # Data generation
    # ----------------
    image_names, images, labels = data_cluster_images_static(
        data_folder, num_clusters, values_per_cluster, seed
    )
    images = compute_orb_image_features(images, image_resize_size)
    data = np.array(images, dtype=list)
    labels = np.array(labels)
    # AntClust
    # ----------
    # similarity function
    f_sim = [opencv_descriptor_flann_similarity(max_distance=70)]
    # rules
    rules = labroche_rules()
    # AntClust
    ant_clust = AntClust(
        f_sim,
        rules,
        alpha_ant_meeting_iterations=alpha,
        betta_template_init_meetings=betta,
        nest_shrink_prop=shrink,
        nest_removal_prop=removal,
        print_status=False,
    )
    # find clusters
    ant_clust.fit(data)
    # get the clustering result
    y_pred = ant_clust.get_clusters()
    # calculate the ari score
    ari_score = adjusted_rand_score(labels, y_pred)
    # append score
    ari_antclust.append(ari_score)
    # print test time
    print(f"testing took {time.time()-t_1} seconds")
    print()

print(f"testing took {time.time() - t_0} seconds")

# saving best params
print(f' min ari score {min(ari_antclust)}')
print(f' max ari score {max(ari_antclust)}')

params_best_key = np.where(np.array(ari_antclust) == max(ari_antclust))[0][0]
ant_clust_params = params[params_best_key]

print('best params')
print(ant_clust_params)

testing params 0 out of 375
{'alpha': 150, 'betta': 0.3, 'shrink': 0.1, 'removal': 0.1}


FileNotFoundError: [Errno 2] No such file or directory: 'data'