In [216]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import os.path
import shutil
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import cv2
import os
import sys
from os.path import exists
from scipy import spatial

## Settings

In [217]:
# Input: directory whose files are listed and embedded by the cells below.
search_dir = '/home/drevital/pallet_detection/feb23_task_1'

# Output root: holds the cached feature file and the per-cluster folders.
clusters_base_path = '/home/drevital/pallet_detection/f1'
features_path = os.path.join(clusters_base_path, 'features.csv')
os.makedirs(clusters_base_path, exist_ok=True)

# NOTE(review): im_height / im_width are not referenced by the visible cells.
im_height, im_width = 200, 600

# Number of clusters requested from the agglomerative clustering step.
num_clusters = 10

## Similarity Function

In [218]:
def similarity(f1, f2):
    """Return the cosine distance between two feature vectors.

    Despite the name, the value is a *distance*: vectors pointing in the same
    direction give 0, and the value grows as they diverge.
    """
    cosine_distance = spatial.distance.cosine(f1, f2)
    return cosine_distance

## Clustering Function

In [219]:
def cluster_images(feature_vectors, similarity_function, num_clusters):
    """Cluster items with average-linkage agglomerative clustering.

    Args:
        feature_vectors: Sequence of feature vectors, one per item.
        similarity_function: Callable taking two feature vectors and returning
            a pairwise value that is passed to the clustering as a precomputed
            distance (smaller means more alike).
        num_clusters: Number of clusters to produce.

    Returns:
        A list of ``num_clusters`` lists; entry ``k`` holds the indices (into
        ``feature_vectors``) of the items assigned to cluster label ``k``.

    Raises:
        ValueError: If fewer than two feature vectors are supplied, since no
            pairwise distance can be computed.
    """
    size = len(feature_vectors)
    if size < 2:
        raise ValueError(
            f'cluster_images needs at least 2 feature vectors, got {size}'
        )

    # Upper-triangle pairs in row-major order: the condensed layout that
    # squareform() expands into a symmetric matrix with a zero diagonal.
    distances = np.array(
        [
            similarity_function(feature_vectors[i], feature_vectors[j])
            for i in range(size)
            for j in range(i + 1, size)
        ],
        dtype=float,
    )

    # Min-max normalise to [0, 1]. If every pair has the same distance the
    # range is zero; use all-zero distances instead of dividing by zero.
    # (A NaN range is deliberately NOT caught here so it still surfaces.)
    min_dist = distances.min()
    rng = distances.max() - min_dist
    if rng == 0:
        distances = np.zeros_like(distances)
    else:
        distances = (distances - min_dist) / rng

    pairwise_distances = squareform(distances)

    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the old
    # keyword; try the new name first and fall back for older releases.
    common_kwargs = dict(n_clusters=num_clusters, linkage='average')
    try:
        model = AgglomerativeClustering(metric='precomputed', **common_kwargs)
    except TypeError:
        model = AgglomerativeClustering(affinity='precomputed', **common_kwargs)
    clustering = model.fit(pairwise_distances)

    clusters = [[] for _ in range(num_clusters)]
    for i, label in enumerate(clustering.labels_):
        clusters[label].append(i)

    return clusters

## Load the MobileNet Module

In [220]:
# TF Hub handle of the feature-vector model used to embed each image.
module_handle = 'https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/feature_vector/4'
# Load the model once; the feature-extraction cell calls `module` on each
# image batch of shape (1, 224, 224, 3).
module = hub.load(module_handle)

## Prepare the image list for feature calculation and similarity search

In [221]:
sim_names = []
sim_scores = []
# os.listdir() returns entries in arbitrary order; sort so that the
# index -> file mapping (and therefore cluster numbering) is reproducible
# from run to run.
fnames = sorted(os.listdir(search_dir))
feature_vecs = {}
# Load-bearing: feature vectors are written to the CSV through their string
# representation. Without an unlimited threshold NumPy abbreviates long arrays
# with "...", which cannot be parsed back into floats.
np.set_printoptions(threshold=sys.maxsize)

## Calculate and store feature vectors for all images in the search directory

In [222]:
# Maps a running integer index -> {'path': image path, 'features': 1-D vector}.
feature_vecs = {}

# Features are cached on disk: extraction only runs when the cache file does
# not exist yet. NOTE: a cache written before the scaling fix below holds
# features computed from unscaled pixels; delete it to regenerate.
if not exists(features_path):
    for i, fname in enumerate(tqdm(fnames)):
        impath = os.path.join(search_dir, fname)
        im = tf.io.read_file(impath)
        im = tf.io.decode_jpeg(im, channels=3)
        # Convert uint8 [0, 255] -> float32 [0, 1] BEFORE resizing. Resizing
        # already yields float32 in the 0-255 range, so converting afterwards
        # is a no-op and the model would receive unscaled pixel values.
        im = tf.image.convert_image_dtype(im, tf.float32)
        im = tf.image.resize_with_pad(im, 224, 224)
        # Add the batch axis: shape (1, 224, 224, 3).
        f = module(im[tf.newaxis, ...])
        f_set = np.squeeze(f)
        feature_vecs[i] = {'path': impath, 'features': f_set}

    # One row per image; the 'features' array is serialised via its string
    # form, which the loading cell parses back into floats.
    features_df = pd.DataFrame.from_dict(feature_vecs).transpose()
    features_df.to_csv(features_path, sep=',')

100%|██████████| 192/192 [00:20<00:00,  9.50it/s]


## Extract the Feature Vectors from the .csv File

In [223]:
features_df = pd.read_csv(features_path, delimiter=',')
feature_vectors = []
feature_vectors_map = {}

# Index the cached rows by image path once instead of scanning the whole
# DataFrame for every file. setdefault keeps the first row if a path repeats,
# matching the previous "first match" behaviour.
features_by_path = {}
for cached_path, raw_features in zip(features_df['path'], features_df['features']):
    features_by_path.setdefault(cached_path, raw_features)

for i, fname in enumerate(fnames):
    impath = os.path.join(search_dir, fname)
    if impath not in features_by_path:
        # Typically a stale cache: the directory changed after the CSV was written.
        raise KeyError(
            f'No cached features for {impath!r} in {features_path!r}; '
            'delete the cache file and re-run the feature extraction cell.'
        )
    # Stored form is "[v0 v1 ... vN]": drop the brackets, split on whitespace.
    fvec = [float(item) for item in features_by_path[impath][1:-1].split()]
    feature_vectors.append(fvec)
    # Position in feature_vectors -> file name, used to map cluster members back.
    feature_vectors_map[i] = fname

## Cluster Images by Feature Vectors

In [224]:
# Each entry of `clusters` is a list of indices into `feature_vectors`.
clusters = cluster_images(feature_vectors, similarity, num_clusters)
# cluster index -> file names assigned to that cluster.
clusters_map = defaultdict(list)

for cluster_idx, member_indices in enumerate(clusters):
    # Every cluster gets its own folder under the output root.
    cluster_path = os.path.join(clusters_base_path, f'cluster_{cluster_idx}')
    os.makedirs(cluster_path, exist_ok=True)
    for member_idx in member_indices:
        im_fname = feature_vectors_map[member_idx]
        # Copy (not move) so the source directory stays intact.
        shutil.copy(
            os.path.join(search_dir, im_fname),
            os.path.join(cluster_path, im_fname),
        )
        clusters_map[cluster_idx].append(im_fname)

