In [1]:
import os

import numpy as np

import fiftyone as fo
import fiftyone.zoo as foz



In [2]:
# Directory of images to de-duplicate. Set the DEDUP_DATASET_DIR environment
# variable to point the notebook at another folder without editing this cell;
# the default is the location this notebook was originally run against.
dataset_dir = os.environ.get(
    "DEDUP_DATASET_DIR",
    "/media/sashi/DATA1/pictures/ACG-extra/training_data/yama_no_susume/screenshots/S4/EP12/",
)
# Fail fast with a clear message instead of spending minutes on a bad path.
if not os.path.isdir(dataset_dir):
    raise FileNotFoundError(f"Image directory not found: {dataset_dir}")

dataset = fo.Dataset.from_dir(dataset_dir, dataset_type=fo.types.ImageDirectory)

# Embed every image with the zoo model; the later cells compare these
# embeddings pairwise to find near-duplicates.
model = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
embeddings = dataset.compute_embeddings(model)

# In the recorded run this printed (6008, 1280) for 6008 samples.
print(embeddings.shape)

 100% |███████████████| 6008/6008 [587.0ms elapsed, 0s remaining, 10.2K samples/s]      




 100% |███████████████| 6008/6008 [4.1m elapsed, 0s remaining, 25.3 samples/s]      
(6008, 1280)


In [3]:
from tqdm import tqdm

def mark_duplicate(subdataset, similarity_matrix, thresh=0.985):
    """Tag near-duplicate samples and report which ids to drop and which to keep.

    Samples are visited in iteration order. The first sample of a group of
    near-duplicates is kept (and tagged ``"has_duplicates"``); every other
    sample whose similarity to a kept sample exceeds ``thresh`` is tagged
    ``"duplicate"`` and reported for removal. Each sample also gets a
    ``"max_similarity"`` field holding its highest similarity to any *other*
    sample, and is saved once.

    Tags are only added, never removed, and are not added twice, so calling
    this again with the same inputs leaves the tags unchanged. Tags left by an
    earlier call with a different threshold are not cleared.

    Args:
        subdataset: collection of samples. It must be iterable, provide
            ``select_fields``, and yield samples exposing ``id``, ``tags``,
            item assignment and ``save()``.
        similarity_matrix: square array-like where row/column ``i`` belongs to
            the ``i``-th sample of ``subdataset``. The diagonal is assumed to
            hold the self-similarity value 1. The input is not modified.
        thresh: similarity strictly above this value counts as a duplicate.

    Returns:
        ``(samples_to_remove, samples_to_keep)`` as two sets of sample ids.

    Raises:
        ValueError: if ``similarity_matrix`` is not square or its size does
            not match the number of samples in ``subdataset``.
    """
    # Work on a float64 copy so the caller's matrix is left untouched.
    sim = np.array(similarity_matrix, dtype=np.float64)
    id_map = [s.id for s in subdataset.select_fields(["id"])]

    samples_to_remove = set()
    samples_to_keep = set()

    if sim.size == 0 and not id_map:
        # Nothing to compare.
        return samples_to_remove, samples_to_keep

    if sim.ndim != 2 or sim.shape[0] != sim.shape[1]:
        raise ValueError(
            f"similarity_matrix must be square, got shape {sim.shape}"
        )
    n = sim.shape[0]
    if len(id_map) != n:
        # A mismatch would silently attribute similarities to the wrong ids.
        raise ValueError(
            f"similarity_matrix is {n}x{n} but subdataset has {len(id_map)} samples"
        )

    # Cancel out self-similarity so a sample never matches itself; same effect
    # as subtracting the identity matrix, without allocating one.
    sim[np.diag_indices(n)] -= 1.0

    for idx, sample in tqdm(enumerate(subdataset), total=n):
        row = sim[idx]
        # Store a plain Python float rather than a numpy scalar.
        sample["max_similarity"] = float(row.max())

        if sample.id in samples_to_remove:
            tag = "duplicate"
        else:
            # Keep the first instance of a group of duplicates...
            samples_to_keep.add(sample.id)
            dup_idxs = np.flatnonzero(row > thresh)
            # ...and schedule every sample that matches it for removal.
            samples_to_remove.update(id_map[dup] for dup in dup_idxs)
            tag = "has_duplicates" if dup_idxs.size > 0 else None

        # Guard against piling up the same tag when the cell is re-run.
        if tag is not None and tag not in sample.tags:
            sample.tags.append(tag)
        sample.save()

    return samples_to_remove, samples_to_keep

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Largest number of samples compared against each other in one go; this bounds
# the size of the pairwise similarity matrix.
max_compare_size = 10000
# Cosine similarity above which two images are treated as duplicates.
thresh = 0.985

samples_to_remove = set()
samples_to_keep = set()

# NOTE: similarities are computed per chunk only, so two samples that fall
# into different chunks are never compared with each other.
total = len(embeddings)
for start in range(0, total, max_compare_size):
    stop = min(start + max_compare_size, total)
    chunk_similarity = cosine_similarity(embeddings[start:stop])
    chunk_remove, chunk_keep = mark_duplicate(
        dataset[start:stop], chunk_similarity, thresh
    )
    samples_to_remove |= chunk_remove
    samples_to_keep |= chunk_keep

6008it [00:02, 2072.36it/s]


In [5]:
# Open the FiftyOne App on the dataset so the "duplicate" / "has_duplicates"
# tags and the "max_similarity" field written above can be inspected before
# the next cell deletes anything.
session = fo.launch_app(dataset)



In [6]:
import os
# WARNING: irreversible. This deletes the duplicate image files from disk and
# then drops the corresponding samples from the dataset.
ids_to_delete = list(samples_to_remove)
for sample_id in tqdm(ids_to_delete):
    try:
        os.remove(dataset[sample_id].filepath)
    except FileNotFoundError:
        # The file is already gone (e.g. an earlier run was interrupted after
        # removing it); the goal is met, so carry on instead of aborting
        # before the dataset itself is cleaned up.
        pass
dataset.delete_samples(ids_to_delete)

100%|█████████████████████████████████████████████████████████| 5281/5281 [00:03<00:00, 1752.00it/s]
