In [1]:
import numpy as np

import fiftyone as fo
import fiftyone.zoo as foz



Migrating database to v0.18.0


In [2]:
# Folder of screenshots that will be checked for near-duplicate images
dataset_dir = "/media/sashi/DATA1/pictures/ACG-extra/training_data/yamanosusume/screenshots/EP2/"

# Import the folder as a FiftyOne dataset of plain (unlabeled) images
dataset = fo.Dataset.from_dir(
    dataset_dir=dataset_dir,
    dataset_type=fo.types.ImageDirectory,
)

# Embed every sample with the "mobilenet-v2-imagenet-torch" model from the
# FiftyOne model zoo; the embeddings come back as a single array
model = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
embeddings = dataset.compute_embeddings(model)

print(embeddings.shape)

 100% |█████████████████| 941/941 [116.2ms elapsed, 0s remaining, 8.2K samples/s]   




 100% |█████████████████| 941/941 [41.8s elapsed, 0s remaining, 27.7 samples/s]      
(941, 1280)


In [35]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of embedding rows
similarity_matrix = cosine_similarity(embeddings)

# Subtract the identity matrix so that an image's similarity with itself
# (the diagonal) is not picked up later as its closest match
n = similarity_matrix.shape[0]
similarity_matrix = similarity_matrix - np.eye(n)

print(similarity_matrix.shape)

(1147, 1147)


In [36]:
# Sample IDs in dataset order: id_map[i] is the ID of the sample that row i
# of the similarity matrix is expected to describe
id_map = dataset.values("id")

# Rows are matched to samples purely by position. If the matrix was computed
# for a different dataset state (e.g. a stale kernel variable, or samples were
# deleted since), fail loudly instead of storing values on the wrong samples.
if len(id_map) != len(similarity_matrix):
    raise ValueError(
        f"similarity_matrix has {len(similarity_matrix)} rows but the dataset "
        f"has {len(id_map)} samples; recompute the embeddings and similarity matrix"
    )

# Highest similarity of each sample to any other entry in its row, converted
# to plain Python floats for storage
max_similarities = [float(row.max()) for row in similarity_matrix]

# Write the whole field in one bulk operation rather than saving sample by sample
if max_similarities:
    dataset.set_values("max_similarity", max_similarities)

In [46]:
# Open the FiftyOne App on the dataset; the returned session object is kept in
# `session` so it can be referenced from later cells
session = fo.launch_app(dataset)

In [45]:
# Two images whose similarity exceeds this value are treated as duplicates
thresh = 0.98

# Row `idx` of the similarity matrix and `id_map[idx]` must both refer to the
# idx-th sample of the dataset. That stops being true if samples were deleted
# (or the matrix is stale) and this cell is re-run, so verify it up front
# rather than tagging/removing the wrong images.
if not (len(dataset) == len(id_map) == len(similarity_matrix)):
    raise ValueError(
        f"dataset ({len(dataset)} samples), id_map ({len(id_map)} ids) and "
        f"similarity_matrix ({len(similarity_matrix)} rows) are out of sync; "
        "recompute the embeddings, similarity matrix and id_map first"
    )

# Drop tags left by a previous run so that re-running this cell (possibly with
# a different threshold) neither stacks repeated tags nor keeps stale ones
dataset.untag_samples(["has_duplicates", "duplicate"])

samples_to_remove = set()
samples_to_keep = set()
for idx, sample in enumerate(dataset):
    if sample.id in samples_to_remove:
        # An earlier sample that we kept already claimed this one as its duplicate
        sample.tags.append("duplicate")
        sample.save()
        continue

    # Keep the first instance of a group of duplicates
    samples_to_keep.add(sample.id)

    # We kept the first instance, so mark every other match for removal
    dup_idxs = np.where(similarity_matrix[idx] > thresh)[0]
    samples_to_remove.update(id_map[dup] for dup in dup_idxs)

    if len(dup_idxs) > 0:
        sample.tags.append("has_duplicates")
        sample.save()

In [31]:
import os

In [47]:
# WARNING: destructive. Permanently deletes the duplicate image files from
# disk, then removes the corresponding samples from the dataset.
for sample_id in samples_to_remove:
    filepath = dataset[sample_id].filepath
    try:
        os.remove(filepath)
    except FileNotFoundError:
        # The file is already gone (e.g. an earlier, interrupted run of this
        # cell). Keep going so the dataset still gets cleaned up below instead
        # of failing on the same missing file on every re-run.
        pass

dataset.delete_samples(list(samples_to_remove))