In [8]:
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np
from pathlib import Path
from PIL import Image, ImageOps
from concurrent.futures import ThreadPoolExecutor
import tensorflow as tf
from sklearn.cluster import DBSCAN
from collections import defaultdict
import shutil
from time import time
import matplotlib.pyplot as plt
# Wall-clock reference taken right after the imports; main() subtracts it
# from a later timestamp to report the total elapsed time.
total_start = time()
print("DEPENDENCIES LOADED")

DEPENDENCIES LOADED


In [9]:
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Pretrained CLIP model (moved onto the chosen device) and its matching
# preprocessor; both are read as globals by process_images().
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

print("MODEL LOADED")

MODEL LOADED


In [10]:
def load_and_process_image(path):
    """Load one image file and return it as a 224x224 float32 TensorFlow tensor.

    The file is decoded with PIL, converted to 3-channel RGB, and then cropped
    and resized to 224x224 with ``ImageOps.fit``.

    Args:
        path: Filesystem path of the image to load.

    Returns:
        A ``tf.Tensor`` of dtype ``float32`` with shape ``(224, 224, 3)`` that
        holds the raw pixel values; no rescaling or normalisation is applied
        here.
    """
    # The context manager releases the file handle once the pixels are decoded.
    with Image.open(path) as source:
        # Grayscale, palette and RGBA files would otherwise produce arrays with
        # a different channel layout than RGB ones, so force a uniform 3-channel
        # image before resizing.
        fitted = ImageOps.fit(source.convert("RGB"), (224, 224))

    # Go straight from PIL to a float32 array; the tensor values are the same
    # as with the former NumPy -> torch -> NumPy round trip.
    pixels = np.asarray(fitted, dtype=np.float32)
    return tf.constant(pixels, dtype=tf.float32)

In [11]:
def process_images(demo_directory, batch_size=32):
    """Embed every image in a directory with the CLIP image encoder.

    Args:
        demo_directory: ``pathlib.Path`` of the folder to scan (non-recursive).
            Regular files whose suffix is .jpg, .jpeg or .png
            (case-insensitive) are processed.
        batch_size: Number of images loaded and sent through the model per
            forward pass. Must be at least 1.

    Returns:
        A tuple ``(images_to_embeddings, images_to_paths)``. Both dicts are
        keyed by file stem and share the same key order; the first maps to the
        image's embedding (a NumPy array), the second to the image's path.
        If two files share a stem, the later one (in sorted path order) wins in
        both dicts.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    allowed_suffixes = ('.jpg', '.jpeg', '.png')
    # iterdir() yields entries in arbitrary order; sorting makes runs
    # reproducible. is_file() skips directories that merely look like images.
    image_paths = sorted(
        entry for entry in demo_directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in allowed_suffixes
    )

    images_to_embeddings = {}
    images_to_paths = {}

    # Images are decoded one batch at a time (in parallel threads) so that only
    # batch_size decoded images are held in memory at once.
    with ThreadPoolExecutor() as executor:
        for start in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[start:start + batch_size]
            batch = list(executor.map(load_and_process_image, batch_paths))

            inputs = processor(images=batch, return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                features = model.get_image_features(**inputs)

            # Pair each embedding with the path it was computed from. Zipping
            # against the keys of a stem-keyed dict would misalign every later
            # entry as soon as two files share a stem.
            for image_path, embedding in zip(batch_paths, features.cpu().numpy()):
                images_to_paths[image_path.stem] = image_path
                images_to_embeddings[image_path.stem] = embedding

    return images_to_embeddings, images_to_paths

In [12]:
def handle_duplicates(images_to_embeddings, images_to_paths, demo_directory, output_folder,
                      eps=3, min_samples=2):
    """Cluster image embeddings with DBSCAN and copy clustered images to a folder.

    Images whose embeddings fall into the same DBSCAN cluster are treated as
    duplicates of one another. Images labelled as noise (``-1``) are left alone.

    Args:
        images_to_embeddings: Dict mapping image id -> embedding vector.
        images_to_paths: Dict mapping image id -> source ``Path``.
        demo_directory: Not used by this function; kept so existing callers
            keep working.
        output_folder: ``Path`` of the directory that receives the copies. It
            is created (including missing parents) if it does not exist.
        eps: ``eps`` argument passed to ``DBSCAN``.
        min_samples: ``min_samples`` argument passed to ``DBSCAN``.

    Returns:
        None. Prints how many images were copied and where.
    """
    output_folder.mkdir(parents=True, exist_ok=True)

    image_id_communities = defaultdict(set)

    # np.stack raises on an empty sequence, so only cluster when there is at
    # least one embedding to work with.
    if images_to_embeddings:
        image_ids = list(images_to_embeddings.keys())
        embeddings = np.stack(list(images_to_embeddings.values()))
        clustering = DBSCAN(min_samples=min_samples, eps=eps).fit(embeddings)

        for image_id, cluster_idx in zip(image_ids, clustering.labels_):
            cluster_idx = int(cluster_idx)
            if cluster_idx != -1:  # -1 marks noise, i.e. no near neighbours
                image_id_communities[cluster_idx].add(image_id)

    # Only clusters with more than one member count as duplicates.
    duplicate_groups = [
        members for members in image_id_communities.values() if len(members) > 1
    ]

    for members in duplicate_groups:
        for image_id in members:
            # Files are copied, not moved (swap in shutil.move to move them).
            # Copies are always named "<id>.jpg", whatever the source suffix,
            # because Display_images() looks for "*.jpg" in this folder.
            shutil.copy(images_to_paths[image_id], output_folder / f"{image_id}.jpg")

    print(f"Total duplicate images moved: {sum(len(members) for members in duplicate_groups)}")
    print(f"Output folder: {output_folder}")

In [13]:
def Display_images(output_folder):
    """Show every ``*.jpg`` file in a folder, each in its own matplotlib figure.

    Args:
        output_folder: ``pathlib.Path`` of the directory to scan
            (non-recursive). Each figure is titled with the file's stem.

    Returns:
        None.
    """
    # glob() yields files in arbitrary order; sort so the display order is stable.
    for image_path in sorted(output_folder.glob("*.jpg")):
        # Close the file handle as soon as the figure has been rendered.
        with Image.open(image_path) as image:
            fig = plt.figure()
            plt.imshow(image)
            plt.title(image_path.stem)
            plt.axis('off')
            plt.show()
        # Release the figure explicitly so a large folder does not pile up
        # open figures.
        plt.close(fig)

In [14]:
def main(demo_directory=None, output_folder=None):
    """Run the duplicate-detection pipeline end to end and print timings.

    Args:
        demo_directory: Folder of images to scan. Defaults to ``../coil-100``.
        output_folder: Folder that receives the duplicate images. Defaults to
            ``../duplicates``.

    Returns:
        None. Prints the elapsed times and optionally displays the images
        found in ``output_folder``.
    """
    # Defaults reproduce the original hard-coded locations; pass other paths
    # to run on a different image set.
    demo_directory = Path("../coil-100") if demo_directory is None else Path(demo_directory)
    output_folder = Path("../duplicates") if output_folder is None else Path(output_folder)

    start_time = time()

    # Process images from the directory
    images_to_embeddings, images_to_paths = process_images(demo_directory)

    # Handle duplicates
    handle_duplicates(images_to_embeddings, images_to_paths, demo_directory, output_folder)

    end_time = time()
    print("TIme to compute duplicates: ", end_time-start_time, "Seconds")

    # total_start is the module-level timestamp set right after the imports.
    total_end = time()
    print("TOTAL TIME TAKEN AFTER IMPORTING FILES: ",total_end-total_start, "Seconds")

    dispimg = input("Do you want to display the images?")
    # Accept "y", "Y", "yes", and answers with stray whitespace.
    if dispimg.strip().lower() in ('y', 'yes'):
        Display_images(output_folder)

if __name__ == "__main__":
    main()

Total duplicate images moved: 7056
Output folder: ../duplicates
TIme to compute duplicates:  51.68691420555115 Seconds
TOTAL TIME TAKEN AFTER IMPORTING FILES:  55.212100982666016 Seconds
