In [17]:
import os
import shutil
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

import numpy as np
import imagehash
from PIL import Image, UnidentifiedImageError
from deepface import DeepFace
from tqdm import tqdm
import cv2
import tensorflow as tf

In [18]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# This does not force GPU execution; it only changes how memory is allocated
# on the GPUs that TensorFlow can already see.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Memory growth has to be configured identically on every visible GPU
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("✅ GPU is enabled for DeepFace!")
    except RuntimeError as e:
        # Raised when the GPUs were already initialized before this cell ran
        print(f"Could not enable GPU memory growth: {e}")
else:
    print("No GPU detected; DeepFace will run on CPU.")

✅ GPU is enabled for DeepFace!


In [19]:
# Define input and output directories
INPUT_FOLDER = "./temp/input"    # Images to de-duplicate are read from here
OUTPUT_FOLDER = "./temp/output"  # Distinct images are copied here

# Ensure output folder exists (exist_ok avoids the check-then-create race)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [20]:
# Function to calculate perceptual hash (PHash)
def calculate_phash(image_path):
    """Compute the perceptual hash of an image file.

    Args:
        image_path: Path of the image to hash.

    Returns:
        The string form of ``imagehash.phash`` for the grayscale image, or
        ``None`` when the file cannot be opened or hashed (a message is
        printed in that case).
    """
    try:
        # The context manager closes the underlying file handle as soon as
        # hashing is done instead of leaving it to the garbage collector.
        with Image.open(image_path) as img:
            grayscale = img.convert("L")  # Convert to grayscale
            return str(imagehash.phash(grayscale))  # Generate hash
    except UnidentifiedImageError:
        print(f"Skipping {image_path}: Unreadable image format.")
        return None
    except Exception as e:
        print(f"Error hashing {image_path}: {e}")
        return None

In [6]:
# Step 1 (setup only): collect the candidate image paths.
# NOTE: this cell is repeated further down, where the hashing loop also fills
# image_hash_groups. Keep the two in sync, or drop this one.
image_hash_groups = defaultdict(list)  # Dictionary to store images per hash

# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order stable across runs and platforms.
image_paths = [
    os.path.join(INPUT_FOLDER, file) for file in sorted(os.listdir(INPUT_FOLDER))
    if file.lower().endswith((".jpg", ".jpeg", ".png"))
]

print(f"Processing {len(image_paths)} images to find distinct ones...")

Processing 50 images to find distinct ones...


In [25]:
# Function to verify duplicates within a group using DeepFace
def _faces_match(img_path, candidate_path):
    """Return True when DeepFace.verify reports the two images as verified.

    A failed comparison is reported and treated as "no match" so one bad pair
    does not abort the whole group.
    """
    try:
        # enforce_detection=False lets the call proceed even when no face is found
        result = DeepFace.verify(img1_path=img_path, img2_path=candidate_path, enforce_detection=False)
    except Exception as e:
        print(f"DeepFace comparison failed for {img_path} vs {candidate_path}: {e}")
        return False
    return bool(result.get("verified", False))


def verify_duplicates_in_group(group, unique_images, lock):
    """Add the images of ``group`` that match no already-accepted image.

    Each readable image is compared against the entries of ``unique_images``.
    An image that matches none of them is appended to ``unique_images`` and
    copied into ``OUTPUT_FOLDER``.

    Args:
        group: Iterable of image paths to examine.
        unique_images: Shared list of accepted image paths; appended to in
            place. Assumed to be append-only while workers are running.
        lock: Lock guarding ``unique_images``; must support ``with``.

    Returns:
        None. Results are reported through ``unique_images`` and the copied
        files.
    """
    for img_path in group:
        # Check if OpenCV can read the image
        if cv2.imread(img_path) is None:
            print(f"Skipping {img_path}: Corrupted image.")
            continue

        checked = 0  # Number of leading entries of unique_images already compared
        while True:
            with lock:
                # Entries other workers appended since the last snapshot
                pending = unique_images[checked:]
                if not pending:
                    # Nothing is left unchecked, so accepting here is atomic
                    # with respect to the other workers: two mutually
                    # duplicate images can no longer both be accepted.
                    unique_images.append(img_path)
                    shutil.copy(img_path, os.path.join(OUTPUT_FOLDER, os.path.basename(img_path)))
                    break

            # The expensive comparisons run outside the lock so groups still
            # proceed in parallel.
            if any(_faces_match(img_path, known) for known in pending):
                break  # Duplicate of an accepted image
            checked += len(pending)

In [26]:
# Step 1: Group images by PHash (fast pre-filtering)
image_hash_groups = defaultdict(list)  # Dictionary to store images per hash

# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order stable across runs and platforms.
image_paths = [
    os.path.join(INPUT_FOLDER, file) for file in sorted(os.listdir(INPUT_FOLDER))
    if file.lower().endswith((".jpg", ".jpeg", ".png"))
]

print(f"Processing {len(image_paths)} images to find distinct ones...")

for img_path in tqdm(image_paths, desc="Hashing images"):
    phash = calculate_phash(img_path)
    # calculate_phash returns None for files it could not read or hash
    if phash is not None:
        image_hash_groups[phash].append(img_path)  # Group images by their hash

Processing 50 images to find distinct ones...


Hashing images: 100%|██████████| 50/50 [00:00<00:00, 460.83it/s]


In [28]:
# Step 2: Verify duplicates within each hash group using multithreading
unique_images = []  # Shared list for unique images
lock = Lock()  # Guards access to the shared unique_images list

with ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(verify_duplicates_in_group, group, unique_images, lock)
        for group in image_hash_groups.values()
    ]

    # as_completed advances the bar as each group finishes rather than in
    # submission order; result() re-raises any exception from the worker.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Verifying groups"):
        future.result()

print(f"Processing complete. Found {len(unique_images)} distinct images.")
print(f"Distinct images saved in: {OUTPUT_FOLDER}")

Verifying groups: 100%|██████████| 44/44 [00:25<00:00,  1.73it/s]

Processing complete. Found 17 distinct images.
Distinct images saved in: ./temp/output



