In [4]:
import os
import shutil
import numpy as np
import imagehash
from PIL import Image
import tensorflow as tf
from collections import defaultdict
from deepface import DeepFace
from tqdm import tqdm

In [5]:
# Configure TensorFlow GPU memory before DeepFace loads any model.
# Memory growth makes TensorFlow allocate GPU memory on demand instead of
# reserving it all up front; it does not by itself force GPU placement.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    try:
        # TensorFlow requires the same memory-growth setting on every visible
        # GPU, so apply it to all of them rather than only the first one.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("✅ GPU is enabled for DeepFace!")
    except RuntimeError as e:
        # Raised when the GPUs were already initialized, e.g. when this cell
        # is re-run in a live kernel; report it instead of aborting the cell.
        print(f"GPU memory growth could not be changed: {e}")
else:
    print("No GPU detected by TensorFlow; computation will fall back to CPU.")

✅ GPU is enabled for DeepFace!


In [7]:
# Define input and output directories
INPUT_FOLDER = "./temp/input"
OUTPUT_FOLDER = "./temp/output"

# Create the output folder (and any missing parents). exist_ok=True keeps the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [8]:
# Function to calculate perceptual hash (PHash)
def calculate_phash(image_path):
    """Return the perceptual hash of an image as a string.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        The string form of ``imagehash.phash`` computed on the grayscale
        version of the image, or ``None`` if the image could not be opened
        or hashed (the error is printed, not raised).
    """
    try:
        # The context manager closes the underlying file handle promptly;
        # a bare Image.open() keeps it open until garbage collection, which
        # can exhaust file descriptors when hashing many images.
        with Image.open(image_path) as img:
            grayscale = img.convert("L")  # Convert to grayscale
            return str(imagehash.phash(grayscale))  # Generate hash
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop the run.
        print(f"Error hashing {image_path}: {e}")
        return None

In [9]:
# Step 1: Group images by PHash (fast pre-filtering)
image_hash_groups = defaultdict(list)  # Maps PHash string -> list of image paths

# os.listdir() returns entries in arbitrary order; sort them so the grouping
# (and anything that later depends on its iteration order) is reproducible.
# Only regular files with a supported image extension are kept, so a
# directory that happens to be named like an image is not counted.
image_paths = [
    os.path.join(INPUT_FOLDER, file)
    for file in sorted(os.listdir(INPUT_FOLDER))
    if file.lower().endswith((".jpg", ".jpeg", ".png"))
    and os.path.isfile(os.path.join(INPUT_FOLDER, file))
]

print(f"Processing {len(image_paths)} images to find distinct ones...")

for img_path in tqdm(image_paths, desc="Hashing images"):
    phash = calculate_phash(img_path)
    # calculate_phash returns None when the image could not be hashed;
    # such files are left out of every group.
    if phash is not None:
        image_hash_groups[phash].append(img_path)  # Group images by their hash

Processing 50 images to find distinct ones...


Hashing images: 100%|██████████| 50/50 [00:00<00:00, 263.67it/s]


In [10]:
# Step 2: Walk the hash groups and keep only images DeepFace does not match
# against an image that was already kept.
# NOTE(review): candidates are compared against *every* image kept so far, not
# only those in the same hash group, because unique_images is shared across
# groups. Confirm whether within-group comparison was the intent.
unique_images = []
checked_images = set()  # Paths that have already been through this loop

for candidates in tqdm(image_hash_groups.values(), desc="Verifying groups"):
    for candidate in candidates:
        if candidate in checked_images:
            continue  # Already handled earlier

        for known in unique_images:
            try:
                # DeepFace decides whether the two images match; a positive
                # verdict marks the candidate as a duplicate and ends the search.
                if DeepFace.verify(img1_path=candidate, img2_path=known, enforce_detection=False)["verified"]:
                    break
            except Exception as err:
                # A failed comparison is reported and treated as "no match";
                # the candidate is still compared against the remaining images.
                print(f"Error processing {candidate} -> {err}")
        else:
            # No kept image matched: record the candidate and copy it out.
            unique_images.append(candidate)
            destination = os.path.join(OUTPUT_FOLDER, os.path.basename(candidate))
            shutil.copy(candidate, destination)

        checked_images.add(candidate)

print(f"Processing complete. Found {len(unique_images)} distinct images.")
print(f"Distinct images saved in: {OUTPUT_FOLDER}")

Verifying groups: 100%|██████████| 44/44 [00:47<00:00,  1.08s/it]

Processing complete. Found 15 distinct images.
Distinct images saved in: ./temp/output



