In [1]:
import os
from imagededup.methods import PHash
from pathlib import Path

In [2]:
def _collect_duplicate_pairs(duplicates):
    """Flatten a ``{image: [duplicate, ...]}`` mapping into unique pairs.

    Args:
        duplicates: Mapping from an image name to the list of image names
            reported as its duplicates (the list may be empty).

    Returns:
        A list of ``(image, duplicate)`` tuples. The first image seen in a
        group is paired with each of its duplicates; images that have already
        been reported are skipped, so a group is never listed twice.
    """
    duplicate_pairs = []
    processed_files = set()
    for img, duplicates_list in duplicates.items():
        # Skip images without duplicates and images already reported as the
        # duplicate of an earlier one (the mapping lists every pair twice).
        if not duplicates_list or img in processed_files:
            continue
        for dup_img in duplicates_list:
            if dup_img not in processed_files:
                duplicate_pairs.append((img, dup_img))
                processed_files.add(dup_img)
        processed_files.add(img)
    return duplicate_pairs


def _report_duplicates_in_directory(hasher, directory):
    """Hash the images under one directory and print its duplicate pairs.

    Args:
        hasher: Object providing ``encode_images`` and ``find_duplicates``
            (a ``PHash`` instance in this notebook).
        directory: Path of the directory to scan; sub-directories are
            included because ``recursive=True`` is passed to the hasher.
    """
    if not Path(directory).exists():
        print(f"Directory {directory} does not exist.")
        return
    print(f"Processing directory: {directory}")

    # Compute hashes for images in the directory
    encodings = hasher.encode_images(image_dir=directory, recursive=True)
    if not encodings:
        print("No images found in the specified directory.")
        return

    # NOTE(review): this count is the number of encodings returned, which can
    # be lower than the number of files scanned -- presumably files that
    # could not be hashed are dropped; confirm against imagededup.
    print(f"Found {len(encodings)} images. Computing duplicates...")

    # A distance threshold of 0 keeps only images whose hashes are identical.
    duplicates = hasher.find_duplicates(encoding_map=encodings, max_distance_threshold=0)
    duplicate_pairs = _collect_duplicate_pairs(duplicates)

    # Output results
    if duplicate_pairs:
        print("\nFound the following exact duplicate image pairs:")
        for img1, img2 in duplicate_pairs:
            print(f"Duplicate pair: {img1} <-> {img2}")
        print(f"Total duplicate pairs found: {len(duplicate_pairs)}")
    else:
        print("\nNo exact duplicates found.")


def find_exact_duplicates(directories):
    """Print the duplicate image pairs found in each given directory.

    Every directory is scanned on its own: images are hashed with ``PHash``
    and two images are reported as duplicates when their hashes are identical
    (``max_distance_threshold=0``). Identical hashes do not necessarily mean
    byte-identical files.

    Previously only ``directories[0]`` was processed, so extra entries were
    silently ignored and an empty list raised ``IndexError``. All entries are
    now processed in order; a missing directory is reported and skipped.

    Args:
        directories: A list (or other iterable) of directory paths. A single
            ``str`` / ``os.PathLike`` is also accepted and treated as a
            one-element list.

    Returns:
        None. Results are written to stdout.
    """
    # A bare path would otherwise be indexed/iterated character by character.
    if isinstance(directories, (str, os.PathLike)):
        directories = [directories]

    directories = list(directories)
    if not directories:
        print("No directories provided.")
        return

    hasher = PHash()
    for directory in directories:
        _report_duplicates_in_directory(hasher, directory)

In [3]:
if __name__ == "__main__":
    # Folder(s) holding the images to check for duplicates.
    directories = [r"D:\research dataset\cloth"]
    find_exact_duplicates(directories)

2025-05-20 02:39:25,198: INFO Start: Calculating hashes...


Processing directory: D:\research dataset\cloth


100%|████████████████████████████████████████████████████████████████████████████████| 754/754 [00:24<00:00, 30.55it/s]
2025-05-20 02:39:54,578: INFO End: Calculating hashes!
2025-05-20 02:39:54,581: INFO Start: Evaluating hamming distances for getting duplicates
2025-05-20 02:39:54,582: INFO Start: Retrieving duplicates using BKTree algorithm


Found 705 images. Computing duplicates...


100%|████████████████████████████████████████████████████████████████████████████████| 705/705 [00:17<00:00, 40.11it/s]
2025-05-20 02:40:16,462: INFO End: Retrieving duplicates using BKTree algorithm
2025-05-20 02:40:16,464: INFO End: Evaluating hamming distances for getting duplicates



Found the following exact duplicate image pairs:
Duplicate pair: 00017.png <-> 00019.png
Duplicate pair: 00026.png <-> 00033.png
Duplicate pair: 00031.png <-> 00032.png
Duplicate pair: 00331.png <-> 00332.png
Total duplicate pairs found: 4
