In [16]:
import cv2
import imagehash
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt
import concurrent.futures

In [17]:
def preprocess_and_hash(image_path):
    img = cv2.imread(str(image_path))
    img = cv2.resize(img, (64, 64))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    equalized = cv2.equalizeHist(blurred)
    normalized = equalized / 255.0
    hash_value = imagehash.phash(Image.fromarray((normalized * 255).astype('uint8')))
    return image_path, hash_value


In [18]:
def find_duplicates_phash_parallel(folder_path, threshold=5):
    hash_dict = {}
    duplicate_groups = {}

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(preprocess_and_hash, image_path) for image_path in Path(folder_path).glob('*.*')]

        for future in concurrent.futures.as_completed(futures):
            image_path, hash_value = future.result()

            for existing_hash, existing_image_paths in hash_dict.items():
                if hash_value - existing_hash < threshold:
                    existing_image_paths.append(image_path)
                    duplicate_groups[hash_value] = existing_image_paths

            hash_dict[hash_value] = [image_path]

    return duplicate_groups

In [19]:
def display_all_duplicate_images(duplicate_groups):
    for hash_value, image_paths in duplicate_groups.items():
        num_images = len(image_paths)

        plt.figure(figsize=(5 * num_images, 5))
        for i, image_path in enumerate(image_paths, 1):
            img = cv2.imread(str(image_path))
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            plt.subplot(1, num_images, i)
            plt.imshow(img)
            plt.title(f'Image {i}: {image_path.name}')

        plt.show()
        
    print(f'Found {len(duplicate_groups)} groups of duplicate images')

In [20]:
if __name__ == "__main__":
    folder_path = "../California/Photos"
    duplicate_pairs = find_duplicates_phash_parallel(folder_path)

    if duplicate_pairs:
        display_all_duplicate_images(duplicate_pairs)
    else:
        print("No duplicate images found.")

error: OpenCV(4.9.0) /Users/xperience/GHA-OpenCV-Python2/_work/opencv-python/opencv-python/opencv/modules/imgproc/src/resize.cpp:4152: error: (-215:Assertion failed) !ssize.empty() in function 'resize'
