In [None]:
import cv2
import imagehash
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import shutil
import os

# Folder scanned for near-duplicate photos; the driver cell passes it to
# find_duplicates_phash as folder_path. Relative to the notebook's working directory.
california_images = "../California/Photos_copy"
# Second image folder. Not referenced by the other visible cells — presumably
# swapped in by hand for folder_path; TODO confirm it is still needed.
custom_images = "../My_Image"

In [None]:
def preprocess_image(image_path):
    """Load an image and normalise it for perceptual hashing.

    Pipeline: read -> resize to 64x64 -> grayscale -> 5x5 Gaussian blur ->
    histogram equalisation -> scale pixel values to [0, 1].

    Args:
        image_path: Location of the image file (``str`` or path-like object).

    Returns:
        A 64x64 float array with values in [0, 1], or ``None`` if the file
        could not be read or any processing step failed. In the failure case
        a message is printed and no exception is raised (best-effort contract).
    """
    try:
        # Coerce to str so pathlib.Path inputs work on OpenCV builds that
        # only accept plain strings.
        img = cv2.imread(str(image_path))

        # cv2.imread reports a missing/corrupt file by returning None rather
        # than raising; check here instead of letting cv2.resize fail with an
        # unrelated-looking assertion error.
        if img is None:
            print(f"Error processing image {image_path}: file is missing or not a readable image")
            return None

        # Fixed small size so every image is compared at the same scale.
        img = cv2.resize(img, (64, 64))

        # Colour is discarded; only luminance structure is kept.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Slight blur to suppress noise and compression artefacts.
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)

        # Histogram equalisation to reduce brightness/contrast differences.
        equalized = cv2.equalizeHist(blurred)

        # Scale 0..255 integer levels to floats in [0, 1].
        return equalized / 255.0

    except Exception as e:
        # Deliberately broad: one bad file must not abort a whole folder scan.
        print(f"Error processing image {image_path}: {e}")
        return None

In [None]:
def perceptual_hash(image):
    """Compute the perceptual hash (pHash) of an image file.

    Args:
        image: Path of the image file; forwarded to ``preprocess_image``.

    Returns:
        The hash object produced by ``imagehash.phash`` for the preprocessed
        image, or ``None`` when ``preprocess_image`` could not process the
        file (it returns ``None`` in that case).
    """
    processed_image = preprocess_image(image)

    # Preprocessing signals failure with None; propagate that instead of
    # crashing on `None * 255`.
    if processed_image is None:
        return None

    # Back to 8-bit levels for PIL. Round before casting: astype('uint8')
    # truncates, so floating-point round-off could otherwise drop a level.
    pixels = np.rint(processed_image * 255).astype('uint8')

    return imagehash.phash(Image.fromarray(pixels))

In [None]:
def find_duplicates_phash(folder_path, threshold=5, extensions=('.jpg', '.jpeg', '.png')):
    """Group near-duplicate images in a folder by perceptual-hash distance.

    Each image is compared against the representative hash of every group
    found so far (the hash of the group's first image). It joins the closest
    group whose distance is strictly below ``threshold``; otherwise it starts
    a new group. Images are compared to representatives only, so membership
    is not chained transitively through other members.

    Args:
        folder_path: Directory to scan (non-recursive).
        threshold: Exclusive upper bound on the hash distance
            (``hash_a - hash_b``) for two images to count as duplicates.
        extensions: Collection of lowercase file suffixes (with leading dot)
            to consider. Matching is case-insensitive on the file side.

    Returns:
        ``(duplicate_groups, total_images)`` where ``duplicate_groups`` maps a
        representative hash to the list of ``Path`` objects in that group
        (only groups with at least two images are included), and
        ``total_images`` is the number of files with a matching suffix,
        including any that could not be hashed.
    """
    groups = {}  # representative hash -> paths assigned to that group
    total_images = 0

    # Sorted so the visiting order (and therefore the grouping) is reproducible;
    # glob order is otherwise filesystem-dependent.
    for image_path in sorted(Path(folder_path).glob('*.*')):
        if image_path.suffix.lower() not in extensions:
            continue

        total_images += 1
        hash_value = perceptual_hash(str(image_path))

        # Unreadable images cannot be compared; they are counted but not grouped.
        if hash_value is None:
            continue

        # Find the closest representative strictly within the threshold.
        # Ties keep the earliest group because the comparison is strict.
        best_hash, best_distance = None, threshold
        for existing_hash in groups:
            distance = hash_value - existing_hash
            if distance < best_distance:
                best_hash, best_distance = existing_hash, distance

        if best_hash is None:
            groups[hash_value] = [image_path]
        else:
            groups[best_hash].append(image_path)

    # Only groups with more than one member are actual duplicate sets.
    duplicate_groups = {
        rep_hash: paths for rep_hash, paths in groups.items() if len(paths) > 1
    }
    return duplicate_groups, total_images

In [None]:
def display_unique_duplicates(duplicate_groups):
    """Render each duplicate group as one row of side-by-side images.

    Args:
        duplicate_groups: Mapping whose values are lists of image paths
            (objects with a ``.name`` attribute, e.g. ``pathlib.Path``).
            The keys are not used.

    One matplotlib figure is created and shown per group. Images that OpenCV
    cannot load are reported with a printed message and their subplot slot is
    left empty.
    """
    for _, group_paths in duplicate_groups.items():
        count = len(group_paths)

        # One wide figure per group: 5 inches of width for every member.
        plt.figure(figsize=(5 * count, 5))

        position = 0
        for member in group_paths:
            position += 1  # subplot indices are 1-based
            bgr = cv2.imread(str(member))

            # cv2.imread returns None for files it cannot read.
            if bgr is None:
                print(f"Error: Failed to load image {member.name}")
                continue

            # OpenCV loads BGR; matplotlib expects RGB.
            rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

            plt.subplot(1, count, position)
            plt.imshow(rgb)
            plt.title(f'Image {position}: {member.name}')

        # Show the whole group at once.
        plt.show()

In [None]:
if __name__ == "__main__":
    # Scan the configured photo folder for near-duplicate images.
    folder_path = california_images
    duplicate_groups, total_images = find_duplicates_phash(folder_path)
    duplicates_folder = "../duplicates"  # not used anywhere in this cell

    if not duplicate_groups:
        print("No duplicate images found.")
    else:
        print(f"Found {len(duplicate_groups)} groups of duplicate images")

        # Every member of a group beyond the first counts as a duplicate.
        duplicate_images_count = sum(
            len(images) - 1 for images in duplicate_groups.values()
        )

        print(f"Total number of images: {total_images}")
        print(f"Number of unique images: {total_images - duplicate_images_count}")
        print(f"Number of duplicate images: {duplicate_images_count}")

        # Show each group as a row of images for visual confirmation.
        display_unique_duplicates(duplicate_groups)