In [20]:
import cv2
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
from itertools import combinations

# Input image folders, written as paths relative to the notebook's working directory.
california_images = "../California/photos"  # used as folder_path by the SIFT driver cell
custom_images = "../My_Image"  # not referenced by any of the visible cells

# SIFT

In [21]:
def find_duplicates_sift(folder_path, threshold, output_folder, min_good_matches=5):
    """Flag near-duplicate images in a folder by matching SIFT descriptors.

    Every ``.jpg``/``.jpeg``/``.png`` file directly inside ``folder_path`` is
    loaded as grayscale and described with SIFT. Each pair of images is then
    compared with a FLANN-based matcher; a pair counts as duplicates when more
    than ``min_good_matches`` matches survive Lowe's ratio test. The second
    image of such a pair is copied (the source file is left in place) into
    ``output_folder``.

    Args:
        folder_path: Directory to scan (top level only, not recursive).
        threshold: Ratio used in Lowe's test; a match is kept when the best
            distance is below ``threshold`` times the second-best distance.
        output_folder: Directory receiving copies of the duplicates. It is
            created when missing, even if no duplicate is found.
        min_good_matches: A pair is a duplicate when it has strictly more
            good matches than this value.

    Returns:
        A tuple ``(duplicate_groups, duplicate_image_paths, total_images)``:
        ``duplicate_groups`` maps the first image of a pair to the list of
        images matched against it, ``duplicate_image_paths`` is the set of
        images that were copied, and ``total_images`` is the number of images
        for which descriptors could be computed.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    # Sorted so that which image of a pair is treated as the "original" does
    # not depend on the order in which the filesystem lists the files.
    candidate_paths = sorted(
        path for path in Path(folder_path).glob('*.*')
        if path.suffix.lower() in image_extensions
    )

    os.makedirs(output_folder, exist_ok=True)

    images_dict = {}  # image path -> SIFT descriptors
    duplicate_groups = {}
    duplicate_image_paths = set()
    if not candidate_paths:
        # Nothing to describe or compare.
        return duplicate_groups, duplicate_image_paths, 0

    # One detector is enough; it keeps no per-image state we rely on.
    sift = cv2.SIFT_create()
    for image_path in candidate_paths:
        img = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Unable to read image: {image_path}")
            continue

        _, descriptors = sift.detectAndCompute(img, None)
        if descriptors is None:
            print(f"No descriptors found for image: {image_path}")
            continue

        images_dict[image_path] = descriptors
    total_images = len(images_dict)

    # algorithm=1 selects FLANN's KD-tree index.
    flann_params = dict(algorithm=1, trees=5)
    matcher = cv2.FlannBasedMatcher(flann_params, {})

    for (path_a, desc_a), (path_b, desc_b) in combinations(images_dict.items(), 2):
        try:
            matches = matcher.knnMatch(desc_a, desc_b, k=2)
        except cv2.error as e:
            print(f"Error matching descriptors for {path_a} and {path_b}: {e}")
            continue

        # Lowe's ratio test. knnMatch can return fewer than two neighbours for
        # a query descriptor (e.g. when the other image has a single
        # descriptor); such entries cannot be ratio-tested and are skipped.
        good_matches = [
            pair[0] for pair in matches
            if len(pair) == 2 and pair[0].distance < threshold * pair[1].distance
        ]

        if len(good_matches) > min_good_matches:
            duplicate_groups.setdefault(path_a, []).append(path_b)
            if path_b not in duplicate_image_paths:
                # Record only the duplicate, not the image it was matched
                # against, and copy it once even if it matches several images.
                duplicate_image_paths.add(path_b)
                shutil.copy(path_b, os.path.join(output_folder, os.path.basename(path_b)))

    return duplicate_groups, duplicate_image_paths, total_images


# SURF / ORB (an alternative to SURF)

## SURF is currently not open source (it is only available in OpenCV builds that include the non-free contrib modules)

In [22]:
def find_duplicates_surf(folder_path, threshold=0.7, output_folder="duplicates", min_good_matches=10):
    """Flag near-duplicate images in a folder by matching SURF descriptors.

    Every ``.jpg``/``.jpeg``/``.png`` file directly inside ``folder_path`` is
    loaded as grayscale and described with SURF. Each pair of images is then
    compared with a FLANN-based matcher; a pair counts as duplicates when more
    than ``min_good_matches`` matches survive Lowe's ratio test. The second
    image of such a pair is MOVED into ``folder_path/output_folder``.

    Args:
        folder_path: Directory to scan (top level only, not recursive).
        threshold: Ratio used in Lowe's test; a match is kept when the best
            distance is below ``threshold`` times the second-best distance.
        output_folder: Name of the sub-directory of ``folder_path`` that
            receives the duplicates. It is created when missing.
        min_good_matches: A pair is a duplicate when it has strictly more
            good matches than this value.

    Returns:
        A tuple ``(duplicate_groups, duplicate_image_paths)``:
        ``duplicate_groups`` maps the first image of a pair to the list of
        images matched against it, and ``duplicate_image_paths`` is the set of
        original paths of the images that were moved.

    Note:
        Relies on ``cv2.xfeatures2d.SURF_create``; presumably this needs an
        OpenCV build with the non-free contrib modules - verify against the
        installed package.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    # Sorted so that which image of a pair is treated as the "original" does
    # not depend on the order in which the filesystem lists the files.
    candidate_paths = sorted(
        path for path in Path(folder_path).glob('*.*')
        if path.suffix.lower() in image_extensions
    )

    output_dir = os.path.join(folder_path, output_folder)
    os.makedirs(output_dir, exist_ok=True)

    images_dict = {}  # image path -> SURF descriptors
    duplicate_groups = {}
    duplicate_image_paths = set()
    if not candidate_paths:
        # Nothing to describe or compare.
        return duplicate_groups, duplicate_image_paths

    # One detector is enough; it keeps no per-image state we rely on.
    surf = cv2.xfeatures2d.SURF_create()
    for image_path in candidate_paths:
        img = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # imread signals an unreadable file by returning None.
            print(f"Unable to read image: {image_path}")
            continue

        _, descriptors = surf.detectAndCompute(img, None)
        if descriptors is None:
            # No keypoints means nothing to match against.
            print(f"No descriptors found for image: {image_path}")
            continue

        images_dict[image_path] = descriptors

    # algorithm=1 selects FLANN's KD-tree index.
    flann_params = dict(algorithm=1, trees=5)
    matcher = cv2.FlannBasedMatcher(flann_params, {})

    for (path_a, desc_a), (path_b, desc_b) in combinations(images_dict.items(), 2):
        try:
            matches = matcher.knnMatch(desc_a, desc_b, k=2)
        except cv2.error as e:
            print(f"Error matching descriptors for {path_a} and {path_b}: {e}")
            continue

        # Lowe's ratio test. knnMatch can return fewer than two neighbours for
        # a query descriptor; such entries cannot be ratio-tested and are
        # skipped instead of breaking the tuple unpacking.
        good_matches = [
            pair[0] for pair in matches
            if len(pair) == 2 and pair[0].distance < threshold * pair[1].distance
        ]

        if len(good_matches) > min_good_matches:
            duplicate_groups.setdefault(path_a, []).append(path_b)
            if path_b not in duplicate_image_paths:
                # An image can match several others, but it can only be moved
                # once: a second move would fail because the source is gone.
                duplicate_image_paths.add(path_b)
                shutil.move(str(path_b), os.path.join(output_dir, os.path.basename(path_b)))

    return duplicate_groups, duplicate_image_paths


In [23]:
import matplotlib.image as mpimg

def display_all_duplicate_images(folder_path):
    """Show every image stored directly inside ``folder_path``, one figure each.

    Only regular files with a ``.jpg``/``.jpeg``/``.png`` extension are
    displayed, in sorted file-name order; other files (for example hidden
    files created by the operating system) are skipped instead of being
    handed to the image reader. Each figure is titled with its file name.

    Args:
        folder_path: Directory whose images should be displayed.

    Returns:
        None.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    image_files = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
        and os.path.splitext(name)[1].lower() in image_extensions
    )

    for image_file in image_files:
        img_data = mpimg.imread(os.path.join(folder_path, image_file))
        fig = plt.figure()
        plt.imshow(img_data)
        plt.title(image_file)
        plt.show()
        # Release the figure so a large folder does not pile up open figures.
        plt.close(fig)

In [24]:
if __name__ == "__main__":
    # Run configuration: which folder to scan, how permissive the ratio test
    # is, and where copies of the detected duplicates are written.
    folder_path = california_images
    output_folder = "../duplicates"
    threshold = 0.9

    duplicate_pairs, duplicate_image_paths, total_images = find_duplicates_sift(
        folder_path=folder_path,
        threshold=threshold,
        output_folder=output_folder,
    )

    # Summary of the run, followed by a visual check of what was flagged.
    print(f'Total images found: {total_images}')
    print(f'Found {len(duplicate_image_paths)} individual duplicate images')
    display_all_duplicate_images(folder_path=output_folder)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10565ea90>>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
