In [None]:
import os
import cv2
import glob
from PIL import Image
import shutil
import numpy as np
import matplotlib.pyplot as plt
import time

In [None]:
FEATURES_DISTANCE = 0.3
MIN_MATCHES = 50
SCALE_PERCENT = 60

directory_personal = "../My_Image"
directory_cal = "../California/Photos"
destination_folder = "../duplicates"

In [None]:
def collect_imgs(directory):
    img_paths = glob.glob(os.path.join(directory, '*'))
    img_paths = [path for path in img_paths if path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
    return img_paths

In [None]:
def load_and_resize_image(path):
    img = Image.open(path)
    width, height = img.size
    img = img.resize((int(width * SCALE_PERCENT / 100), int(height * SCALE_PERCENT / 100)))
    img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
    return img

In [None]:
def detect_features(img_paths):
    sift = cv2.SIFT_create()
    imgs = []
    for path in img_paths:
        img = load_and_resize_image(path)
        kp, des = sift.detectAndCompute(img, None)
        imgs.append({
            'f': img,
            'p': path,
            'kp': kp,
            'des': des
        })
    return imgs

In [None]:
def similarity_check(imgs):
    duplicates = set()
    for i1 in range(len(imgs)):
        for i2 in range(i1 + 1, len(imgs)):
            FLANN_INDEX_KDTREE = 1
            index_params = dict(
                algorithm = FLANN_INDEX_KDTREE,
                trees = 5
            )
            search_params = dict(checks=50)
            flann = cv2.FlannBasedMatcher(index_params, search_params)
            matches = flann.knnMatch(imgs[i1]['des'], imgs[i2]['des'], k=2)
            matchesCount = 0
            for i,(m,n) in enumerate(matches):
                if m.distance < FEATURES_DISTANCE * n.distance:
                    matchesCount += 1
            if(matchesCount > MIN_MATCHES):
                print('[DUPLICATE FOUND]', imgs[i1]['p'], imgs[i2]['p'])
                # adds the lower resolution image to the deletion list
                h1, w1 = imgs[i1]['f'].shape[:2]
                h2, w2 = imgs[i2]['f'].shape[:2]
                duplicates.add(imgs[i2 if h1*w1 > h2*w2 else i1]['p'])  # Use add instead of append
    return duplicates


In [None]:
def copy_duplicates(duplicates, destination_folder):
	os.makedirs(destination_folder, exist_ok=True)
	for path in duplicates:
		filename = os.path.basename(path)
		destination_path = os.path.join(destination_folder, filename)
		shutil.copy2(path, destination_path)
		print('[COPIED]', path, 'to', destination_path)

In [None]:
def delete(duplicates):
	for path in duplicates:
		os.remove(path)
		print('[DELETED]', path)

In [None]:
def display_images(duplicates):
    fig = plt.figure(figsize=(10, 10))
    columns = 3
    rows = len(duplicates) // columns + (len(duplicates) % columns > 0)
    for i, img_path in enumerate(duplicates):
        img = Image.open(img_path)
        fig.add_subplot(rows, columns, i+1)
        plt.imshow(img)
        plt.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
def main():
    start_time = time.time()

    img_paths = collect_imgs(directory_cal)
    imgs = detect_features(img_paths)
    duplicates = similarity_check(imgs)
    copy_duplicates(duplicates, destination_folder)

    end_time = time.time()

    print('Time taken (ms):', end_time - start_time)
    # display_images(duplicates)

In [36]:
if __name__ == "__main__":
    main()

[DUPLICATE FOUND] ../California/Photos/471.jpg ../California/Photos/470.jpg
[DUPLICATE FOUND] ../California/Photos/471.jpg ../California/Photos/467.jpg


KeyboardInterrupt: 