In [98]:
import os
import cv2 as cv
import glob
from PIL import Image
import shutil
import numpy as np

In [99]:
FEATURES_DISTANCE = 0.3
MIN_MATCHES = 50
SCALE_PERCENT = 60

directory = "../My_Image"
destination_folder = "../duplicates"

In [100]:
def collect_imgs(directory):
    img_paths = glob.glob(os.path.join(directory, '*'))
    img_paths = [path for path in img_paths if path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
    return img_paths

In [101]:
def load_and_resize_image(path):
    img = Image.open(path)
    width, height = img.size
    img = img.resize((int(width * SCALE_PERCENT / 100), int(height * SCALE_PERCENT / 100)))
    img = cv.cvtColor(np.array(img), cv.COLOR_RGB2GRAY)
    return img

In [102]:
def detect_features(img_paths):
    sift = cv.SIFT_create()
    imgs = []
    for path in img_paths:
        img = load_and_resize_image(path)
        kp, des = sift.detectAndCompute(img, None)
        imgs.append({
            'f': img,
            'p': path,
            'kp': kp,
            'des': des
        })
    return imgs

In [103]:
def similarity_check(imgs):
    duplicates = set()
    for i1 in range(len(imgs)):
        for i2 in range(i1 + 1, len(imgs)):
            FLANN_INDEX_KDTREE = 1
            index_params = dict(
                algorithm = FLANN_INDEX_KDTREE,
                trees = 5
            )
            search_params = dict(checks=50)
            flann = cv.FlannBasedMatcher(index_params, search_params)
            matches = flann.knnMatch(imgs[i1]['des'], imgs[i2]['des'], k=2)
            matchesCount = 0
            for i,(m,n) in enumerate(matches):
                if m.distance < FEATURES_DISTANCE * n.distance:
                    matchesCount += 1
            if(matchesCount > MIN_MATCHES):
                print('[DUPLICATE FOUND]', imgs[i1]['p'], imgs[i2]['p'])
                # adds the lower resolution image to the deletion list
                h1, w1 = imgs[i1]['f'].shape[:2]
                h2, w2 = imgs[i2]['f'].shape[:2]
                duplicates.add(imgs[i2 if h1*w1 > h2*w2 else i1]['p'])  # Use add instead of append
    return duplicates


In [104]:
def copy_duplicates(duplicates, destination_folder):
	os.makedirs(destination_folder, exist_ok=True)
	for path in duplicates:
		filename = os.path.basename(path)
		destination_path = os.path.join(destination_folder, filename)
		shutil.copy2(path, destination_path)
		print('[COPIED]', path, 'to', destination_path)

In [105]:
def delete(duplicates):
	for path in duplicates:
		os.remove(path)
		print('[DELETED]', path)

In [106]:
def main():
    img_paths = collect_imgs(directory)
    imgs = detect_features(img_paths)
    duplicates = similarity_check(imgs)
    copy_duplicates(duplicates, destination_folder)

In [107]:
if __name__ == "__main__":
    main()

[DUPLICATE FOUND] ../My_Image/687 copy.jpg ../My_Image/686.jpg
[DUPLICATE FOUND] ../My_Image/687 copy.jpg ../My_Image/687.jpg
[DUPLICATE FOUND] ../My_Image/IMG_2239.png ../My_Image/IMG_223.png
[DUPLICATE FOUND] ../My_Image/686.jpg ../My_Image/687.jpg
[DUPLICATE FOUND] ../My_Image/682.jpg ../My_Image/681.jpg
[COPIED] ../My_Image/687 copy.jpg to ../duplicates/687 copy.jpg
[COPIED] ../My_Image/682.jpg to ../duplicates/682.jpg
[COPIED] ../My_Image/IMG_2239.png to ../duplicates/IMG_2239.png
[COPIED] ../My_Image/686.jpg to ../duplicates/686.jpg
