In [36]:
import os
import cv2 as cv
import shutil

In [37]:
FEATURES_DISTANCE = 0.3
MIN_MATCHES = 50
def collect_imgs(directory):
	imgs = []
	for file in os.listdir(directory):
		if(file.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))):
			path = os.path.join(directory, file)
			imgs.append({
				'f': cv.imread(path, cv.IMREAD_GRAYSCALE),
				'p': path
			});
	return imgs

In [38]:
def detect_features(imgs):
	sift = cv.SIFT_create()
	for img in imgs:
		img['kp'], img['des'] = sift.detectAndCompute(img['f'], None)
	return imgs

In [39]:
def similarity_check(imgs):
	duplicates = []
	for i1 in range(len(imgs)):
		for i2 in range(i1 + 1, len(imgs)):
			FLANN_INDEX_KDTREE = 1
			index_params = dict(
				algorithm = FLANN_INDEX_KDTREE,
				trees = 5
			)
			search_params = dict(checks=50)
			flann = cv.FlannBasedMatcher(index_params, search_params)
			matches = flann.knnMatch(imgs[i1]['des'], imgs[i2]['des'], k=2)
			matchesCount = 0
			for i,(m,n) in enumerate(matches):
				if m.distance < FEATURES_DISTANCE * n.distance:
					matchesCount += 1
			if(matchesCount > MIN_MATCHES):
				print('[DUPLICATE FOUND]', imgs[i1]['p'], imgs[i2]['p'])
				# adds the lower resolution image to the deletion list
				h1, w1 = imgs[i1]['f'].shape[:2]
				h2, w2 = imgs[i2]['f'].shape[:2]
				duplicates.append(imgs[i2 if h1*w1 > h2*w2 else i1]['p'])
	return duplicates

In [40]:
def copy_duplicates(duplicates, destination_folder):
	os.makedirs(destination_folder, exist_ok=True)
	for path in duplicates:
		filename = os.path.basename(path)
		destination_path = os.path.join(destination_folder, filename)
		shutil.copy2(path, destination_path)
		print('[COPIED]', path, 'to', destination_path)

In [41]:
def delete(duplicates):

	for path in duplicates:
		os.remove(path)
		print('[DELETED]', path)

In [42]:
def main():
	directory = "../My_Image"
	destination_folder = "../duplicates"
	imgs = collect_imgs(directory)
	imgs = detect_features(imgs)
	duplicates = similarity_check(imgs)
	copy_duplicates(duplicates, destination_folder)
if __name__ == "__main__":
	main()

[DUPLICATE FOUND] ../My_Image/IMG_2239.png ../My_Image/IMG_223.png
[DUPLICATE FOUND] ../My_Image/686.jpg ../My_Image/687.jpg
[DUPLICATE FOUND] ../My_Image/682.jpg ../My_Image/681.jpg
[COPIED] ../My_Image/IMG_2239.png to ../duplicates/IMG_2239.png
[COPIED] ../My_Image/686.jpg to ../duplicates/686.jpg
[COPIED] ../My_Image/682.jpg to ../duplicates/682.jpg
