In [5]:
from PIL import Image
import imagehash
import os
import numpy as np
from tqdm import tqdm

class DuplicateRemover:
    """Find duplicate, similar and dissimilar JPEG images inside one directory.

    Images are compared through their average hash (``imagehash.average_hash``).
    The number of differing hash bits is used as the distance between two
    images: identical hashes are treated as duplicates, few differing bits as
    "similar" and many differing bits as "un-similar".

    Parameters
    ----------
    dirname : str
        Directory whose entries ending in ``.jpg`` are scanned (not recursive).
    hash_size : int, optional
        Size handed to ``imagehash.average_hash``. The thresholds below are
        computed against ``hash_size ** 2`` bits.
    """

    def __init__(self, dirname, hash_size=8):
        self.dirname = dirname
        self.hash_size = hash_size

    # ------------------------------------------------------------------
    # Private helpers shared by all public methods
    # ------------------------------------------------------------------
    def _jpg_names(self):
        """Return the entries of ``self.dirname`` whose name ends with ``.jpg``."""
        return [name for name in os.listdir(self.dirname) if name.endswith('.jpg')]

    def _average_hash(self, fname):
        """Open ``fname`` (relative to ``self.dirname``) and return its average hash."""
        with Image.open(os.path.join(self.dirname, fname)) as img:
            return imagehash.average_hash(img, self.hash_size)

    def _bit_limit(self, percent):
        """Return ``percent`` % of ``hash_size ** 2`` bits, truncated to an int.

        The multiplication happens before the division so that exact results
        are not lost to float rounding (e.g. ``int((1 - 90/100) * 100)`` is 9,
        not 10).
        """
        return int(percent * self.hash_size ** 2 / 100)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def find_duplicates(self):
        """Report images whose average hash equals that of an earlier image.

        The first file seen with a given hash is kept as the reference; every
        later file with the same hash is reported as its duplicate. Nothing is
        deleted: the duplicate count is printed and the file names are
        returned so the caller can decide what to do with them.

        Returns
        -------
        list of str
            File names (relative to ``self.dirname``) flagged as duplicates.
        """
        fnames = self._jpg_names()
        first_seen = {}
        duplicates = []
        print("Finding Duplicates Now!\n")
        for image in fnames:
            temp_hash = self._average_hash(image)
            if temp_hash in first_seen:
                print("Duplicate {} \nfound for Image {}!\n".format(image, first_seen[temp_hash]))
                duplicates.append(image)
            else:
                first_seen[temp_hash] = image

        print(len(duplicates))
        return duplicates

    def find_similar(self, location, similarity=80):
        """Return the images that are at least ``similarity`` % similar to ``location``.

        Parameters
        ----------
        location : str
            File name of the reference image, relative to ``self.dirname``.
        similarity : int or float, optional
            Required similarity in percent. An image matches when at most
            ``(100 - similarity)`` % of its hash bits differ from the reference.

        Returns
        -------
        list of str
            Matching file names; the reference image itself is never included.
        """
        fnames = self._jpg_names()
        diff_limit = self._bit_limit(100 - similarity)
        hash1 = self._average_hash(location).hash

        res = []
        print("Finding Similar Images to {} Now!".format(location))
        for image in fnames:
            if image == location:
                continue
            hash2 = self._average_hash(image).hash
            if np.count_nonzero(hash1 != hash2) <= diff_limit:
                print("{} image found {}% similar to {}".format(image, similarity, location))
                res.append(image)
        return res

    def find_similar_all(self, tot_size, similarity=80):
        """Run :meth:`find_similar` for ``train_00000.jpg`` .. ``train_<tot_size - 1>.jpg``.

        A target that was already reported as similar to an earlier target is
        skipped, so every group of similar images is searched only once.

        Parameters
        ----------
        tot_size : int
            Number of consecutive ``train_XXXXX.jpg`` indices to visit.
        similarity : int or float, optional
            Similarity percentage forwarded to :meth:`find_similar`. (It used
            to be ignored in favour of a hard-coded 95.)

        Returns
        -------
        list of str
            Concatenation of all :meth:`find_similar` results, in visit order.
        """
        similars = []
        already_matched = set()  # O(1) membership test instead of scanning the list

        for i in tqdm(range(tot_size)):
            target = f'train_{i:05d}.jpg'
            if target in already_matched:
                continue

            # Use this instance, not whatever global happens to be called `dr`.
            res = self.find_similar(target, similarity)
            similars.extend(res)
            already_matched.update(res)
        return similars

    def find_Unsimilar(self, location, Unsimilarity=80):
        """Return the images that are at least ``Unsimilarity`` % different from ``location``.

        Parameters
        ----------
        location : str
            File name of the reference image, relative to ``self.dirname``.
        Unsimilarity : int or float, optional
            Required dissimilarity in percent. An image matches when at least
            ``Unsimilarity`` % of its hash bits differ from the reference.

        Returns
        -------
        list of str
            Matching file names; the reference image itself is never included.
        """
        fnames = self._jpg_names()
        diff_limit = self._bit_limit(Unsimilarity)
        hash1 = self._average_hash(location).hash

        res = []
        print("Finding Un-Similar Images to {} Now!".format(location))
        for image in fnames:
            if image == location:
                continue
            hash2 = self._average_hash(image).hash
            if np.count_nonzero(hash1 != hash2) >= diff_limit:
                print("{} image found {}% Un-similar to {}".format(image, Unsimilarity, location))
                res.append(image)
        return res


# Duplicate Images

In [2]:
# # Remove Duplicates
# dirname = './classify'

# dr = DuplicateRemover(dirname)
# dr.find_duplicates()

# Similar Images

In [6]:
# List the images in one class folder that differ strongly from a reference image
dirname = './classify'
class_dir = '{}/class_{}'.format(dirname, 1)

dr = DuplicateRemover(class_dir)
dr.find_Unsimilar('train_00001.jpg', 80)

Finding Un-Similar Images to train_00001.jpg Now!
train_00021.jpg image found 80% Un-similar to train_00001.jpg
train_00035.jpg image found 80% Un-similar to train_00001.jpg
train_00047.jpg image found 80% Un-similar to train_00001.jpg
train_00056.jpg image found 80% Un-similar to train_00001.jpg
train_00066.jpg image found 80% Un-similar to train_00001.jpg
train_00072.jpg image found 80% Un-similar to train_00001.jpg
train_00096.jpg image found 80% Un-similar to train_00001.jpg
train_00112.jpg image found 80% Un-similar to train_00001.jpg
train_00122.jpg image found 80% Un-similar to train_00001.jpg
train_00148.jpg image found 80% Un-similar to train_00001.jpg
train_00165.jpg image found 80% Un-similar to train_00001.jpg
train_00176.jpg image found 80% Un-similar to train_00001.jpg
train_00203.jpg image found 80% Un-similar to train_00001.jpg
train_00209.jpg image found 80% Un-similar to train_00001.jpg
train_00216.jpg image found 80% Un-similar to train_00001.jpg
train_00269.jpg imag

['train_00021.jpg',
 'train_00035.jpg',
 'train_00047.jpg',
 'train_00056.jpg',
 'train_00066.jpg',
 'train_00072.jpg',
 'train_00096.jpg',
 'train_00112.jpg',
 'train_00122.jpg',
 'train_00148.jpg',
 'train_00165.jpg',
 'train_00176.jpg',
 'train_00203.jpg',
 'train_00209.jpg',
 'train_00216.jpg',
 'train_00269.jpg',
 'train_00275.jpg',
 'train_00278.jpg',
 'train_00284.jpg',
 'train_00301.jpg',
 'train_00308.jpg',
 'train_00326.jpg',
 'train_00451.jpg',
 'train_00486.jpg',
 'train_00575.jpg',
 'train_00579.jpg',
 'train_00596.jpg',
 'train_00636.jpg',
 'train_00718.jpg',
 'train_00960.jpg',
 'train_01024.jpg',
 'train_01074.jpg',
 'train_01087.jpg',
 'train_01104.jpg',
 'train_01157.jpg',
 'train_01297.jpg',
 'train_01339.jpg',
 'train_01341.jpg',
 'train_01349.jpg',
 'train_01431.jpg',
 'train_01445.jpg',
 'train_01497.jpg',
 'train_01498.jpg',
 'train_01594.jpg',
 'train_01601.jpg',
 'train_01627.jpg',
 'train_01695.jpg',
 'train_01778.jpg',
 'train_01791.jpg',
 'train_01886.jpg',
