#### This program searches through a directory and detects images that are duplicates.
##### First it goes through all the files in the directory that are image files and extracts the files extension.
##### It then determines if the file extension is in a list of file extension types that can be processed. 
##### If the extension is in the list, the file's path is added  to a list called f_list for further processing. 
##### If the extension is not in the list, the file's path is appended to a list named bad_list and the file is not processed.
##### Then for the files in the f_list it reads in the file in as gray scale, and resizes it into a vector of length 
##### hash_length +1. It then performs a difference hash on the vector. The file name along with the hash
##### is stored in a dictionary  hash_dict, with the key=file name and the value = the hash.
##### Once the dictionary is made for all files in the  f_list the dictionary is then processed to find if any files have
##### hash values that are within 'distance' of each other. These files are then considered duplicates of each other.
##### A new dictionary dup_dict is then created. The dictionary key is a file name. The value is a list of all file names
##### that are duplicates of the key file name. The user is given the option to delete the duplicate files.
##### If duplicate files are deleted, the user is given the option of renaming the files in a numerically sequential
#####  fashion starting from a specified starting number. Note - files are renamed sequentially using
#####  zero padding, which preserves the processing order of files by python functions
#####  Note this function requires tqdm and cv2 be installed in your environment

#### Import needed modules

In [None]:
import os
import cv2
import numpy as np
from tqdm import tqdm
import shutil

#### This is the main function

In [1]:
def find_duplicate(source_dir,distance=12,hash_length=128):
    """Find near-duplicate images in source_dir and optionally delete/rename them.

    Files whose extension is not in the supported list are moved into a
    'bad files' sub directory of source_dir. Every remaining file is hashed
    with get_hash, and files whose hashes differ in at most `distance`
    positions are reported as duplicates. The user is then prompted to delete
    the duplicates and, if they were deleted, to renumber the remaining files.

    Args:
        source_dir: directory holding the images to examine.
        distance: maximum number of differing hash bits for two images to be
            treated as duplicates.
        hash_length: number of bits in each difference hash.

    Returns:
        A tuple (dup_dict, bad_list). dup_dict maps a file name to the list of
        file names considered duplicates of it, or is None when the inputs
        were invalid or no duplicates were found. bad_list holds the original
        paths of files that were moved out of source_dir because of their
        extension, plus the paths of files that could not be read as images.
    """
    bad_list=[] # paths of files with improper extensions or unreadable images
    good_exts=['jpg', 'jpe', 'jpeg', 'png', 'bmp', 'tiff']
    if not check_inputs(source_dir, good_exts):
        print('\nerrors occurred in {0} read the error messages, correct and rerun'.format(source_dir))
        return (None, bad_list)

    f_list=[] # names of files that have valid extensions and can be processed
    hash_dict={} # dictionary to store file name and its associated hash
    bad_file_dir_path=os.path.join(source_dir, 'bad files')
    for f in os.listdir(source_dir):
        f_path=os.path.join(source_dir, f)
        if not os.path.isfile(f_path): # skip sub directories
            continue
        ext=f[f.rfind('.') + 1:] # text after the last '.', like jpg or bmp
        if ext in good_exts:
            f_list.append(f)
            continue
        if not os.path.isdir(bad_file_dir_path):
            os.mkdir(bad_file_dir_path)
        # Join with the bare file name: joining with f_path would either point
        # back at the source file or at a directory that does not exist.
        dest_path=os.path.join(bad_file_dir_path, f)
        bad_list.append(f_path)
        shutil.move(f_path, dest_path)
    if len(bad_list) > 0:
        print('\n The following is a list of files removed from ', source_dir, ' and moved to the bad_files sub directory')
        for file in bad_list:
            print ('   ', file)

    unreadable=[] # files with a valid extension that could not be hashed
    for f in tqdm(f_list):
        try:
            hash_dict[f]=get_hash(f, source_dir, hash_length)
        except (cv2.error, ValueError):
            # the file has an image extension but could not be read as an image
            unreadable.append(os.path.join(source_dir, f))
    if unreadable:
        # reported after the loop so the messages do not break up the progress bar
        print('\n The following files could not be read as images and were skipped')
        for file in unreadable:
            print ('   ', file)
        bad_list.extend(unreadable)

    dup_dict=find_duplicates(hash_dict, distance)
    if len(dup_dict)<1:
        print('***** No duplicate images were found*****')
        return (None, bad_list)

    print('\nduplicate images were detected, see list below:')
    print('{0}{1:^20s}{0:3s}{2:}'.format(' ','IMAGE', 'DUPLICATE IMAGES'))
    for key,value in dup_dict.items():
        print('{0}{1:^20s}{0:3s}{2}'.format(' ', key, value))
    ans=input('Enter D to delete duplicates or press Enter to NOT delete')
    if ans in ('D', 'd'):
        removed=0
        for value in dup_dict.values():
            for f in value:
                os.remove(os.path.join(source_dir, f))
                removed=removed + 1
        print(removed, ' duplicate files were deleted')
        msg='enter an integer to rename files sequentially from this value'
        msg=msg + ' or press enter to not rename'
        ans=input(msg)
        if ans !='':
            try:
                num=int(ans)
            except ValueError:
                print('{0} is not an integer, files were not renamed'.format(ans))
            else:
                rename(source_dir, num)
    # count regular files only so the 'bad files' sub directory is not included
    remaining=[f for f in os.listdir(source_dir)
               if os.path.isfile(os.path.join(source_dir, f))]
    print(len(remaining), '  files remain in ', source_dir)
    print ('**********   PROCESS COMPLETED   **********')
    return (dup_dict, bad_list)
    
def check_inputs(source_dir, good_exts):
    """Validate that source_dir can be scanned for images.

    Prints an explanatory message and returns False when source_dir is not a
    directory, when it is empty, or when none of its entries has an extension
    (the text after the last '.') listed in good_exts. Returns True otherwise.
    """
    if not os.path.isdir(source_dir):
        print ('the directory {0} you specified does not exist'.format(source_dir))
        return False

    entries=os.listdir(source_dir)
    if len(entries)<1:
        print ('there are no files in {0}'.format(source_dir))
        return False

    # True as soon as any entry carries one of the accepted extensions
    found=any(name[name.rfind('.')+1:] in good_exts for name in entries)
    if not found:
        print ('the are no files in {0} that have image formats that can be processed'.format(source_dir))
        print ('one good ext is {0}'.format(found))
        return found
    return True
        
def get_hash(f,dir, hash_length):
    """Return the difference hash of image file f located in directory dir.

    The image is read as gray scale and resized to a single row of
    hash_length + 1 pixels. Each adjacent pixel pair contributes one
    character: '1' when the left pixel is brighter than the right one,
    otherwise '0'.

    Args:
        f: file name of the image.
        dir: directory that contains f.
        hash_length: number of characters in the returned hash.

    Returns:
        A string of hash_length characters, each '0' or '1'.

    Raises:
        ValueError: if the file cannot be read as an image.
    """
    fpath=os.path.join(dir, f)
    img=cv2.imread(fpath, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals an unreadable file by returning None, not by raising
        raise ValueError('unable to read {0} as an image'.format(fpath))
    # dsize is (width, height): one row of hash_length + 1 pixels
    img=cv2.resize(img, (hash_length+1, 1), interpolation = cv2.INTER_AREA)
    row=img[0]
    return ''.join('1' if row[col]>row[col+1] else '0' for col in range(hash_length))

def find_duplicates(hash_dict, distance):
    """Group file names whose hashes are within `distance` of each other.

    Args:
        hash_dict: dictionary mapping file name to its hash string.
        distance: maximum number of differing hash positions for a match,
            as judged by match().

    Returns:
        A dictionary mapping a file name to the list of file names that
        matched it. A file name appears at most once across all keys and
        lists; files with no match are omitted.
    """
    dup_dict={}
    seen=set() # names already used as a key or assigned to a duplicate list
    for key,value in tqdm(hash_dict.items()):
        if key in seen:
            # already reported as a duplicate of an earlier file
            continue
        dup_list=[]
        for key2,value2 in hash_dict.items():
            if key2 == key or key2 in seen:
                continue
            if match(value,value2,distance):
                dup_list.append(key2)
                seen.add(key2)
        if dup_list:
            dup_dict[key]=dup_list
        seen.add(key)
    return dup_dict
                                   
        
def match(value1, value2, distance):
    """Return True when value1 and value2 differ in at most `distance` positions.

    Positions are compared over the length of value1; the scan stops as soon
    as the number of differences exceeds distance.
    """
    differences=0
    for pos in range(len(value1)):
        if value1[pos] == value2[pos]:
            continue
        differences += 1
        if differences > distance:
            return False
    return True

def rename (dir, snum, new_ext=None):
    """Rename the files in dir to zero padded sequential numbers.

    Files are processed in sorted name order and receive the numbers snum,
    snum + 1, ... while keeping their extension ('.jfif' becomes '.jpg').
    Sub directories are left untouched. The numbers are zero padded to a
    common width so that name order matches numeric order.

    Args:
        dir: directory whose files are renamed.
        snum: integer assigned to the first file.
        new_ext: optional extension without the leading period. When given,
            every renamed file with a different extension is re-written with
            cv2 under the new extension and the old file is removed. Files
            that cv2 cannot read or write are left as they are.

    Returns:
        None
    """
    import uuid # local import: only needed to build collision-free temporary names

    names=sorted(n for n in os.listdir(dir) if os.path.isfile(os.path.join(dir, n)))
    if not names:
        return
    count=len(names)
    # wide enough for the file count and for the largest number handed out
    width=max(len(str(count)), len(str(snum + count - 1)))

    # Pass 1: park every file under a unique temporary name so that no final
    # name can clash with a file that has not been renamed yet.
    token=uuid.uuid4().hex
    staged=[] # (temporary path, extension including the period) in numbering order
    for position, name in enumerate(names):
        ext=os.path.splitext(name)[1] # '' when the name has no extension
        if ext=='.jfif': # jfif is the same as a jpg file but cv2 and other modules don't process jfif files
            ext='.jpg'
        temp_path=os.path.join(dir, '__rename_{0}_{1}'.format(token, position))
        os.rename(os.path.join(dir, name), temp_path)
        staged.append((temp_path, ext))

    # Pass 2: give each parked file its final sequential name.
    for position, (temp_path, ext) in enumerate(staged):
        new_path=os.path.join(dir, str(position + snum).zfill(width) + ext)
        os.replace(temp_path, new_path)
        if new_ext is None or ext[1:] == new_ext:
            continue
        converted=os.path.splitext(new_path)[0] + '.' + new_ext
        img=cv2.imread(new_path)
        # only drop the source once the converted image has really been written
        if img is not None and cv2.imwrite(converted, img):
            os.remove(new_path)
    return

In [2]:
# Example run: scan the folder below using a 256 bit hash and treat images
# whose hashes differ in at most 10 positions as duplicates.
source_dir = r'C:\Temp\rename tests'
distance, hash_length = 10, 256
result = find_duplicate(source_dir, distance=distance, hash_length=hash_length)


100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 131.71it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 9734.58it/s]



No improper file extensions were found

duplicate images were detected, see list below:
        IMAGE           DUPLICATE IMAGES
       001.jpg          ['11.jpg']
       002.jpg          ['12.jpg']
       003.jpg          ['13.jpg']
       004.jpg          ['14.jpg']
       005.jpg          ['15.jpg']
Enter D to delete duplicates or press Enter to NOT deleted
5  duplicate files were deleted
enter an integer to rename files sequentially from this value or press enter to not rename10
10   files remain in  C:\Temp\rename tests
**********   PROCESS COMPLETED   **********
