#### Removing duplicates from the dataset

Having many duplicates in the dataset leads to poor models that are biased toward the duplicated data.

In [41]:
#Import the required modules
from imutils import paths
import numpy as np
import argparse
import cv2
import os

In [42]:
# Maximum number of leading file-name characters printed for each image
# when duplicate pairs are listed.
last_index = 30

#### Calculating the hash for all the images

In [43]:
#create and calculate the hash for each image
#if two images are similar then delete one
def dhash(image, hashSize=8):
    """Compute a difference hash (dHash) for an image.

    The image is converted to grayscale and resized to ``hashSize + 1``
    columns by ``hashSize`` rows. Each pixel is then compared with its
    left-hand neighbour, giving ``hashSize * hashSize`` boolean results
    that are packed into one integer.

    Args:
        image: image array accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2GRAY``.
        hashSize: number of rows, and of comparisons per row, in the
            reduced image.

    Returns:
        An ``int`` in which bit ``i`` is set when the ``i``-th comparison
        (in flattened order) found the right pixel greater than the left.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # One extra column so that every row yields hashSize comparisons.
    shrunk = cv2.resize(grayscale, (hashSize + 1, hashSize))

    # True wherever a pixel is greater than the pixel to its left.
    greater_than_left = shrunk[:, 1:] > shrunk[:, :-1]

    # Pack the flattened comparison results into a single integer.
    fingerprint = 0
    for position, is_set in enumerate(greater_than_left.flatten()):
        if is_set:
            fingerprint |= 1 << position
    return fingerprint

#### Create a map where the key is the hash and the value is the list of image paths that share it

In [44]:
# Directory that is searched for input images
DATASET_PATH = "../../dataset"

# Maps an image hash to the list of image paths that produced it
hashes = dict()

print("[INFO] computing image hashes ....")

# Materialise the paths of all images found in the dataset directory
imagePaths = [*paths.list_images(DATASET_PATH)]



[INFO] computing image hashes ....


In [45]:
# Number of image paths collected from DATASET_PATH
len(imagePaths)


320

# Loop for finding all duplicates

* Build a map in which the key is the hash of an image
* The value is the list of images with the same hash, i.e. duplicate images

In [46]:
# Loop over the image paths and group them by their hash
for imagePath in imagePaths:
    # cv2.imread does not raise on failure: it returns None when the file
    # is missing, unreadable or not a decodable image. Skip such files so
    # that one bad file does not abort the whole pass.
    image = cv2.imread(imagePath)
    if image is None:
        print(f"[WARN] could not read image, skipping: {imagePath}")
        continue

    # Compute the hash of the loaded image
    h = dhash(image)

    # Append the current path to the list of paths stored for this hash,
    # creating the list the first time the hash is seen
    hashes.setdefault(h, []).append(imagePath)

In [47]:
# Count the hashes shared by more than one image and print, for each of
# them, the truncated file names of the first two images in the group
count = 0
for (h, hashedPaths) in hashes.items():
    # A hash with a single path has no duplicates to report
    if len(hashedPaths) <= 1:
        continue
    count += 1
    print(' | '.join(
        path.split(os.sep)[-1][0:last_index] for path in hashedPaths[:2]
    ))

# Total number of distinct hashes
print(len(hashes))

320


In [48]:
# When True, duplicate files are deleted from disk.
# When False, nothing is deleted (dry run): each group of images sharing
# a hash is shown side by side instead.
remove = True

# Loop over the image hashes
for h, hashedPaths in hashes.items():
    # Only hashes shared by more than one image describe duplicates
    if len(hashedPaths) <= 1:
        continue

    # Use truthiness rather than `remove == False`, so that only a truthy
    # flag can reach the destructive branch (a value such as None used to
    # fall through to deletion).
    if remove:
        # Keep the first image of the group and delete all the others,
        # so that exactly one copy of each duplicate remains
        for p in hashedPaths[1:]:
            print(f"Removing: {p}")
            os.remove(p)
        continue

    # Dry run: load every image of the group at a fixed 150x150 size
    tiles = []
    for p in hashedPaths:
        image = cv2.imread(p)
        # cv2.imread returns None for unreadable files; cv2.resize would
        # fail on it, so leave such files out of the montage
        if image is None:
            print(f"[WARN] could not read image, skipping: {p}")
            continue
        tiles.append(cv2.resize(image, (150, 150)))

    print(f"[INFO] hash : {h}")
    if tiles:
        # Stack the images horizontally and wait for a key press
        montage = np.hstack(tiles)
        cv2.imshow("Montage", montage)
        cv2.waitKey(0)