# Find and Delete Duplicate Images Using imagededup, ResNet50, and TensorFlow (written with the help of ChatGPT)

In [None]:
# Install the Ultralytics package with the notebook's %pip magic.
%pip install ultralytics  # install
# NOTE(review): YOLO and hub are imported here but are not referenced by any
# code visible in this notebook; only checks() is called below.
from ultralytics import YOLO, checks, hub
# Call the Ultralytics checks() helper; its return value is not used.
checks()  # checks

Install Dependencies

In [None]:
# Install the packages for the duplicate-detection cell.
# NOTE(review): of the packages listed here, the code visible in this notebook
# imports only tensorflow and keras; imageHash, torch, torchvision and
# imagededup are installed but not imported by any visible cell.
# numpy, tqdm and Pillow are imported by the main cell but not listed here --
# presumably they are preinstalled or pulled in as dependencies; confirm.
%pip install tensorflow keras imageHash torch torchvision imagededup

Main Code to run

In [None]:
import os
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import numpy as np
from PIL import Image
from keras.applications import ResNet50

# Configure every visible GPU to grow its memory allocation on demand.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # The memory-growth setting has to be the same on all GPUs, so it is
    # applied to each device in turn (an empty list simply skips the loop).
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Memory growth can only be set before the GPUs have been initialized;
    # if that moment has passed, report the error and carry on.
    print(e)

# Load the pre-trained ResNet50 model as a feature extractor.
# include_top=False drops the 1000-way ImageNet classification head and
# pooling='avg' global-average-pools the last convolutional block, so
# model.predict() returns one embedding vector per image. With the
# classification head left on, the output is a vector of class probabilities,
# and two *different* pictures of the same kind of object score as
# near-identical -- which is not what duplicate detection needs.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Image / label directory pair for each dataset split (relative paths, so they
# are resolved against the current working directory).
train_image_dir, train_label_dir = 'train/images', 'train/labels'
test_image_dir, test_label_dir = 'test/images', 'test/labels'
valid_image_dir, valid_label_dir = 'valid/images', 'valid/labels'

# Load one image file as a ResNet50-preprocessed array
def load_image(image_path):
    """Read an image from disk and prepare it for ResNet50.

    The file is opened at a target size of 416x416, converted to a NumPy
    array, and passed through ``resnet50.preprocess_input``. Per the Keras
    documentation that step converts RGB to BGR and zero-centers each channel
    against the ImageNet means; it does not rescale values to [-1, 1].

    Args:
        image_path: Path of the image file to load.

    Returns:
        The preprocessed image array returned by ``preprocess_input``.
    """
    keras_image = tf.keras.preprocessing.image
    pixels = keras_image.img_to_array(
        keras_image.load_img(image_path, target_size=(416, 416))
    )
    return tf.keras.applications.resnet50.preprocess_input(pixels)

# Define a function to calculate the similarity between two images
def calculate_similarity(image_path_1, image_path_2):
    """Return the cosine similarity between the model outputs of two images.

    Both files are loaded with ``load_image`` (which already applies the
    ResNet50 preprocessing), resized to the 224x224 input size, and pushed
    through the module-level ``model`` in a single batch. The two output
    vectors are then compared with cosine similarity.

    Args:
        image_path_1: Path of the first image file.
        image_path_2: Path of the second image file.

    Returns:
        A Python float in [-1, 1]; larger means more similar and 1.0 means
        the two output vectors point in the same direction. Returns 0.0 if
        either output vector is all zeros.
    """
    # load_image() returns data that is already preprocessed (BGR, mean
    # centered, with negative values). It must therefore NOT be cast to uint8
    # (negatives would wrap around) and must NOT be preprocessed again.
    batch = np.stack([load_image(image_path_1), load_image(image_path_2)])

    # Bilinear resizing is linear, so resizing after mean subtraction gives
    # the same result as resizing the raw pixels first.
    batch = tf.image.resize(batch, (224, 224)).numpy()

    # One forward pass for both images; verbose=0 keeps Keras from printing a
    # progress bar for every pair that gets compared.
    features = np.asarray(model.predict(batch, verbose=0), dtype=np.float64)

    # Flatten so this works whether the model emits a vector or a feature map.
    vector_1, vector_2 = features.reshape(2, -1)

    # Compute the cosine directly: tf.keras.losses.cosine_similarity is a
    # *loss* and returns the negated cosine (-1 means identical), which makes
    # a "similarity > threshold" test impossible to satisfy.
    norm_product = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
    if norm_product == 0.0:
        return 0.0
    return float(np.dot(vector_1, vector_2) / norm_product)

## Define a function to delete duplicates
def delete_duplicates(image_dir, label_dir, threshold=0.95):
    """Delete near-duplicate images, and their label files, from a directory.

    Files are processed in sorted name order. Every image is compared with
    each later image via ``calculate_similarity``; a later image whose score
    exceeds ``threshold`` is marked as a duplicate of the earlier one. The
    first image of each group is kept. Images already marked as duplicates
    are not compared again, so an image is only removed for resembling an
    image that is itself kept.

    NOTE: this compares O(n^2) pairs and each comparison runs the model, so
    it is slow on large directories.

    Args:
        image_dir: Directory containing the image files.
        label_dir: Directory containing one ``<image stem>.txt`` label file
            per image.
        threshold: Similarity score above which the later image of a pair is
            treated as a duplicate. Defaults to 0.95.

    Returns:
        The file names (not full paths) of the images that were deleted, in
        sorted order.
    """
    # Keep regular files only: os.listdir() also returns sub-directories
    # (Jupyter's .ipynb_checkpoints, for example), which cannot be loaded as
    # images.
    image_files = sorted(
        name for name in os.listdir(image_dir)
        if os.path.isfile(os.path.join(image_dir, name))
    )

    # A set, because one image can match several earlier images; keeping the
    # index once avoids deleting the same file twice (FileNotFoundError).
    duplicate_indices = set()

    # Loop over all pairs of images
    for i in tqdm(range(len(image_files))):
        if i in duplicate_indices:
            continue
        reference_path = os.path.join(image_dir, image_files[i])
        for j in range(i + 1, len(image_files)):
            if j in duplicate_indices:
                continue
            candidate_path = os.path.join(image_dir, image_files[j])
            if calculate_similarity(reference_path, candidate_path) > threshold:
                duplicate_indices.add(j)

    # Delete the duplicate images and labels
    deleted = []
    for index in tqdm(sorted(duplicate_indices)):
        name = image_files[index]
        os.remove(os.path.join(image_dir, name))

        # splitext copes with any extension length (.jpg, .jpeg, .webp, ...),
        # unlike slicing off a fixed four characters.
        label_path = os.path.join(label_dir, os.path.splitext(name)[0] + '.txt')
        try:
            os.remove(label_path)
        except FileNotFoundError:
            # The image had no label file; nothing else to clean up.
            pass
        deleted.append(name)

    return deleted

# Run the de-duplication over the train, test and valid splits, in that order,
# announcing each split before it is processed.
for message, image_dir, label_dir in (
    ("Deleting duplicates in the train directory", train_image_dir, train_label_dir),
    ("Deleting duplicates in the test directory", test_image_dir, test_label_dir),
    ("Deleting duplicates in the valid directory", valid_image_dir, valid_label_dir),
):
    print(message)
    delete_duplicates(image_dir, label_dir)
