In [3]:
import json
import os
import hashlib
from tqdm import tqdm

# Create hashes of the trash images and save them to JSON

In [2]:
def file_hash(filepath, block_size=65536):
    """Return the SHA-256 hex digest of the file at ``filepath``.

    The file is read in chunks of ``block_size`` bytes, so large files are
    never loaded into memory all at once.

    Args:
        filepath: Path of the file to hash.
        block_size: Number of bytes read per chunk; must be positive.
            Defaults to 65536.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If ``block_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if block_size <= 0:
        # read(0) returns b'' right away, which would silently produce the
        # digest of an empty file instead of the real content.
        raise ValueError("block_size must be a positive integer")

    # Named `digest` (not `file_hash`) so the local does not shadow this function.
    digest = hashlib.sha256()
    with open(filepath, 'rb') as f:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF).
        for block in iter(lambda: f.read(block_size), b''):
            digest.update(block)
    return digest.hexdigest()

In [1]:
def get_unique_hashes(folder_path):
    """Collect the distinct content digests of the files directly inside a folder.

    Each regular file in ``folder_path`` is hashed with ``file_hash``; entries
    that are not files (e.g. sub-directories) are skipped, and the folder is
    not searched recursively.

    Args:
        folder_path: Directory whose files should be hashed.

    Returns:
        A list of digest strings without duplicates, sorted in ascending
        order so the result (and any JSON written from it) is the same on
        every run.
    """
    unique_hashes = set()

    for filename in tqdm(os.listdir(folder_path), desc="Processing images"):
        # Build the path once and reuse it for both the check and the hash.
        current_image_path = os.path.join(folder_path, filename)
        if not os.path.isfile(current_image_path):
            continue
        # The set collapses files with identical content into one digest.
        unique_hashes.add(file_hash(current_image_path))

    # Set iteration order is arbitrary; sort for a reproducible result.
    return sorted(unique_hashes)

In [16]:
def save_hashes_to_json(unique_hashes, output_json_path):
    """Write ``unique_hashes`` to ``output_json_path`` as a JSON document.

    Args:
        unique_hashes: JSON-serializable collection of digests (a list in
            this notebook).
        output_json_path: Destination file; it is created or overwritten.
    """
    destination = open(output_json_path, 'w')
    with destination:
        json.dump(unique_hashes, destination)

In [17]:
# Folder holding the trash images to fingerprint, and the JSON file that will
# receive their digests.
# NOTE(review): both are absolute, machine-specific paths; adjust before
# running this notebook elsewhere.
folder_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/images_to_remove'
output_json_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/unique_hashes.json'

# Hash every file in the folder, then persist the de-duplicated digests.
unique_hashes = get_unique_hashes(folder_path)
save_hashes_to_json(unique_hashes, output_json_path)

Processing images: 100%|██████████| 311/311 [00:01<00:00, 229.82it/s]


# Find out how many images in the whole dataset match the trash images in the images_to_remove folder

In [3]:
def find_matching_hashes(folder_path, json_path):
    """List the files in ``folder_path`` whose digest appears in a saved digest file.

    Args:
        folder_path: Directory to scan (top level only; non-file entries
            are ignored).
        json_path: JSON file containing a list of digests to match against.

    Returns:
        The names (not full paths) of the matching files, in the order the
        directory listing produced them.
    """
    # Load the reference digests into a set for fast membership tests.
    with open(json_path, 'r') as json_file:
        known_hashes = set(json.load(json_file))

    matches = []
    for entry_name in tqdm(os.listdir(folder_path), desc="Processing images"):
        candidate_path = os.path.join(folder_path, entry_name)
        if not os.path.isfile(candidate_path):
            continue
        # Keep the entry when its content digest is one of the known ones.
        if file_hash(candidate_path) in known_hashes:
            matches.append(entry_name)

    return matches

In [4]:
# Dataset folder to scan, and the digest file to match against (same path the
# trash-image hashing cell writes to).
# NOTE(review): `folder_path` and, further down, `output_json_path` reuse
# global names assigned by an earlier cell, so running cells out of order
# changes which paths are in effect.
folder_path = 'C:/Users/nello/Desktop/TESI_CODICE/dataset/public_image_set'
json_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/unique_hashes.json'

matching_images = find_matching_hashes(folder_path, json_path)

# Persist the matching file names (names only, not full paths) as a JSON list.
output_json_path = 'C:/Users/nello/Desktop/all_images_names_to_remove.json'
with open(output_json_path, 'w') as output_json_file:
    json.dump(matching_images, output_json_file)

print(f'Number of images with matching hashes: {len(matching_images)}')

Processing images: 100%|██████████| 773563/773563 [20:35<00:00, 625.92it/s] 

Number of images with matching hashes: 3392





# Find out how many images are equal to a specific trash image

In [5]:
def find_duplicate_images(given_image_path, folder_path):
    """Count the files in ``folder_path`` that have the same digest as a given file.

    Args:
        given_image_path: Path of the reference file.
        folder_path: Directory to scan (top level only; non-file entries
            are ignored).

    Returns:
        The number of files in ``folder_path`` whose ``file_hash`` equals
        that of ``given_image_path``.
    """
    # Hash the reference first, so a bad reference path fails before the scan.
    target_digest = file_hash(given_image_path)

    candidate_paths = (
        os.path.join(folder_path, name)
        for name in tqdm(os.listdir(folder_path), desc="Processing images")
    )
    # Add one for every regular file whose digest equals the reference digest.
    return sum(
        1
        for path in candidate_paths
        if os.path.isfile(path) and file_hash(path) == target_digest
    )

# Reference image, and the dataset folder in which to count files that have
# the same digest as it.
# NOTE(review): absolute, machine-specific paths; `folder_path` overwrites the
# global of the same name used by earlier cells.
given_image_path = 'C:/Users/nello/Desktop/TESI_CODICE/dataset/dataset_small/1a5s5s.jpg'
folder_path = 'C:/Users/nello/Desktop/TESI_CODICE/dataset/public_image_set' 
result = find_duplicate_images(given_image_path, folder_path)

print(f'Number of images with the same hash as the given image: {result}')

Processing images: 100%|██████████| 773563/773563 [19:22<00:00, 665.26it/s] 

Number of images with the same hash as the given image: 2888



