In [10]:
import json
import os
import hashlib
from tqdm import tqdm
import shutil
import cv2
import numpy as np

## Restore the all_duplicates list from all_duplicates.json

In [6]:
def restore_duplicates_from_json(duplicates_file_path=None):
    """
    Restores the list of duplicates from the json file.

    Args:
        duplicates_file_path: Optional path of the JSON file to read. When
            omitted, the original hard-coded all_duplicates.json location is
            used, so existing zero-argument calls behave exactly as before.

    Returns:
        The object decoded from the JSON file (a list of image names in the
        run recorded in this notebook).

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if duplicates_file_path is None:
        duplicates_file_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates/all_duplicates.json'

    print("Restoring duplicates from json file...")
    with open(duplicates_file_path, 'r') as file:
        all_duplicates = json.load(file)
    print("Done")
    return all_duplicates

In [3]:
# Load the duplicate image names and preview the first ten entries.
all_duplicates = restore_duplicates_from_json()
print(all_duplicates[:10])

Restoring duplicates from json file...
Done
['cb2m58f', '8wppd2', '2lf5ko', 'cawnmss', '6h47ct', 'd3mhh6a', '13ltj1', 'cjbnia', 'chc3rfz', 'cbp2yz4']


## Copy the duplicate images listed in all_duplicates into a destination folder called all_duplicates_images

In [5]:
def copy_images(image_names, source_folder, destination_folder):
    """
    Copy ``<name>.jpg`` for every name in ``image_names`` between two folders.

    Args:
        image_names: Iterable of image names without the ``.jpg`` extension.
        source_folder: Folder the images are read from.
        destination_folder: Folder the images are copied into. It is created
            (including parents) if it does not exist yet.

    Returns:
        None. One status line is printed per image; images missing from
        ``source_folder`` are reported and skipped.
    """
    # Create the destination up front: without it shutil.copyfile raises
    # FileNotFoundError for the *destination* path, which the handler below
    # would misreport as a missing source image for every single file.
    os.makedirs(destination_folder, exist_ok=True)

    for image_name in image_names:
        source_path = os.path.join(source_folder, f"{image_name}.jpg")
        destination_path = os.path.join(destination_folder, f"{image_name}.jpg")

        try:
            shutil.copyfile(source_path, destination_path)
            print(f"Successfully copied {image_name}.jpg")
        except FileNotFoundError:
            print(f"Error: {image_name}.jpg not found in {source_folder}")

In [8]:
# Folder the images are read from, and folder the duplicate images are copied into.
source_folder = "C:/Users/nello/Desktop/TESI_CODICE/dataset/public_image_set"
destination_folder = "C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/all_duplicates_images"

# Create the destination if needed, then copy every image named in all_duplicates.
os.makedirs(destination_folder, exist_ok=True)
copy_images(all_duplicates, source_folder, destination_folder)

Successfully copied cb2m58f.jpg
Successfully copied 8wppd2.jpg
Successfully copied 2lf5ko.jpg
Successfully copied cawnmss.jpg
Successfully copied 6h47ct.jpg
Successfully copied d3mhh6a.jpg
Successfully copied 13ltj1.jpg
Successfully copied cjbnia.jpg
Successfully copied chc3rfz.jpg
Successfully copied cbp2yz4.jpg
Successfully copied d6px9g.jpg
Successfully copied chuxits.jpg
Successfully copied 199h1k.jpg
Successfully copied csb6xx.jpg
Successfully copied cj6w5h.jpg
Successfully copied e5fi6f9.jpg
Successfully copied aze0h1.jpg
Successfully copied zwoue.jpg
Successfully copied 15q4c9.jpg
Successfully copied e1gn3ci.jpg
Successfully copied cov4mbc.jpg
Successfully copied 1lklu5.jpg
Successfully copied cczrjnv.jpg
Successfully copied cm1s01o.jpg
Successfully copied 4dq8wq.jpg
Successfully copied 2lqial.jpg
Successfully copied dewujyh.jpg
Successfully copied cz547kb.jpg
Successfully copied f3swcw3.jpg
Successfully copied celsp56.jpg
Successfully copied cjl7kbt.jpg
Successfully copied diox

## Detect images containing 'humans' inside the all_duplicates_images folder

In [30]:
# Load the YOLOv3 network from its weights/config files (looked up in the
# current working directory).
net = cv2.dnn.readNet('yolov3.weights', 'yolov3.cfg')

# Class labels, one per line of coco.names, with surrounding whitespace removed.
classes = []
with open('coco.names', 'r') as f:
    classes = [label.strip() for label in f]

# Names of the network's unconnected output layers, passed to net.forward().
output_layers = net.getUnconnectedOutLayersNames()

def detect_human_yolo(image_path, confidence_threshold=0.7):
    """
    Report whether the YOLO network detects a person in an image.

    Relies on the module-level ``net`` and ``output_layers`` objects.

    Args:
        image_path: Path of the image file to analyse.
        confidence_threshold: A detection counts only when its best class
            score is strictly greater than this value.

    Returns:
        True if at least one detection has class id 0 as its best class with
        a score above the threshold; False otherwise. False is also returned
        when the image cannot be read or inference fails, but a warning is
        printed so such images are not silently treated as "no human".
    """
    try:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread does not raise on a missing/corrupt file, it returns
            # None; report it instead of letting blobFromImage fail obscurely.
            print(f"Warning: could not read image {image_path}")
            return False

        # Scale pixels by 1/255, resize to 416x416 and swap the R/B channels.
        blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=True, crop=False)
        net.setInput(blob)
        # Forward pass: one detections array per output layer.
        detections = net.forward(output_layers)

        for detection in detections:
            # Columns 5 onward are treated as per-class scores (presumably the
            # first five are box coordinates and objectness - TODO confirm).
            scores = detection[:, 5:]
            class_ids = np.argmax(scores, axis=1)
            confidences = scores[np.arange(len(class_ids)), class_ids]

            # Keep rows whose best class is 0 ("person" in coco.names) and
            # whose score exceeds the threshold.
            mask = (confidences > confidence_threshold) & (class_ids == 0)
            if np.any(mask):
                return True
        # No human detected in any output layer.
        return False

    except Exception as e:
        # Best-effort: a failing image must not abort a long batch run, but the
        # failure is reported rather than swallowed.
        print(f"Warning: human detection failed for {image_path}: {e}")
        return False


Single-image inference test

In [None]:
# Quick sanity check of the detector on one image from the duplicates folder.
image_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/all_duplicates_images/1ogg8g.jpg'

print(
    "Human detected in the image."
    if detect_human_yolo(image_path)
    else "No human detected in the image."
)


In [None]:
def export_to_json(data, json_file_path):
    """
    Write ``data`` to ``json_file_path`` as JSON indented by four spaces.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(json_file_path, 'w') as out:
        json.dump(obj=data, fp=out, indent=4)

In [None]:
def process_batch(image_files, folder_path, confidence_threshold, batch_index,
                  output_directory='C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans/'):
    """
    Run human detection on one batch of images and export the hits to JSON.

    Args:
        image_files: File names (relative to ``folder_path``) in this batch.
        folder_path: Folder that contains the image files.
        confidence_threshold: Threshold forwarded to ``detect_human_yolo``.
        batch_index: Index of the batch; used in the output file name.
        output_directory: Folder the per-batch JSON file is written to. It is
            created if missing. Defaults to the location previously hard-coded
            in this function, so existing four-argument calls are unaffected.

    Returns:
        None. Writes ``duplicates_containing_humans_batch_<batch_index>.json``
        containing the names (without extension) of the images in which a
        human was detected, and prints a short report.
    """
    batch_results = []
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        if detect_human_yolo(image_path, confidence_threshold):
            # Store the name without its extension, e.g. "abc.jpg" -> "abc".
            batch_results.append(os.path.splitext(image_file)[0])

    # Create the directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Export the list to a JSON file for this batch
    json_file_path = os.path.join(output_directory, f'duplicates_containing_humans_batch_{batch_index}.json')
    export_to_json(batch_results, json_file_path)
    print(f"Batch {batch_index} - Images containing humans: {batch_results}")
    print(f"Exported to {json_file_path}")

In [None]:
def collect_images_with_human_presence_batch(folder_path, batch_size=10, confidence_threshold=0.7):
    """
    Run human detection over every file in ``folder_path``, batch by batch.

    Args:
        folder_path: Folder whose files are analysed.
        batch_size: Number of files handed to ``process_batch`` at a time.
        confidence_threshold: Threshold forwarded to ``process_batch``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # os.listdir returns entries in arbitrary order; sorting keeps the content
    # of batch N stable between runs. Sub-directories are skipped because they
    # can never be decoded as images.
    image_files = sorted(
        entry for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )

    # Process images in batches
    batch_starts = range(0, len(image_files), batch_size)
    for batch_index, start in enumerate(tqdm(batch_starts, desc='Batches', unit='batch')):
        batch_files = image_files[start: start + batch_size]
        process_batch(batch_files, folder_path, confidence_threshold, batch_index)

In [None]:
def merge_json_files(merged_json_path, folder_path):
    """
    Concatenate the lists stored in every ``.json`` file of a folder.

    Args:
        merged_json_path: Path the merged list is written to.
        folder_path: Folder scanned for ``.json`` files to merge.

    Returns:
        None. The merged list is written via ``export_to_json`` and a
        confirmation line is printed.
    """
    # The merged file may live inside folder_path itself. It must be skipped,
    # otherwise re-running the merge would re-ingest the previous output and
    # duplicate every entry.
    merged_target = os.path.normcase(os.path.abspath(merged_json_path))

    merged_data = []
    # Sorted so the order of the merged entries is the same on every run
    # (lexicographic by file name; os.listdir order is arbitrary).
    for json_file in sorted(os.listdir(folder_path)):
        if not json_file.endswith('.json'):
            continue

        json_file_path = os.path.join(folder_path, json_file)
        if os.path.normcase(os.path.abspath(json_file_path)) == merged_target:
            continue
        if not os.path.isfile(json_file_path):
            continue

        with open(json_file_path, 'r') as f:
            merged_data.extend(json.load(f))

    export_to_json(merged_data, merged_json_path)
    print(f"Merged data exported to {merged_json_path}")

In [31]:
# Folder scanned for images, and the parameters of the detection run.
dataset_path = "C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/all_duplicates_images"
batch_size = 500  # number of images handed to process_batch at a time
confidence_threshold = 0.7  # score threshold passed down to the human detector

# In batches, export one JSON list per batch with the images containing humans.
collect_images_with_human_presence_batch(dataset_path, batch_size=batch_size, confidence_threshold=confidence_threshold)

# Combine the per-batch JSON files into one list; the merged file is written
# into the same folder that is scanned for the batch files.
merged_json_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans/merged_duplicates_containing_humans.json'
merge_json_files(merged_json_path, 'C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans/')

Batches:   1%|          | 1/99 [02:33<4:10:17, 153.24s/batch]

0


Batches:   2%|▏         | 2/99 [05:03<4:04:56, 151.51s/batch]

1


Batches:   3%|▎         | 3/99 [07:37<4:03:53, 152.43s/batch]

2


Batches:   4%|▍         | 4/99 [10:09<4:01:15, 152.37s/batch]

3


Batches:   5%|▌         | 5/99 [12:39<3:57:29, 151.59s/batch]

4


Batches:   6%|▌         | 6/99 [15:09<3:54:18, 151.16s/batch]

5


Batches:   7%|▋         | 7/99 [17:40<3:51:39, 151.08s/batch]

6


Batches:   8%|▊         | 8/99 [20:11<3:49:05, 151.05s/batch]

7


Batches:   9%|▉         | 9/99 [22:42<3:46:32, 151.03s/batch]

8


Batches:  10%|█         | 10/99 [25:12<3:43:38, 150.77s/batch]

9


Batches:  11%|█         | 11/99 [27:42<3:40:45, 150.52s/batch]

10


Batches:  12%|█▏        | 12/99 [30:13<3:38:05, 150.41s/batch]

11


Batches:  13%|█▎        | 13/99 [32:43<3:35:23, 150.27s/batch]

12


Batches:  14%|█▍        | 14/99 [35:13<3:32:46, 150.20s/batch]

13


Batches:  15%|█▌        | 15/99 [37:43<3:30:30, 150.36s/batch]

14


Batches:  16%|█▌        | 16/99 [40:14<3:27:59, 150.35s/batch]

15


Batches:  17%|█▋        | 17/99 [42:44<3:25:25, 150.31s/batch]

16


Batches:  18%|█▊        | 18/99 [45:14<3:23:00, 150.38s/batch]

17


Batches:  19%|█▉        | 19/99 [47:45<3:20:31, 150.40s/batch]

18


Batches:  20%|██        | 20/99 [50:15<3:17:59, 150.37s/batch]

19


Batches:  21%|██        | 21/99 [52:46<3:15:39, 150.51s/batch]

20


Batches:  22%|██▏       | 22/99 [55:16<3:13:07, 150.48s/batch]

21


Batches:  23%|██▎       | 23/99 [57:47<3:10:32, 150.43s/batch]

22


Batches:  24%|██▍       | 24/99 [1:00:17<3:08:10, 150.54s/batch]

23


Batches:  25%|██▌       | 25/99 [1:02:48<3:05:33, 150.45s/batch]

24


Batches:  26%|██▋       | 26/99 [1:05:18<3:03:02, 150.44s/batch]

25


Batches:  27%|██▋       | 27/99 [1:07:49<3:00:43, 150.61s/batch]

26


Batches:  28%|██▊       | 28/99 [1:10:19<2:58:03, 150.47s/batch]

27


Batches:  29%|██▉       | 29/99 [1:12:50<2:55:28, 150.41s/batch]

28


Batches:  30%|███       | 30/99 [1:15:20<2:52:59, 150.43s/batch]

29


Batches:  31%|███▏      | 31/99 [1:17:50<2:50:28, 150.42s/batch]

30


Batches:  32%|███▏      | 32/99 [1:20:21<2:47:57, 150.41s/batch]

31


Batches:  33%|███▎      | 33/99 [1:22:51<2:45:28, 150.44s/batch]

32


Batches:  34%|███▍      | 34/99 [1:25:20<2:42:31, 150.03s/batch]

33


Batches:  35%|███▌      | 35/99 [1:27:50<2:39:45, 149.77s/batch]

34


Batches:  36%|███▋      | 36/99 [1:30:19<2:37:01, 149.55s/batch]

35


Batches:  37%|███▋      | 37/99 [1:32:48<2:34:23, 149.41s/batch]

36


Batches:  38%|███▊      | 38/99 [1:35:17<2:31:44, 149.25s/batch]

37


Batches:  39%|███▉      | 39/99 [1:37:49<2:30:12, 150.20s/batch]

38


Batches:  40%|████      | 40/99 [1:40:30<2:30:51, 153.42s/batch]

39


Batches:  41%|████▏     | 41/99 [1:43:03<2:28:11, 153.31s/batch]

40


Batches:  42%|████▏     | 42/99 [1:45:31<2:24:07, 151.72s/batch]

41


Batches:  43%|████▎     | 43/99 [1:47:54<2:19:13, 149.17s/batch]

42


Batches:  44%|████▍     | 44/99 [1:50:17<2:15:03, 147.34s/batch]

43


Batches:  45%|████▌     | 45/99 [1:52:47<2:13:17, 148.11s/batch]

44


Batches:  46%|████▋     | 46/99 [1:55:11<2:09:49, 146.98s/batch]

45


Batches:  47%|████▋     | 47/99 [1:57:40<2:07:42, 147.35s/batch]

46


Batches:  48%|████▊     | 48/99 [2:00:14<2:07:02, 149.46s/batch]

47


Batches:  49%|████▉     | 49/99 [2:02:47<2:05:20, 150.40s/batch]

48


Batches:  51%|█████     | 50/99 [2:05:22<2:04:07, 151.98s/batch]

49


Batches:  52%|█████▏    | 51/99 [2:07:44<1:59:07, 148.90s/batch]

50


Batches:  53%|█████▎    | 52/99 [2:10:05<1:54:51, 146.62s/batch]

51


Batches:  54%|█████▎    | 53/99 [2:12:41<1:54:35, 149.46s/batch]

52


Batches:  55%|█████▍    | 54/99 [2:15:30<1:56:18, 155.09s/batch]

53


Batches:  56%|█████▌    | 55/99 [2:18:09<1:54:42, 156.42s/batch]

54


Batches:  57%|█████▋    | 56/99 [2:20:44<1:51:47, 155.98s/batch]

55


Batches:  58%|█████▊    | 57/99 [2:23:21<1:49:23, 156.26s/batch]

56


Batches:  59%|█████▊    | 58/99 [2:26:05<1:48:21, 158.57s/batch]

57


Batches:  60%|█████▉    | 59/99 [2:28:55<1:47:58, 161.95s/batch]

58


Batches:  61%|██████    | 60/99 [2:31:47<1:47:15, 165.01s/batch]

59


Batches:  62%|██████▏   | 61/99 [2:34:45<1:47:02, 169.01s/batch]

60


Batches:  63%|██████▎   | 62/99 [2:37:31<1:43:38, 168.06s/batch]

61


Batches:  64%|██████▎   | 63/99 [2:40:18<1:40:41, 167.83s/batch]

62


Batches:  65%|██████▍   | 64/99 [2:43:16<1:39:36, 170.75s/batch]

63


Batches:  66%|██████▌   | 65/99 [2:45:52<1:34:17, 166.39s/batch]

64


Batches:  67%|██████▋   | 66/99 [2:48:27<1:29:31, 162.76s/batch]

65


Batches:  68%|██████▊   | 67/99 [2:51:00<1:25:16, 159.90s/batch]

66


Batches:  69%|██████▊   | 68/99 [2:53:40<1:22:43, 160.10s/batch]

67


Batches:  70%|██████▉   | 69/99 [2:56:16<1:19:20, 158.67s/batch]

68


Batches:  71%|███████   | 70/99 [2:58:52<1:16:19, 157.93s/batch]

69


Batches:  72%|███████▏  | 71/99 [3:01:30<1:13:39, 157.84s/batch]

70


Batches:  73%|███████▎  | 72/99 [3:04:05<1:10:44, 157.21s/batch]

71


Batches:  74%|███████▎  | 73/99 [3:06:40<1:07:44, 156.35s/batch]

72


Batches:  75%|███████▍  | 74/99 [3:09:17<1:05:14, 156.59s/batch]

73


Batches:  76%|███████▌  | 75/99 [3:11:54<1:02:45, 156.91s/batch]

74


Batches:  77%|███████▋  | 76/99 [3:14:29<59:52, 156.19s/batch]  

75


Batches:  78%|███████▊  | 77/99 [3:17:05<57:12, 156.01s/batch]

76


Batches:  79%|███████▉  | 78/99 [3:19:46<55:12, 157.74s/batch]

77


Batches:  80%|███████▉  | 79/99 [3:22:21<52:15, 156.79s/batch]

78


Batches:  81%|████████  | 80/99 [3:24:58<49:41, 156.94s/batch]

79


Batches:  82%|████████▏ | 81/99 [3:27:32<46:49, 156.06s/batch]

80


Batches:  83%|████████▎ | 82/99 [3:30:07<44:07, 155.72s/batch]

81


Batches:  84%|████████▍ | 83/99 [3:32:42<41:27, 155.45s/batch]

82


Batches:  85%|████████▍ | 84/99 [3:35:19<38:58, 155.87s/batch]

83


Batches:  86%|████████▌ | 85/99 [3:38:00<36:44, 157.47s/batch]

84


Batches:  87%|████████▋ | 86/99 [3:40:44<34:32, 159.43s/batch]

85


Batches:  88%|████████▊ | 87/99 [3:43:23<31:50, 159.17s/batch]

86


Batches:  89%|████████▉ | 88/99 [3:46:00<29:05, 158.71s/batch]

87


Batches:  90%|████████▉ | 89/99 [3:48:41<26:33, 159.37s/batch]

88


Batches:  91%|█████████ | 90/99 [3:51:24<24:05, 160.57s/batch]

89


Batches:  92%|█████████▏| 91/99 [3:54:05<21:24, 160.57s/batch]

90


Batches:  93%|█████████▎| 92/99 [3:56:53<19:00, 162.87s/batch]

91


Batches:  94%|█████████▍| 93/99 [3:59:38<16:20, 163.44s/batch]

92


Batches:  95%|█████████▍| 94/99 [4:02:23<13:39, 163.81s/batch]

93


Batches:  96%|█████████▌| 95/99 [4:05:09<10:58, 164.54s/batch]

94


Batches:  97%|█████████▋| 96/99 [4:07:57<08:16, 165.51s/batch]

95


Batches:  98%|█████████▊| 97/99 [4:10:45<05:32, 166.22s/batch]

96


Batches:  99%|█████████▉| 98/99 [4:13:19<02:42, 162.54s/batch]

97


Batches: 100%|██████████| 99/99 [4:14:24<00:00, 154.19s/batch]

98





Merged data exported to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans/merged_duplicates_containing_humans.json


In [4]:
# Load the merged list and report how many images were flagged as containing
# humans. The result is deliberately NOT stored in a variable called `json`:
# doing so would rebind the imported json module to a list and make every
# later json.load(...) call fail with AttributeError.
with open('C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans/merged_duplicates_containing_humans.json', 'r') as file:
    merged_duplicates_containing_humans = json.load(file)

print(len(merged_duplicates_containing_humans))

14547


## Create a folder duplicates_containing_humans_images to check the quality of the detection

In [8]:
def restore_duplicates_from_json(duplicates_file_path=None):
    """
    Restores the list of duplicates containing humans from the json file.

    NOTE: this definition replaces the function of the same name defined
    earlier in the notebook, which read all_duplicates.json instead.

    Args:
        duplicates_file_path: Optional path of the JSON file to read. When
            omitted, the original hard-coded
            merged_duplicates_containing_humans.json location is used, so
            existing zero-argument calls behave exactly as before.

    Returns:
        The object decoded from the JSON file.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if duplicates_file_path is None:
        duplicates_file_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans/merged_duplicates_containing_humans.json'

    print("Restoring duplicates from json file...")
    with open(duplicates_file_path, 'r') as file:
        duplicates = json.load(file)
    print("Done")
    return duplicates

In [11]:
# Copy the images flagged as containing humans into their own folder so the
# detections can be inspected.
source_folder = "C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/all_duplicates_images"
destination_folder = "C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans_images"

os.makedirs(destination_folder, exist_ok=True)

# Names come from the merged JSON file read by restore_duplicates_from_json.
duplicates_containing_humans = restore_duplicates_from_json()
copy_images(duplicates_containing_humans, source_folder, destination_folder)

Restoring duplicates from json file...
Done
Successfully copied 103crd.jpg
Successfully copied 103neu.jpg
Successfully copied 103ozy.jpg
Successfully copied 106yw5.jpg
Successfully copied 107zkt.jpg
Successfully copied 10c49y.jpg
Successfully copied 10hfrt.jpg
Successfully copied 10ivdc.jpg
Successfully copied 10kltt.jpg
Successfully copied 10piyf.jpg
Successfully copied 10qvbv.jpg
Successfully copied 10re2h.jpg
Successfully copied 10rqk4.jpg
Successfully copied 10ttd2.jpg
Successfully copied 10ujav.jpg
Successfully copied 10unyp.jpg
Successfully copied 10vnl0.jpg
Successfully copied 10x5l9.jpg
Successfully copied 110bop.jpg
Successfully copied 110hkd.jpg
Successfully copied 1139mc.jpg
Successfully copied 115pxg.jpg
Successfully copied 11ef8b.jpg
Successfully copied 11eul6.jpg
Successfully copied 11grh4.jpg
Successfully copied 11icsb.jpg
Successfully copied 11k0bv.jpg
Successfully copied 11nknl.jpg
Successfully copied 11os5t.jpg
Successfully copied 11s1da.jpg
Successfully copied 11sm3d

## Copy all the duplicates that do not contain humans from the all_duplicates_images folder into duplicates_without_humans_images

In [16]:
import os
import shutil
import json

all_duplicates_images_path = "C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/all_duplicates_images"
duplicates_without_humans_images_path = "C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images"
json_file_path = "C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans/merged_duplicates_containing_humans.json"
os.makedirs(duplicates_without_humans_images_path, exist_ok=True)

# Read the JSON file
with open(json_file_path) as json_file:
    data = json.load(json_file)

# The merged file produced by merge_json_files is a plain JSON list of image
# names. Testing `"filenames" in data` on such a list is a membership test, not
# a key lookup, and used to yield an empty exclusion list (so images with
# humans were copied as well). Accept the plain list, and still support a
# {"filenames": [...]} object.
if isinstance(data, dict):
    json_filenames = data.get("filenames", [])
else:
    json_filenames = data

if not isinstance(json_filenames, list):
    raise ValueError("The 'filenames' key in the JSON file should contain a list of filenames.")

# Set membership keeps the per-file check cheap for large lists.
excluded_names = set(json_filenames)

# Iterate through files in the all-duplicates folder
for filename in os.listdir(all_duplicates_images_path):
    file_path = os.path.join(all_duplicates_images_path, filename)

    # Copy only regular files whose name (without extension) is not in the
    # list of images containing humans.
    if os.path.isfile(file_path) and os.path.splitext(filename)[0] not in excluded_names:
        shutil.copy(file_path, os.path.join(duplicates_without_humans_images_path, filename))


In [17]:
import os
import shutil
import json

def load_excluded_image_names(json_file_path):
    """
    Read a JSON file and return its items as a set.

    Args:
        json_file_path: Path of the JSON file holding the names to exclude.

    Returns:
        A set built from the decoded JSON content.
    """
    with open(json_file_path, 'r') as handle:
        names = json.load(handle)
    return set(names)

def copy_images(folder_a, folder_b, excluded_image_names):
    """
    Copy every ``.jpg`` in ``folder_a`` whose name is not excluded to ``folder_b``.

    NOTE: this redefines ``copy_images`` with a different signature than the
    ``copy_images(image_names, source_folder, destination_folder)`` defined
    earlier in the notebook; after this cell runs, the earlier one is shadowed.

    Args:
        folder_a: Folder the images are read from.
        folder_b: Existing folder the images are copied into.
        excluded_image_names: Collection of names (without ``.jpg``) to skip.
    """
    for entry in os.listdir(folder_a):
        if not entry.endswith('.jpg'):
            continue
        # entry[:-4] drops the ".jpg" suffix to get the bare image name.
        if entry[:-4] in excluded_image_names:
            continue
        shutil.copyfile(os.path.join(folder_a, entry), os.path.join(folder_b, entry))
        print(f"Copying {entry} to {folder_b}")

# Source folder (all duplicates), target folder (duplicates without humans) and
# the JSON list of image names that contain humans.
folder_a_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/all_duplicates_images'
folder_b_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images'
json_file_path = 'C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_containing_humans/merged_duplicates_containing_humans.json'

# Uses the three-argument copy_images defined in this cell: every .jpg whose
# name is not in the excluded set is copied to folder_b_path.
excluded_image_names = load_excluded_image_names(json_file_path)
copy_images(folder_a_path, folder_b_path, excluded_image_names)


Copying 100qm6.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 101j7x.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 101pol.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 1028p3.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 102dal.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 102udo.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 1032yi.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 103577.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 1036sy.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/duplicates_without_humans_images
Copying 103a9s.jpg to C:/Users/nello/Desktop/TESI_CODICE/EDA/method_1/dup