In [1]:
import csv
import http.client
import os
import urllib
import urllib.request

from pycocotools.coco import COCO
from tqdm import tqdm


#### Modify the cell below to point at the folder you prefer (the training data will be stored there)

In [2]:
# Root folder of the dataset. The download cells below write images to
# "<DATA_ROOT_FOLDER>/data/person" and "<DATA_ROOT_FOLDER>/data/notperson".
# The value is machine-specific (and contains a space), so change it before running.
DATA_ROOT_FOLDER = "/media/edge7/TOSHIBA EXT"

### Get the COCO annotation file and the IMDb-Face CSV (two well-known datasets)

In [3]:
# Run the helper script with the data root as its only argument; the argument is
# quoted because the path may contain spaces. Per the captured output, the script
# fetches the COCO 2017 annotation archive and IMDb-Face.csv.
# NOTE(review): presumably it also prepares the data folders under the root - confirm in get_annotations.sh.
!bash get_annotations.sh "{DATA_ROOT_FOLDER}"

--2023-03-10 17:14:54--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.216.97, 54.231.224.145, 52.217.85.68, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.216.97|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2023-03-10 17:15:04 (26.4 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]

Archive:  annotations_trainval2017.zip
  inflating: annotations/instances_train2017.json  
  inflating: annotations/instances_val2017.json  
  inflating: annotations/captions_train2017.json  
  inflating: annotations/captions_val2017.json  
  inflating: annotations/person_keypoints_train2017.json  
  inflating: annotations/person_keypoints_val2017.json  
--2023-03-10 17:15:08--  https://storage.googleapis.com/public_stuff/IMDb-Face.csv
Resolving stor

### Set the number of images (feel free to change the values below)

In [4]:
# Overall number of "person" images to collect; tune freely.
N_PERSON_TOTAL = 5500

# 30% of the person images are taken from COCO (truncated to an integer) ...
N_PERSON_COCO = int(0.3 * N_PERSON_TOTAL)

# ... and the rest is taken from IMDb-Face.
N_PERSON_IMDB = N_PERSON_TOTAL - N_PERSON_COCO

# Target for the "not person" class. Equal to the person total, i.e. a balanced
# dataset; change it if an unbalanced dataset is wanted.
N_NOT_PERSON_TOTAL = N_PERSON_TOTAL

In [5]:
# Build the COCO index from the val2017 instance annotations (path is relative
# to the notebook's working directory).
coco = COCO(annotation_file='annotations/instances_val2017.json')

# Resolve the "person" category, the images tagged with it, and finally the
# per-image records (the download cell below reads 'coco_url' and 'file_name' from them).
catIds = coco.getCatIds(catNms=['person'])
imgIds = coco.getImgIds(catIds=catIds)
img_infos = coco.loadImgs(ids=imgIds)


loading annotations into memory...
Done (t=0.29s)
creating index...
index created!


In [6]:
# Download up to N_PERSON_COCO COCO images that contain a person into
# "<DATA_ROOT_FOLDER>/data/person". A failed download is skipped (and counted)
# instead of aborting the whole cell; the next image takes its place.
person_dir = f"{DATA_ROOT_FOLDER}/data/person"
os.makedirs(person_dir, exist_ok=True)  # do not rely on the folder pre-existing

counter = 0   # images successfully written so far
n_failed = 0  # downloads that raised and were skipped
with tqdm(total=N_PERSON_COCO) as pbar:
    for img_info in img_infos:
        # Checked before downloading so that a target of 0 downloads nothing
        # (an equality check after the increment would never fire for 0).
        if counter >= N_PERSON_COCO:
            break
        img_url = img_info['coco_url']
        img_filename = f"{person_dir}/{img_info['file_name']}"
        try:
            urllib.request.urlretrieve(img_url, img_filename)
        except (OSError, http.client.HTTPException):
            # Covers HTTP/URL errors, timeouts and truncated responses.
            n_failed += 1
            if os.path.exists(img_filename):
                os.remove(img_filename)  # drop a possibly truncated file
            continue
        counter += 1
        pbar.update(1)

if n_failed or counter < N_PERSON_COCO:
    print(f"COCO person images: {counter}/{N_PERSON_COCO} downloaded, {n_failed} failed")

100%|██████████| 1650/1650 [17:14<00:00,  1.60it/s]


### Now get the remaining person images from the IMDb-Face dataset (the CSV file should already have been downloaded automatically)

In [7]:
from urllib.error import HTTPError
import random

# Fixed seed so the same CSV rows are sampled on every run.
seed_value = 1234
random.seed(seed_value)

# Only this fraction of the CSV rows is attempted, to avoid taking the same
# person too many times.
IMDB_SAMPLING_RATE = 0.2

os.makedirs(f"{DATA_ROOT_FOLDER}/data/person", exist_ok=True)

counter = 0    # images successfully written so far
n_skipped = 0  # sampled rows whose download failed
with open('IMDb-Face.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    with tqdm(total=N_PERSON_IMDB) as pbar:
        for row in reader:
            if counter >= N_PERSON_IMDB:
                break
            # One random draw per row, in the same order as before, so the
            # sampled rows are unchanged for a given seed.
            if random.random() >= IMDB_SAMPLING_RATE:
                continue
            img_filename = f"{DATA_ROOT_FOLDER}/data/person/IMDB_{counter}_{row['image']}"
            try:
                urllib.request.urlretrieve(row['url'], img_filename)
            except HTTPError:
                # Some images are no longer available; just skip them.
                n_skipped += 1
                continue
            except (OSError, ValueError, http.client.HTTPException):
                # Dead hosts, timeouts, resets, malformed URLs or truncated
                # responses must not abort the run.
                n_skipped += 1
                if os.path.exists(img_filename):
                    os.remove(img_filename)  # drop a possibly truncated file
                continue
            counter += 1
            pbar.update(1)

if counter < N_PERSON_IMDB:
    print(f"IMDb-Face: only {counter}/{N_PERSON_IMDB} images downloaded ({n_skipped} rows skipped)")


100%|██████████| 3850/3850 [18:16<00:00,  3.51it/s]  


In [8]:
# Download person-free COCO images, spread evenly over all non-person categories,
# into "<DATA_ROOT_FOLDER>/data/notperson".
person_cat_id = coco.getCatIds(catNms=['person'])[0]
catIds = coco.getCatIds()
catIds.remove(person_cat_id)
# Per-category quota: the total split over the categories, rounded up.
N = int(N_NOT_PERSON_TOTAL / len(catIds)) + 1

notperson_dir = f"{DATA_ROOT_FOLDER}/data/notperson"
os.makedirs(notperson_dir, exist_ok=True)  # do not rely on the folder pre-existing

# (category name, image ids) for every category except 'person'
imgIds = [
    (coco.loadCats(catId)[0]['name'], coco.getImgIds(catIds=[catId]))
    for catId in catIds
]

# An image can belong to several categories; remember what was already saved
# so the same picture does not end up in the dataset under several names.
seen_img_ids = set()
n_failed = 0

# download the images
with tqdm(total=N * len(catIds)) as pbar:
    for cat, cat_img_ids in imgIds:
        downloaded_in_category = 0
        for img_info in coco.loadImgs(cat_img_ids):
            if downloaded_in_category >= N:
                break
            img_id = img_info['id']
            if img_id in seen_img_ids:
                continue
            # Inspect the image's annotations to be sure no person is in it.
            anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
            if any(ann['category_id'] == person_cat_id for ann in anns):
                continue
            img_filename = f"{notperson_dir}/{cat}_{img_info['file_name']}"
            try:
                urllib.request.urlretrieve(img_info['coco_url'], img_filename)
            except Exception:
                # Best effort: a failed download is counted and skipped.
                n_failed += 1
                if os.path.exists(img_filename):
                    os.remove(img_filename)  # drop a possibly truncated file
                continue
            seen_img_ids.add(img_id)
            downloaded_in_category += 1
            pbar.update(1)  # progress counts successful downloads only

if n_failed:
    print(f"{n_failed} not-person downloads failed and were skipped")
print("DONE!")

 67%|██████▋   | 3722/5530 [36:42<17:49,  1.69it/s]  

DONE!





### We may still be missing some not-person images, so download the remainder from Unsplash

In [9]:
import requests

def download_images(n, max_attempts=None, timeout=30):
    """Yield the raw bytes of up to ``n`` distinct random Unsplash images.

    Failed requests, non-200 responses and images whose final URL was already
    seen are retried, so the generator keeps going until ``n`` images have been
    produced or ``max_attempts`` requests have been made.

    Args:
        n: Number of images wanted.
        max_attempts: Upper bound on HTTP requests; defaults to ``3 * n`` so the
            loop always terminates even if the service keeps failing.
        timeout: Per-request timeout in seconds, so one stalled connection
            cannot hang the notebook.

    Yields:
        bytes: The body of each accepted response.

    Note:
        Fewer than ``n`` items are yielded when the attempt budget runs out;
        callers that need an exact count should check how many they received.
    """
    # NOTE(review): the endpoint and parameters are kept as they were; confirm
    # that this Unsplash endpoint is still served and honours these parameters.
    url = "https://source.unsplash.com/random"

    # Set the request parameters
    params = {
        "orientation": "landscape",
        "content_filter": "high",
        "topics": ["nature", "food", "animals", "architecture", "travel", "art", "textures", "patterns"]
    }

    if max_attempts is None:
        max_attempts = 3 * n

    # Final (post-redirect) URLs already yielded, used to reject duplicates.
    downloaded_urls = set()
    n_yielded = 0

    for _ in range(max_attempts):
        if n_yielded >= n:
            break
        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException:
            # Network-level failure: spend the attempt and try again.
            continue
        # Accept only successful responses whose URL has not been seen before.
        if response.status_code != 200 or response.url in downloaded_urls:
            continue
        downloaded_urls.add(response.url)
        n_yielded += 1
        yield response.content



In [10]:
# Count the entries currently present in the "not person" folder; the next cell
# compares this against the target to decide whether more images are needed.
notperson_entries = os.listdir(f"{DATA_ROOT_FOLDER}/data/notperson/")
number_of_no_person_images_so_far = len(notperson_entries)

In [11]:
# Top up the "not person" folder with Unsplash images when the COCO download
# produced fewer than N_NOT_PERSON_TOTAL files.
if number_of_no_person_images_so_far < N_NOT_PERSON_TOTAL:
    print("still missing some images, recovering . . . ")
    number_to_download = N_NOT_PERSON_TOTAL - number_of_no_person_images_so_far
    notperson_dir = f"{DATA_ROOT_FOLDER}/data/notperson"
    counter = 0     # images written by this run
    next_index = 0  # candidate suffix for the next unsplash_<i>.jpg file
    with tqdm(total=number_to_download) as pbar:
        for image_bytes in download_images(number_to_download):
            # Never overwrite a file from an earlier run: advance to the first
            # unused index so re-running this cell adds images.
            while os.path.exists(f"{notperson_dir}/unsplash_{next_index}.jpg"):
                next_index += 1
            with open(f"{notperson_dir}/unsplash_{next_index}.jpg", "wb") as f:
                f.write(image_bytes)
            next_index += 1
            counter += 1
            pbar.update(1)
    # The generator may deliver fewer images than requested; say so explicitly.
    if counter < number_to_download:
        print(f"only {counter} of {number_to_download} missing images could be recovered")

still missing some images, recovering . . . 


 78%|███████▊  | 1392/1778 [18:10<05:02,  1.28it/s]


In [12]:
# Final status message. Nothing is verified here; it only marks the end of the notebook.
print("All done dataset ready for training")

All done dataset ready for training
