In [10]:
import csv
import os
import urllib
import urllib.error
import urllib.request

from pycocotools.coco import COCO
from tqdm import tqdm


In [11]:
# Root folder that receives the downloaded images (under <root>/data/person and
# <root>/data/notperson). Set the DATA_ROOT_FOLDER environment variable to use a
# different location without editing the notebook; the literal below is the
# fallback used when the variable is not set.
DATA_ROOT_FOLDER = os.environ.get("DATA_ROOT_FOLDER", "/media/edge7/TOSHIBA EXT/")

### Get the COCO annotation files and the IMDb-Face CSV (two well-known datasets)

In [12]:
# Run the helper shell script, passing DATA_ROOT_FOLDER as its first argument.
# According to the log captured below, it downloads and unzips
# annotations_trainval2017.zip (COCO) into ./annotations and then fetches IMDb-Face.csv.
!bash get_annotations.sh "{DATA_ROOT_FOLDER}"

--2023-03-10 13:07:49--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.27.112, 54.231.199.97, 52.216.245.244, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.27.112|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2023-03-10 13:07:58 (26.1 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]

Archive:  annotations_trainval2017.zip
  inflating: annotations/instances_train2017.json  
  inflating: annotations/instances_val2017.json  
  inflating: annotations/captions_train2017.json  
  inflating: annotations/captions_val2017.json  
  inflating: annotations/person_keypoints_train2017.json  
  inflating: annotations/person_keypoints_val2017.json  
--2023-03-10 13:08:03--  https://storage.googleapis.com/public_stuff/IMDb-Face.csv
Resolving storage.g

### Set the number of images (feel free to change the values below)

In [13]:
# Dataset size configuration: how many "person" and "not person" images to download.
N_PERSON_TOTAL = 10_000  # Modify this, as you want
# Share of the person images taken from COCO; the remainder comes from IMDb-Face.
COCO_PERSON_FRACTION = 0.3
N_PERSON_COCO = int(N_PERSON_TOTAL * COCO_PERSON_FRACTION)
N_PERSON_IMDB = N_PERSON_TOTAL - N_PERSON_COCO
N_NOT_PERSON_TOTAL = N_PERSON_TOTAL  # Modify this if you don't want a balanced dataset

In [14]:
# Load and index the COCO val2017 instance annotations.
# NOTE(review): the recorded run of the download cell below stopped at 2693 of the
# 3000 requested images, which suggests this split holds fewer person images than
# N_PERSON_COCO at the default settings -- confirm, and consider
# annotations/instances_train2017.json if more images are needed.
coco=COCO('annotations/instances_val2017.json')
# Category id(s) registered under the name 'person'.
catIds = coco.getCatIds(catNms=['person'])
# Ids of the images associated with that category.
imgIds = coco.getImgIds(catIds=catIds)
# Image records for those ids; the download cell reads 'coco_url' and 'file_name' from them.
img_infos = coco.loadImgs(imgIds)


loading annotations into memory...
Done (t=0.39s)
creating index...
index created!


In [15]:
# Download up to N_PERSON_COCO COCO images that contain at least one person
# into <DATA_ROOT_FOLDER>/data/person.
person_dir = f"{DATA_ROOT_FOLDER}/data/person"
os.makedirs(person_dir, exist_ok=True)  # urlretrieve does not create missing folders

counter = 0  # images successfully saved so far
failed = 0   # downloads skipped because of a network/HTTP error
with tqdm(total=N_PERSON_COCO) as pbar:
    for img_info in img_infos:
        # Check the quota *before* downloading so that N_PERSON_COCO == 0 fetches nothing
        # (a check after the increment would never match and would download everything).
        if counter >= N_PERSON_COCO:
            break
        img_url = img_info['coco_url']
        img_filename = f"{person_dir}/{img_info['file_name']}"
        try:
            urllib.request.urlretrieve(img_url, img_filename)
        except urllib.error.URLError as err:
            # URLError is the base class of HTTPError; one unavailable image
            # should not abort a download run that takes many minutes.
            failed += 1
            tqdm.write(f"Skipping {img_url}: {err}")
            continue
        counter += 1
        pbar.update(1)

# Make a shortfall explicit instead of leaving it to a half-filled progress bar.
if counter < N_PERSON_COCO:
    print(
        f"Only {counter} of the requested {N_PERSON_COCO} COCO person images were downloaded "
        f"({len(img_infos)} candidates available, {failed} failed)."
    )

 90%|████████▉ | 2693/3000 [26:47<03:03,  1.68it/s]


### Now get the remaining person images from the IMDb-Face dataset (its CSV file should already have been downloaded automatically)

In [16]:
from urllib.error import HTTPError, URLError
import random

# Fixed seed so the random row sampling below picks the same rows on every run.
seed_value = 1234
random.seed(seed_value)

# Download N_PERSON_IMDB face images listed in IMDb-Face.csv into <DATA_ROOT_FOLDER>/data/person.
person_dir = f"{DATA_ROOT_FOLDER}/data/person"
os.makedirs(person_dir, exist_ok=True)  # urlretrieve does not create missing folders

counter = 0  # images successfully saved; also embedded in each output file name
skipped = 0  # sampled rows whose image could not be fetched
with open('IMDb-Face.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    with tqdm(total=N_PERSON_IMDB) as pbar:
        for row in reader:
            if counter == N_PERSON_IMDB:
                break
            # Keep roughly one row in five, to avoid taking the same person too many times.
            if random.random() >= 0.2:
                continue
            img_filename = f"{person_dir}/IMDB_{counter}_{row['image']}"
            try:
                urllib.request.urlretrieve(row['url'], img_filename)
            except (HTTPError, URLError, ValueError):
                # Some images are not available any more: HTTPError for HTTP error
                # responses, URLError for unreachable hosts, ValueError for malformed
                # URLs. Skip the row and keep going.
                skipped += 1
                continue
            counter += 1
            pbar.update(1)

# Report what happened instead of swallowing the failures silently.
if skipped:
    print(f"{skipped} IMDb-Face images were unavailable and skipped.")
if counter < N_PERSON_IMDB:
    print(f"Only {counter} of the requested {N_PERSON_IMDB} IMDb-Face images were downloaded.")


100%|██████████| 7000/7000 [46:59<00:00,  2.48it/s]  


In [None]:
# Download person-free COCO images, spread evenly over every category except 'person',
# into <DATA_ROOT_FOLDER>/data/notperson.
person_cat_id = coco.getCatIds(catNms=['person'])[0]
catIds = coco.getCatIds()
catIds.remove(person_cat_id)
# Per-category quota; the +1 rounds up so that N * len(catIds) >= N_NOT_PERSON_TOTAL.
N = int(N_NOT_PERSON_TOTAL / len(catIds)) + 1

notperson_dir = f"{DATA_ROOT_FOLDER}/data/notperson"
os.makedirs(notperson_dir, exist_ok=True)  # urlretrieve does not create missing folders

# get image IDs for all categories except 'person', as (category name, image ids) pairs
imgIds = [
    (coco.loadCats(catId)[0]['name'], coco.getImgIds(catIds=[catId]))
    for catId in catIds
]

# download the images
failed = 0  # downloads that raised and were skipped
with tqdm(total=N * len(catIds)) as pbar:
    for cat, imgs in imgIds:
        imgs = coco.loadImgs(imgs)
        COUNTER_PER_CATEGORY = 0  # images saved for the current category
        for img_info in imgs:
            if COUNTER_PER_CATEGORY >= N:
                break
            # load the annotations for the image
            annIds = coco.getAnnIds(imgIds=img_info['id'])
            anns = coco.loadAnns(annIds)
            # be 101% sure no person is in the image: skip it if any annotation is a person
            if any(ann['category_id'] == person_cat_id for ann in anns):
                continue
            img_url = img_info['coco_url']
            img_filename = f"{notperson_dir}/{cat}_{img_info['file_name']}"
            try:
                urllib.request.urlretrieve(img_url, img_filename)
            except Exception as err:
                # Deliberately best-effort: a failed image must not abort the run,
                # but report it instead of swallowing the error silently.
                failed += 1
                tqdm.write(f"Skipping {img_url}: {err!r}")
                continue
            COUNTER_PER_CATEGORY += 1
            pbar.update(1)  # the bar counts saved images, not attempts

if failed:
    print(f"{failed} downloads failed and were skipped.")
print("DONE!")

 37%|███▋      | 3760/10033 [36:55<57:13,  1.83it/s]  

In [None]:
# Final cell: prints a confirmation message when it is executed.
print("All done dataset ready for training")