In [None]:
import json
import os
from pathlib import Path
from shutil import copyfile

from pycocotools.coco import COCO

In [None]:
# all_skip

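# Script to generate a tiny test dataset for quick testing
> Copies the first `num_test_imgs` COCO val2014 images and their caption annotations into `./tiny_data`.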

In [None]:
# Root of the local COCO download. Defaults to the original hard-coded
# location; set the COCO_ROOT environment variable to use a different copy.
coco_root = Path(os.environ.get('COCO_ROOT', '/root/data/coco'))
img_path = coco_root/'val2014'
anno_path = coco_root/'annotations'/'captions_val2014.json'

# COCO() parses the annotation file and builds its index, printing progress.
coco = COCO(anno_path)
# Dict-like view of the loaded file; indexed below by 'info', 'licenses',
# 'images' and 'annotations'.
coco_ds = coco.dataset

loading annotations into memory...
Done (t=0.31s)
creating index...
index created!


In [None]:
# Number of image records (taken from the front of coco_ds['images']) to keep
# in the tiny dataset.
num_test_imgs = 100

## Data Format

``` json
{
    "info": {...},
    "licenses": [...],
    "images": [...],
    "annotations": [...],
    "categories": [...], <-- Not in Captions annotations
    "segment_info": [...] <-- Only in Panoptic annotations
}
```

In [None]:
# Dataset-level metadata entry.
coco_ds['info']

{'contributor': 'COCO Consortium',
 'date_created': '2017/09/01',
 'description': 'COCO 2014 Dataset',
 'url': 'http://cocodataset.org',
 'version': '1.0',
 'year': 2014}

In [None]:
# First three license records.
coco_ds['licenses'][:3]

[{'id': 1,
  'name': 'Attribution-NonCommercial-ShareAlike License',
  'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/'},
 {'id': 2,
  'name': 'Attribution-NonCommercial License',
  'url': 'http://creativecommons.org/licenses/by-nc/2.0/'},
 {'id': 3,
  'name': 'Attribution-NonCommercial-NoDerivs License',
  'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/'}]

In [None]:
# First three image records; each carries the 'id' and 'file_name' used below.
coco_ds['images'][:3]

[{'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000391895.jpg',
  'date_captured': '2013-11-14 11:18:45',
  'file_name': 'COCO_val2014_000000391895.jpg',
  'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
  'height': 360,
  'id': 391895,
  'license': 3,
  'width': 640},
 {'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000522418.jpg',
  'date_captured': '2013-11-14 11:38:44',
  'file_name': 'COCO_val2014_000000522418.jpg',
  'flickr_url': 'http://farm1.staticflickr.com/1/127244861_ab0c0381e7_z.jpg',
  'height': 480,
  'id': 522418,
  'license': 4,
  'width': 640},
 {'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000184613.jpg',
  'date_captured': '2013-11-14 12:36:29',
  'file_name': 'COCO_val2014_000000184613.jpg',
  'flickr_url': 'http://farm3.staticflickr.com/2169/2118578392_1193aa04a0_z.jpg',
  'height': 336,
  'id': 184613,
  'license': 3,
  'width': 500}]

In [None]:
# First three caption records; 'image_id' links a caption to an image record.
coco_ds['annotations'][:3]

[{'caption': 'A bicycle replica with a clock as the front wheel.',
  'id': 37,
  'image_id': 203564},
 {'caption': 'A black Honda motorcycle parked in front of a garage.',
  'id': 38,
  'image_id': 179765},
 {'caption': 'A room with blue walls and a white sink and door.',
  'id': 49,
  'image_id': 322141}]

## Build Annotations

In [None]:
# The subset size is controlled by `num_test_imgs`, which is what the slicing
# below actually reads. Keep `num_images` only as an alias so the two values
# cannot drift apart.
num_images = num_test_imgs

In [None]:
# Carry the dataset-level metadata over unchanged, then keep only the first
# `num_test_imgs` image records.
test_coco = {key: coco_ds[key] for key in ('info', 'licenses')}
test_coco['images'] = coco_ds['images'][:num_test_imgs]

In [None]:
# Ids of the selected images, in the same order as test_coco['images'].
img_ids = [image_record['id'] for image_record in test_coco['images']]

In [None]:
# Peek at the first ten selected image ids.
img_ids[:10]

[391895, 522418, 184613, 318219, 554625, 397133, 574769, 60623, 309022, 5802]

In [None]:
# Keep only the captions whose image is part of the tiny subset.
# Membership is checked against a set: scanning the `img_ids` list once per
# annotation would cost O(len(annotations) * len(img_ids)).
selected_img_ids = set(img_ids)
annotations = [anno for anno in coco_ds['annotations'] if anno['image_id'] in selected_img_ids]

In [None]:
# First three captions that survived the filter.
annotations[:3]

[{'caption': 'A small closed toilet in a cramped space.',
  'id': 441,
  'image_id': 331352},
 {'caption': 'A tan toilet and sink combination in a small room.',
  'id': 540,
  'image_id': 331352},
 {'caption': 'This is an advanced toilet with a sink and control panel.',
  'id': 981,
  'image_id': 331352}]

In [None]:
# Attach the filtered captions to the tiny dataset dict.
test_coco['annotations'] = annotations

In [None]:
# Serialize the tiny dataset dict to JSON on disk.
annot_out_path = Path('./tiny_data/captions_tiny.json')
# On a fresh checkout ./tiny_data does not exist yet, and write_text raises
# FileNotFoundError when the parent directory is missing.
annot_out_path.parent.mkdir(parents=True, exist_ok=True)
# Last expression of the cell: write_text returns the number of characters written.
annot_out_path.write_text(json.dumps(test_coco))

83385

## Build images

In [None]:
# Destination folder for the copied images.
img_out_path = Path('./tiny_data/tiny_imgs')
# One (source, destination) pair per selected image, keyed by its file name.
img_files = [
    (img_path/file_name, img_out_path/file_name)
    for file_name in (img_obj['file_name'] for img_obj in test_coco['images'])
]

In [None]:
# Sanity check: number of (source, destination) pairs and the first three.
len(img_files), img_files[:3]

(100,
 [(PosixPath('/root/data/coco/val2014/COCO_val2014_000000391895.jpg'),
   PosixPath('test_data/test_imgs/COCO_val2014_000000391895.jpg')),
  (PosixPath('/root/data/coco/val2014/COCO_val2014_000000522418.jpg'),
   PosixPath('test_data/test_imgs/COCO_val2014_000000522418.jpg')),
  (PosixPath('/root/data/coco/val2014/COCO_val2014_000000184613.jpg'),
   PosixPath('test_data/test_imgs/COCO_val2014_000000184613.jpg'))])

In [None]:
# Copy every selected image into the tiny dataset folder.
# parents=True also creates ./tiny_data when it is missing; without it mkdir
# raises FileNotFoundError on a fresh checkout.
img_out_path.mkdir(parents=True, exist_ok=True)
for src, dest in img_files:
    copyfile(src, dest)