In [2]:
import fiftyone as fo
import fiftyone.zoo as foz

# Store zoo datasets under ./datasets (relative to the working directory).
fo.config.dataset_zoo_dir = "./datasets"

# Download (if necessary) and load fixed-size subsets of the COCO-2017
# train, validation and test splits.
train_ds = foz.load_zoo_dataset(
    "coco-2017",
    split="train",
    max_samples=10000,
    shuffle=True,
    seed=42,
)
val_ds = foz.load_zoo_dataset(
    "coco-2017",
    split="validation",
    max_samples=2000,
    shuffle=False,
)
test_ds = foz.load_zoo_dataset(
    "coco-2017",
    split="test",
    max_samples=2000,
    shuffle=False,
)

Downloading split 'train' to './datasets/coco-2017/train' if necessary
Found annotations at 'datasets/coco-2017/raw/instances_train2017.json'
Sufficient images already downloaded
Existing download of split 'train' is sufficient
Loading existing dataset 'coco-2017-train-10000'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use
Downloading split 'validation' to './datasets/coco-2017/validation' if necessary
Found annotations at 'datasets/coco-2017/raw/instances_val2017.json'
Sufficient images already downloaded
Existing download of split 'validation' is sufficient
Loading existing dataset 'coco-2017-validation-2000'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use
Downloading split 'test' to './datasets/coco-2017/test' if necessary
Found test info at 'datasets/coco-2017/raw/image_info_test2017.json'
1522 images found; downloading the remaining 478
 100% |██████████████████| 478/478 [23.2s el

In [None]:
#attaching captions to dataset samples, since fiftyone doesn't currently include them
import json
import os
from collections import defaultdict

def add_captions(dataset, captions_file):
    """Attach COCO captions to the samples of ``dataset`` as a ``captions`` field.

    Args:
        dataset: iterable of samples. Each sample must expose a ``filepath``
            attribute, support item assignment (``sample["captions"] = ...``)
            and provide a ``save()`` method.
        captions_file: path to a COCO captions JSON file containing
            ``"images"`` and ``"annotations"`` lists.

    Samples whose file name is not listed under ``"images"`` are left
    untouched. Listed images that have no annotations get an empty list.
    """
    # Read the captions file as UTF-8 explicitly instead of relying on the
    # platform's default text encoding.
    with open(captions_file, encoding="utf-8") as f:
        caps = json.load(f)

    # image_id -> [caption, ...]
    caps_by_id = defaultdict(list)
    for ann in caps["annotations"]:
        caps_by_id[ann["image_id"]].append(ann["caption"])

    # file_name -> image_id
    id_by_fname = {img["file_name"]: img["id"] for img in caps["images"]}

    for sample in dataset:
        # basename() follows the platform's path separator rules, whereas
        # split("/") only works for POSIX-style paths.
        fname = os.path.basename(sample.filepath)
        coco_id = id_by_fname.get(fname)
        if coco_id is not None:
            # .get() avoids inserting empty entries into the defaultdict
            sample["captions"] = caps_by_id.get(coco_id, [])
            sample.save()

# Attach captions to the train and validation subsets
# (test_ds is not processed here).
for _ds, _captions_path in (
    (train_ds, "datasets/coco-2017/raw/captions_train2017.json"),
    (val_ds, "datasets/coco-2017/raw/captions_val2017.json"),
):
    add_captions(_ds, _captions_path)


In [6]:
# Number of samples in each loaded split, in order: train, validation, test.
print(*(len(ds) for ds in (train_ds, val_ds, test_ds)))

10000 2000 2000


In [8]:
# View a few sample records from the validation split.
# take() selects samples at random; a fixed seed makes the same 5 samples
# appear on every run, so the preview is reproducible after a kernel restart.
for sample in val_ds.take(5, seed=42):
    print(sample.filepath)
    print(sample.ground_truth)  # object labels and bounding boxes
    print(sample.captions)  # captions
    print()

/Users/wxy/Desktop/Dev/DL/image caption/datasets/coco-2017/validation/data/000000007386.jpg
<Detections: {
    'detections': [
        <Detection: {
            'id': '6920275b7e816a9715b281c3',
            'attributes': {},
            'tags': [],
            'label': 'dog',
            'bounding_box': [0.30151666666666666, 0.70285, 0.064, 0.1137],
            'mask': None,
            'mask_path': None,
            'confidence': None,
            'index': None,
            'supercategory': 'animal',
            'iscrowd': 0,
        }>,
        <Detection: {
            'id': '6920275b7e816a9715b281c4',
            'attributes': {},
            'tags': [],
            'label': 'motorcycle',
            'bounding_box': [0.0852, 0.031400000000000004, 0.9148, 0.9686],
            'mask': None,
            'mask_path': None,
            'confidence': None,
            'index': None,
            'supercategory': 'vehicle',
            'iscrowd': 0,
        }>,
        <Detection: {
      