In [8]:
import json 
import os
from collections import defaultdict
import numpy as np 
import random 
# np.random.seed(42)

In [2]:
#######################################################################################
############################# CHANGE FILENAMES HERE ###################################
#######################################################################################

# Location of the full (unsplit) annotation file; edit these two values to
# point the notebook at a different dataset.
annotations_dir = "../annotations"
annotations_filename = "annotations_full.json"
annotations_filepath = os.path.join(annotations_dir, annotations_filename)

# Read inside a context manager so the file handle is closed deterministically
# instead of being left open until the garbage collector gets to it.
with open(annotations_filepath) as annotations_file:
    annotations = json.load(annotations_file)

In [3]:
# Show which top-level sections the annotation file provides.
print(annotations.keys())

# Pull out the sections that the rest of the notebook works with.
(
    categories,
    images,
    det_annotations,
    oracle_annotations,
    segmentation_images,
) = (
    annotations[section]
    for section in (
        "categories",
        "images",
        "annotations",
        "input_oracle_annotations",
        "segmentation_images",
    )
)

dict_keys(['categories', 'images', 'segmentation_images', 'input_oracle_annotations', 'annotations', 'phrase_annotations'])


In [4]:
# Index both annotation lists by the image they belong to, so that all
# annotations of an image can be fetched with one lookup on its id.
det_anns_map, oracle_anns_map = defaultdict(list), defaultdict(list)

for anns_by_image, anns in (
    (det_anns_map, det_annotations),
    (oracle_anns_map, oracle_annotations),
):
    for ann in anns:
        anns_by_image[ann["image_id"]].append(ann)

In [5]:
# One output container per split. Each gets its own empty lists, while all
# three reference the same `categories` object taken from the source file.
train_annotations, val_annotations, test_annotations = (
    {
        "categories": categories,
        "images": [],
        "annotations": [],
        "input_oracle_annotations": [],
        "segmentation_images": [],
    }
    for _ in range(3)
)

In [6]:
# Stratified train/val/test split.
#
# The code treats `images` as consecutive blocks of n_samples_per_determiner
# entries, one block per determiner, and pairs segmentation_images[k] with
# images[k] purely by position.
# NOTE(review): both layout assumptions come from how the annotation file was
# generated -- confirm against the generator.
# Every block is shuffled and cut with the same fractions, so each determiner
# contributes the same number of samples to each split.
n_determiners = 25
n_samples_per_determiner = 10000

train = 0.7
val = 0.1
test = 0.2

# Fixed seed so the split is identical on every "Restart & Run All".
split_seed = 42
rng = np.random.default_rng(split_seed)

# Fail before touching the outputs instead of hitting an IndexError halfway
# through, which would leave the split dictionaries partially filled.
n_required = n_determiners * n_samples_per_determiner
if len(images) < n_required or len(segmentation_images) < n_required:
    raise ValueError(
        f"expected at least {n_required} images and segmentation images, "
        f"got {len(images)} and {len(segmentation_images)}"
    )

# int() truncates; whatever is left after train and val goes to test.
n_train = int(train * n_samples_per_determiner)
n_val = int(val * n_samples_per_determiner)
n_test = n_samples_per_determiner - n_train - n_val

# Start from empty lists so that re-running this cell does not append every
# sample a second time.
for split_annotations in (train_annotations, val_annotations, test_annotations):
    for key in ("images", "annotations", "input_oracle_annotations", "segmentation_images"):
        split_annotations[key] = []

for i in range(n_determiners):
    # Indices of the i-th block, in random order.
    idxs = list(range(i * n_samples_per_determiner, (i + 1) * n_samples_per_determiner))
    rng.shuffle(idxs)

    train_idxs = idxs[:n_train]
    val_idxs = idxs[n_train:n_train + n_val]
    test_idxs = idxs[n_train + n_val:]

    for split_annotations, split_idxs in (
        (train_annotations, train_idxs),
        (val_annotations, val_idxs),
        (test_annotations, test_idxs),
    ):
        for idx in split_idxs:
            image = images[idx]
            segmentation_image = segmentation_images[idx]
            split_annotations["images"].append(image)
            # Carry over every annotation that references this image.
            split_annotations["annotations"].extend(det_anns_map[image["id"]])
            split_annotations["input_oracle_annotations"].extend(oracle_anns_map[image["id"]])
            split_annotations["segmentation_images"].append(segmentation_image)



9999
19999
29999
39999
49999
59999
69999
79999
89999
99999
109999
119999
129999
139999
149999
159999
169999
179999
189999
199999
209999
219999
229999
239999
249999


In [7]:
# Report how many images ended up in each split.
for label, split_annotations in (
    ("Length of train annotations: ", train_annotations),
    ("Length of val annotations: ", val_annotations),
    ("Length of test annotations: ", test_annotations),
):
    print(label, len(split_annotations["images"]))

Length of train annotations:  175000
Length of val annotations:  25000
Length of test annotations:  50000


In [8]:
# Sanity check: count the training images per leading caption word (the text
# before the first space), to verify that the train split is balanced.
det_counts = defaultdict(int)

# The loop variable is deliberately singular: calling it `images` would rebind
# the notebook-level `images` list to a single image dict, and re-running the
# split cell afterwards would fail.
for image in train_annotations["images"]:
    caption = image["caption"]
    det = caption.split(" ")[0]
    det_counts[det] += 1

print(det_counts)


defaultdict(<class 'int'>, {'a': 7000, 'an': 7000, 'all': 7000, 'any': 7000, 'every': 7000, 'my': 7000, 'your': 7000, 'this': 7000, 'that': 7000, 'these': 7000, 'those': 7000, 'some': 7000, 'many': 7000, 'few': 7000, 'both': 7000, 'neither': 7000, 'little': 7000, 'much': 7000, 'either': 7000, 'our': 7000, 'no': 7000, 'the': 7000, 'half': 7000, 'several': 7000, 'each': 7000})


In [9]:
# Save the three splits next to the source annotation file.
# Each file is written inside a context manager so that it is flushed and
# closed before the next cell (or another process) reads it back.

train_annotations_filename = "annotations_train.json"
train_annotations_filepath = os.path.join(annotations_dir, train_annotations_filename)
with open(train_annotations_filepath, "w") as train_file:
    json.dump(train_annotations, train_file)

val_annotations_filename = "annotations_val.json"
val_annotations_filepath = os.path.join(annotations_dir, val_annotations_filename)
with open(val_annotations_filepath, "w") as val_file:
    json.dump(val_annotations, val_file)

test_annotations_filename = "annotations_test.json"
test_annotations_filepath = os.path.join(annotations_dir, test_annotations_filename)
with open(test_annotations_filepath, "w") as test_file:
    json.dump(test_annotations, test_file)

Split the training set into per-determiner subsets of 0.1, 0.25, 1, 2.5, 5, 10, 25 and 50% of the samples

In [9]:
# Reload the training split; the cells below cut it into smaller subsets.
# NOTE(review): this reads from "../../annotations/old", which is not the
# directory the train/val/test files were written to above ("../annotations")
# -- confirm that this is the intended input.
annotations_filename = "annotations_train.json"
annotations_dir = "../../annotations/old"
save_dir = os.path.join(annotations_dir, "splits")
annotations_filepath = os.path.join(annotations_dir, annotations_filename)

# Context manager so the file handle is closed as soon as parsing is done.
with open(annotations_filepath) as annotations_file:
    annotations = json.load(annotations_file)

n_determiners = 25
categories = annotations["categories"]
images = annotations["images"]
det_annotations = annotations["annotations"]
oracle_annotations = annotations["input_oracle_annotations"]
segmentation_images = annotations["segmentation_images"]


In [12]:
# Rebuild the image-id lookups for the annotations that were just loaded.
det_anns_map, oracle_anns_map = defaultdict(list), defaultdict(list)

for lookup, source_anns in (
    (det_anns_map, det_annotations),
    (oracle_anns_map, oracle_annotations),
):
    for ann in source_anns:
        lookup[ann["image_id"]].append(ann)

In [10]:
# Sanity check: number of images in the loaded training split.
print(len(images))

175000


In [20]:
# Write class-balanced subsets of the training set, one JSON file per size.
#
# `images` is treated as n_determiners consecutive blocks of equal length.
# NOTE(review): this relies on the training file keeping its images grouped
# by determiner, with segmentation_images in the same order -- confirm.
n_samples_per_determiner = len(images)//n_determiners

# Subset sizes, in percent of the samples available per determiner.
splits = [0.1, 0.25, 1, 2.5, 5, 10, 25, 50]

# Fixed seed so the generated subsets are identical on every run.
subset_seed = 42
rng = np.random.default_rng(subset_seed)

# The dump below fails if the output directory does not exist yet.
os.makedirs(save_dir, exist_ok=True)

for split in splits:
    split_annotation = {
        "categories": categories,
        "images": [],
        "annotations": [],
        "input_oracle_annotations": [],
        "segmentation_images": []
    }

    # Same for every determiner, so computed once per subset size.
    # int() truncates fractional sample counts (e.g. 17.5 -> 17).
    n_samples = int(split/100 * n_samples_per_determiner)

    for i in range(n_determiners):
        # Each subset size reshuffles the block on its own, so a smaller
        # subset is not necessarily contained in a larger one.
        idxs = list(range(i*n_samples_per_determiner, (i+1)*n_samples_per_determiner))
        rng.shuffle(idxs)

        idxs = idxs[:n_samples]

        for idx in idxs:
            image = images[idx]
            segmentation_image = segmentation_images[idx]
            split_annotation["images"].append(image)
            split_annotation["annotations"].extend(det_anns_map[image["id"]])
            split_annotation["input_oracle_annotations"].extend(oracle_anns_map[image["id"]])
            split_annotation["segmentation_images"].append(segmentation_image)

    # Build the tag outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    split_tag = str(split).replace(".", "point")
    split_annotation_filename = f"train_{split_tag}.json"
    split_annotation_filepath = os.path.join(save_dir, split_annotation_filename)
    # Context manager so the file is flushed and closed before it is read back.
    with open(split_annotation_filepath, "w") as split_file:
        json.dump(split_annotation, split_file)



In [17]:
# Sanity check: reload the 1% subset and count its images.
# The subset cell names its files "train_<percent>.json" (str(1) -> "train_1.json");
# the older "1_train.json" name is not written by the code in this notebook.
with open(os.path.join(save_dir, "train_1.json")) as subset_file:
    annotations = json.load(subset_file)
print(len(annotations["images"]))

1750
