In [2]:
import json
import numpy as np
import os
import random
import copy


# Path to the COCO train2017 instance annotations (a JSON file loaded with read_file).
COCO_PATH = "data/coco/annotations/instances_train2017.json"
# Directory of the per-group annotation files; file names are appended with os.path.join.
PART_PATH = "data/coco/parts/annotations/"


In [3]:
def read_file(path):
    """Parse the JSON file at ``path`` and return the decoded object."""
    with open(path, "r") as handle:
        return json.load(handle)

def write_to_file(path, info):
    """Serialize ``info`` as JSON into ``path``, replacing any existing content."""
    with open(path, mode="w") as out:
        json.dump(info, out)


def img_for_cat(annotations, cats):
    """Return the distinct image ids that contain at least one instance of ``cats``.

    Args:
        annotations: COCO-style dict; only its "annotations" list is read.
        cats: iterable of hashable category ids to select.

    Returns:
        list of unique ``image_id`` values (order is whatever the set yields).
    """
    # A set makes each membership test O(1) instead of scanning the category list.
    wanted = set(cats)
    return list({ins["image_id"] for ins in annotations["annotations"] if ins["category_id"] in wanted})


def get_instances_per_class(instances):
    """Count instances per category id.

    Returns a dict ``{category_id: count}``; keys appear in order of first occurrence.
    """
    counts = {}
    for inst in instances:
        key = inst["category_id"]
        counts[key] = counts.get(key, 0) + 1
    return counts

def image_ids_by_instances(annotations, instances):
    """Return the image records referenced by ``instances``.

    Despite the name, the result holds the full image dicts from
    ``annotations["images"]`` (in their original order), not bare ids.

    Args:
        annotations: COCO-style dict; its "images" list is read.
        instances: iterable of instance dicts, each with an "image_id".
    """
    # Deduplicate into a set so the filter below is linear instead of
    # scanning one list entry per instance for every image.
    wanted_ids = {instance["image_id"] for instance in instances}
    return [image for image in annotations["images"] if image["id"] in wanted_ids]

def instances_by_images(annotations, images):
    """Return the instances of ``annotations`` that belong to the given images.

    Args:
        annotations: COCO-style dict; its "annotations" list is read.
        images: iterable of image ids, or of image dicts carrying an "id" key
            (dicts are reduced to their id, so image records can be passed directly).

    Returns:
        list of instance dicts in their original order.
    """
    # Normalise to a set of ids: O(1) lookups, and image records match by id
    # instead of silently matching nothing.
    wanted_ids = {img["id"] if isinstance(img, dict) else img for img in images}
    return [instance for instance in annotations["annotations"] if instance["image_id"] in wanted_ids]

def len_intresection(list1, list2):
    """Return how many distinct elements occur in both ``list1`` and ``list2``."""
    shared = set(list1) & set(list2)
    return len(shared)


def random_2_image_assign(imgs1, imgs2, three_set_intresection=set()):
    """Split the ids shared by ``imgs1`` and ``imgs2`` into two halves.

    Ids listed in ``three_set_intresection`` are excluded first. The remaining
    shared ids are cut at the midpoint; with an odd count the second half gets
    the extra element. No shuffling happens here: the order is simply the
    iteration order of the underlying set.

    Returns:
        tuple ``(first_half, second_half)`` of lists.
    """
    shared = set(imgs1) & set(imgs2)
    pair_only = list(shared - set(three_set_intresection))
    half = len(pair_only) // 2
    return pair_only[:half], pair_only[half:]


def random_image_assign(images):
    """Distribute the image ids of three overlapping groups into three disjoint lists.

    ``images`` is indexed at positions 0, 1 and 2, each holding a collection of
    image ids. Every id from the union lands in exactly one output list:

    * ids present in all three groups are cut into thirds by position
      (the last list takes the remainder);
    * ids present in exactly two groups are halved between those two groups
      via ``random_2_image_assign``;
    * ids present in a single group stay with that group.

    No shuffling is performed here; the order follows set iteration order.

    Returns:
        list of three lists of image ids.
    """
    common = list(set(images[0]) & set(images[1]) & set(images[2]))
    third = len(common) // 3
    assigned = [common[:third], common[third:2 * third], common[2 * third:]]

    # Ids shared by exactly two groups: one half goes to each of the two groups.
    for first, second in ((0, 1), (0, 2), (1, 2)):
        for_first, for_second = random_2_image_assign(images[first], images[second], common)
        assigned[first].extend(for_first)
        assigned[second].extend(for_second)

    # Ids that occur in one group only.
    for group_no in range(3):
        exclusive = set(images[group_no]) - set(images[(group_no + 1) % 3]) - set(images[(group_no + 2) % 3])
        assigned[group_no].extend(exclusive)
    return assigned


def get_instances_for_image_cat_groups(instances, cats_groups, imgs_groups):
    """Collect, for every group, the instances that match both its images and its categories.

    Args:
        instances: list of instance dicts (each with "image_id" and "category_id").
        cats_groups: per-group collections of category ids, indexed like ``imgs_groups``.
        imgs_groups: per-group collections of image ids.

    Returns:
        list with one list of instance dicts per entry of ``imgs_groups``.
    """
    idx_by_image = get_instances_idx_by_images(instances)
    annotations_groups = []
    for group_no, img_group in enumerate(imgs_groups):
        allowed_cats = cats_groups[group_no]
        annotations_groups.append([
            instances[pos]
            for img in img_group
            for pos in idx_by_image[img]
            if instances[pos]["category_id"] in allowed_cats
        ])
    return annotations_groups


def get_instances_idx_by_images(instances):
    """Map each image id to the positions (indices into ``instances``) of its instances."""
    positions = {}
    for pos, inst in enumerate(instances):
        positions.setdefault(inst["image_id"], []).append(pos)
    return positions


def get_categories_from_initial_annotations(annotations, cat_ids):
    """Return the category records of ``annotations`` whose "id" is in ``cat_ids``.

    The original order of ``annotations["categories"]`` is kept.
    """
    selected = []
    for category in annotations["categories"]:
        if category["id"] in cat_ids:
            selected.append(category)
    return selected


def create_group_dataset(annotations, instances_group, images_group_ids, cats_group_ids):
    """Build the COCO-style dataset dict for one group.

    Args:
        annotations: full COCO-style dict; "images" and "categories" are read.
        instances_group: instance dicts that belong to the group.
        images_group_ids: iterable of hashable image ids that belong to the group.
        cats_group_ids: category ids that belong to the group.

    Returns:
        dict with "annotations", "images" and "categories" restricted to the group.
    """
    # Set lookup keeps the image filter linear; membership in a long id list
    # made it quadratic in the number of images.
    wanted_image_ids = set(images_group_ids)
    images_group = [image for image in annotations["images"] if image["id"] in wanted_image_ids]
    categories_group = get_categories_from_initial_annotations(annotations, cats_group_ids)
    return create_dataset(instances_group, images_group, categories_group)

def create_dataset(instances_group, images_group, categories_group):
    """Bundle instances, images and categories into one COCO-style dict.

    The inputs are stored as-is (no copies) under the keys
    "annotations", "images" and "categories".
    """
    return {
        "annotations": instances_group,
        "images": images_group,
        "categories": categories_group,
    }

def get_cats_group(instances):
    """Split the category ids into three groups by instance count.

    Categories are ranked from most to fewest instances. The top category forms
    the third group on its own; the rest alternate between the second group
    (ranks 1, 3, 5, ...) and the first group (ranks 2, 4, 6, ...).

    Returns:
        ``[cat_group1, cat_group2, cat_group3]``, each a list of category ids.

    Raises:
        IndexError: if ``instances`` is empty (there is no top category).
    """
    counts = get_instances_per_class(instances)
    ascending = sorted(counts.items(), key=lambda item: item[1])
    # Reversing the ascending sort (rather than sorting descending) keeps the
    # same ordering among categories with equal counts.
    ranked_cats = [cat_id for cat_id, _ in ascending[::-1]]
    most_frequent = ranked_cats[0]
    return [ranked_cats[2::2], ranked_cats[1::2], [most_frequent]]


def dataset_from_cats(annotations, cats_groups, suffix="_train", save=False):
    """Build one COCO-style dataset per category group and optionally write it to disk.

    Images are assigned to groups with ``random_image_assign`` (which indexes
    exactly three groups), then each group keeps only the instances whose
    category belongs to it.

    Args:
        annotations: full COCO-style dict ("annotations", "images", "categories").
        cats_groups: three collections of category ids.
        suffix: file-name suffix used when saving, e.g. "_train" -> "0_train.json".
        save: when True, write each group to ``PART_PATH/<idx><suffix>.json``.
            Defaults to False, which matches the previous behaviour where the
            write call was commented out and nothing was persisted.

    Returns:
        None.
    """
    instances = annotations["annotations"]
    images = [img_for_cat(annotations, list(group)) for group in cats_groups]
    images_groups = random_image_assign(images)
    instances_groups = get_instances_for_image_cat_groups(instances, cats_groups, images_groups)

    for idx in range(len(images)):
        annotations_group = create_group_dataset(
            annotations, instances_groups[idx], images_groups[idx], cats_groups[idx]
        )
        if save:
            write_to_file(os.path.join(PART_PATH, str(idx) + suffix + ".json"), annotations_group)


In [None]:
# Load the annotations at COCO_PATH and derive the three category groups
# from the per-category instance counts.
annotations = read_file(COCO_PATH)
instances = annotations["annotations"]
cats_groups = get_cats_group(instances)
# Build the per-group datasets for this split.
dataset_from_cats(annotations, cats_groups, suffix="_train")

In [None]:
# Number of categories in each of the three groups.
list(map(len, cats_groups))

In [None]:
# Path to the COCO val2017 instance annotations.
COCO_PATH_VAL = "data/coco/annotations/instances_val2017.json"
annotations_val = read_file(COCO_PATH_VAL)
# Apply the category groups computed in the earlier cell (cats_groups) to this split.
dataset_from_cats(annotations_val, cats_groups, suffix = "_val")



In [None]:
# Re-read the annotations at COCO_PATH and show the total number of instance annotations.
# NOTE(review): the same file is already loaded into `annotations` by an earlier cell;
# this reload looks redundant — confirm before removing.
annotations = read_file(COCO_PATH)
len(annotations["annotations"])

In [None]:
# Image ids per category group. Recomputed here because the list built inside
# dataset_from_cats is local to that function and not visible to this cell.
images = [img_for_cat(annotations, list(group)) for group in cats_groups]
print("In 1,2,3 group theare are ", [len(img) for img in images ], "images")
print("In 1,2,3 group theare are ", [len(group) for group in cats_groups], "cats")
# Pairs are (0,1), (0,2), (1,2); the label previously named a non-existent pair (1,3).
print("Image intresection of (0,1), (0,2), (1,2)", [len_intresection(images[x], images[y]) for (x,y) in zip([0,0,1], [1,2,2])]) 
print("Image inresection of (0,1,2)", len((set(images[0]).intersection(set(images[1]))).intersection(set(images[2]))))

In [None]:
# Rebuild the per-group image and instance assignments for both splits. The
# lists computed inside dataset_from_cats are local to that function, so this
# cell derives them itself instead of relying on leftover kernel state.
images_groups = random_image_assign(
    [img_for_cat(annotations, list(group)) for group in cats_groups]
)
instances_groups_train = get_instances_for_image_cat_groups(
    annotations["annotations"], cats_groups, images_groups
)
images_groups_val = random_image_assign(
    [img_for_cat(annotations_val, list(group)) for group in cats_groups]
)
instances_groups_vals = get_instances_for_image_cat_groups(
    annotations_val["annotations"], cats_groups, images_groups_val
)
print("Final annnoations of files. Images ",  list(map(len, images_groups)))
print("Final annnoations of files train. Instances ",  list(map(len, instances_groups_train)))
print("Final annnoations of files val. Instances ",  list(map(len, instances_groups_vals)))

In [None]:
# Share of the original instances that survive the split into groups.
# The grouped instances are computed here because no earlier cell exposes them.
instances_groups = get_instances_for_image_cat_groups(
    instances,
    cats_groups,
    random_image_assign([img_for_cat(annotations, list(group)) for group in cats_groups]),
)
print("Total number of groups instances/ total initial number", sum(len(group) for group in instances_groups) / len(instances))

In [None]:
files = ["0.json", "0_train.json", "0_val.json"]
file = files[2]
# Load every listed file, keyed by file name. Previously only files[2] was
# read, so later lookups for the other names raised KeyError.
annotations = {}
for name in files:
    with open(os.path.join(PART_PATH, name), "r") as f:
        annotations[name] = json.load(f)

In [None]:
# Number of instance annotations in the third listed file (files[2]).
len(annotations[files[2]]["annotations"])

In [None]:
# Make sure every listed file is loaded before indexing it; otherwise files
# that were never read raise KeyError here.
for name in files:
    if name not in annotations:
        with open(os.path.join(PART_PATH, name), "r") as f:
            annotations[name] = json.load(f)
# Number of categories in each file.
[len(annotations[name]["categories"]) for name in files]

In [None]:
# Split a part file into 80% train / 20% val by image and write both halves
# next to the source file as "<name>_train.json" / "<name>_val.json".
files = os.listdir(PART_PATH)
for file in files:
    with open(os.path.join(PART_PATH, file), "r") as f:
        annotations = json.load(f)
    images = annotations["images"]
    # In-place shuffle; no seed is set, so the split differs between runs.
    random.shuffle(images)
    N = len(images)
    train_img, val_img = images[:int(0.8*N)], images[int(0.8*N):]
    # Select instances by image id (the image lists hold full image records).
    train_inst = instances_by_images(annotations, {image["id"] for image in train_img})
    val_inst = instances_by_images(annotations, {image["id"] for image in val_img})
    train_dataset = create_dataset(train_inst, train_img, annotations["categories"])
    val_dataset = create_dataset(val_inst, val_img, annotations["categories"])
    base_filename = os.path.join(PART_PATH, file).split(".json")[0]
    # The stray "+" that used to be inside these suffixes produced names like "0+_val.json".
    write_to_file(base_filename+"_val.json", val_dataset)
    write_to_file(base_filename+"_train.json", train_dataset)

    # NOTE(review): only the first directory entry is processed because of this
    # break — looks like a debugging leftover; confirm before removing.
    break

# Statistics of datasets

In [None]:
# NOTE(review): "_train.json" has no group index, unlike its neighbours —
# possibly meant to be "1_train.json"; confirm the intended file name.
files = ["0_train.json", "_train.json", "2_val.json"]
annotations = {}
# Load each listed file. The previous version read a single `file` variable
# left over from an earlier cell instead of iterating over `files`.
for file in files:
    with open(os.path.join(PART_PATH, file), "r") as f:
        annotations[file] = json.load(f)

# Create Val based on Parts

In [4]:
# NOTE(review): these assignments overwrite the COCO_PATH / PART_PATH values set at
# the top of the notebook (different relative prefix and a different parts directory);
# every cell executed afterwards sees the new values.
COCO_PATH = "../data/coco/annotations/instances_val2017.json"
PART_PATH = "../data/coco/intresected_parts/annotations/"
# Contents of the annotation file at the new COCO_PATH.
val = read_file(COCO_PATH)

In [5]:
# For every part file, keep the annotations in `val` whose category belongs to
# that part, and write the result as "coco<idx>_val.json".
files = ["0_train.json", "_train.json", "2_val.json"]
for idx, file in enumerate(files):
    annotations = read_file(os.path.join(PART_PATH, file))
    cats = annotations["categories"]
    # `cats` holds category records (dicts); compare against their ids, since an
    # integer category_id never equals a dict and nothing would be selected.
    cat_ids = {cat["id"] for cat in cats}
    ann_parts = [ann for ann in val["annotations"] if ann["category_id"] in cat_ids]
    write_to_file(os.path.join(PART_PATH, "coco"+str(idx)+"_val.json"), 
                  {"images": val["images"], 
                   "annotations": ann_parts,
                  "categories": cats})
    

NotADirectoryError: [Errno 20] Not a directory: '../data/coco/annotations/instances_val2017.json/0_train.json'