In [None]:
from copy import deepcopy
from utils import convert2coco, dataset_split, dataset_analysis
from utils import coco_to_img2annots, img2annots_to_coco, save_json_file
from utils import check_image_id_duplication, check_annotation_id_duplication

In [None]:
def split_by_num_objects(annotations, num_objects_split=120):
    """Partition annotations into two sets by the object count of each image.

    The input is regrouped per image with ``coco_to_img2annots``. For every
    image the values of its ``'num_objects'`` mapping are added up and the
    total is compared against ``num_objects_split``.

    Args:
        annotations: Annotations in the form accepted by ``coco_to_img2annots``.
        num_objects_split: Threshold on the total number of objects of an image.

    Returns:
        A 2-tuple of annotations converted back with ``img2annots_to_coco``:
        first the images whose total is at least ``num_objects_split``,
        then all remaining images.
    """
    grouped = coco_to_img2annots(annotations)

    # Both buckets carry the same metadata; the second one holds its own
    # deep copy of it, the first one references the grouped data directly.
    at_or_above = {
        'type': grouped['type'],
        'categories': grouped['categories'],
        'img2annots': {},
    }
    below = deepcopy(at_or_above)

    for image_key, image_entry in grouped['img2annots'].items():
        total = sum(image_entry['num_objects'].values())
        bucket = at_or_above if total >= num_objects_split else below
        bucket['img2annots'][image_key] = image_entry

    return img2annots_to_coco(at_or_above), img2annots_to_coco(below)

def concatenate_2_coco_annotations(annotations_1, annotations_2):
    """Merge two annotation dicts into a new, fully independent one.

    IMPORTANT: the inputs are assumed to have exactly the same type and
    categories. This is not verified; every key other than ``'images'`` and
    ``'annotations'`` is taken from ``annotations_1`` only. Ids are kept as
    they are, so the caller is responsible for checking for duplicates.

    Args:
        annotations_1: Dict with ``'images'`` and ``'annotations'`` lists; it
            provides the base of the result.
        annotations_2: Dict whose ``'images'`` and ``'annotations'`` entries
            are appended after those of ``annotations_1``.

    Returns:
        A new dict. Neither input is modified, and the result shares no
        entries with either input.
    """
    merged = deepcopy(annotations_1)
    for key in ('images', 'annotations'):
        # Copy the appended entries as well: appending them by reference would
        # make later edits of the merged result leak back into annotations_2.
        merged[key].extend(deepcopy(annotations_2[key]))

    return merged

In [3]:
# Directory handed to convert2coco as the data source.
base = './data/ori'
# Build the annotations for the whole dataset.
# NOTE(review): presumably COCO format, judging by the helper's name — confirm in utils.
annotations = convert2coco(base)

In [4]:
# Separate the images into two groups with the default threshold (120):
# split_1 gets images with at least 120 objects, split_2 gets the rest.
annotations_split_1, annotations_split_2 = split_by_num_objects(annotations)

In [5]:
# Fraction requested for each subset; the same ratios are used for both groups.
split_dictionary = dict(train=0.60, val=0.20, test=0.20)

# Split each group on its own, with identical settings.
# NOTE(review): 10000 is presumably the number of search iterations of
# dataset_split (it prints "The best error" lines) — confirm in utils.
ann_split_1 = dataset_split(annotations_split_1, split_dictionary, 10000)
ann_split_2 = dataset_split(annotations_split_2, split_dictionary, 10000)

The best error: 0.0025285820459238096
The best error: 0.001728383055635808
The best error: 0.0008509389410633236
The best error: 0.0007904229430527673
The best error: 0.0006755372506618615
The best error: 0.0005724400321431754
The best error: 0.00024343343452798446
The best error: 0.00010672960672460979
The best error: 8.630289129719244e-05
The best error: 7.303264618028857e-05
The best error: 6.406480665843996e-05
The best error: 2.91151516095374e-05
The best error: 1.7847396297138352e-05
The best error: 1.6065037539174714e-05
The best error: 0.0026950399164626672
The best error: 0.002634227667185885
The best error: 0.0020762380401282356
The best error: 0.0008450298707356742
The best error: 0.0006081089980456688
The best error: 0.0005525958410105324
The best error: 0.00044268733609703734
The best error: 0.00040963313325720634
The best error: 0.00016948587714615793
The best error: 0.00012890769545038813
The best error: 0.0001243975780364595
The best error: 0.00010902651931936703
The be

In [6]:
# Print the per-set image/object statistics for the first group
# (images with at least 120 objects) and keep the returned results.
results_split_1 = dataset_analysis(ann_split_1)

-----------------------------------
num_images          89
num_objects      16228
-----------------------------------
num_images on each set

train          54    0.607
val            18    0.202
test           17    0.191
-----------------------------------
num_objects on each set

train        9744    0.600
val          3273    0.202
test         3211    0.198
-----------------------------------
Category: 1

train        4457    0.600
val          1500    0.202
test         1474    0.198
-----------------------------------
Category: 2

train        5287    0.601
val          1773    0.202
test         1737    0.197
-----------------------------------


In [7]:
# Print the per-set image/object statistics for the second group
# (images with fewer than 120 objects) and keep the returned results.
results_split_2 = dataset_analysis(ann_split_2)

-----------------------------------
num_images          91
num_objects       6911
-----------------------------------
num_images on each set

train          55    0.604
val            19    0.209
test           17    0.187
-----------------------------------
num_objects on each set

train        4147    0.600
val          1380    0.200
test         1384    0.200
-----------------------------------
Category: 1

train        2343    0.600
val           776    0.199
test          788    0.202
-----------------------------------
Category: 2

train        1804    0.601
val           604    0.201
test          596    0.198
-----------------------------------


In [8]:
# Merge the two groups set by set, so that every final train/val/test set
# contains images from both groups.
anns = {
    set_name: concatenate_2_coco_annotations(ann_split_1[set_name], ann_split_2[set_name])
    for set_name in ('train', 'val', 'test')
}
# Statistics of the merged sets.
results = dataset_analysis(anns)

-----------------------------------
num_images         180
num_objects      23139
-----------------------------------
num_images on each set

train         109    0.606
val            37    0.206
test           34    0.189
-----------------------------------
num_objects on each set

train       13891    0.600
val          4653    0.201
test         4595    0.199
-----------------------------------
Category: 1

train        6800    0.600
val          2276    0.201
test         2262    0.200
-----------------------------------
Category: 2

train        7091    0.601
val          2377    0.201
test         2333    0.198
-----------------------------------


In [9]:
# Write every merged set to data/instances_<set_name>.json, then print the
# results of the image-id and annotation-id duplication checks for that set.
print('-'*40)
for key, val in anns.items():
    save_json_file(val, f'data/instances_{key}.json')
    print('set_name:', key,'\n')
    # The filler arguments (' '*5 and '') pad both labels to the same width,
    # so the two check results line up in one column.
    print('image_id_duplication', ' '*5, check_image_id_duplication(val['images']))
    print('annotation_id_duplication', '', check_annotation_id_duplication(val['annotations']))
    print('-'*40)

----------------------------------------
set_name: train 

image_id_duplication       False
annotation_id_duplication  False
----------------------------------------
set_name: val 

image_id_duplication       False
annotation_id_duplication  False
----------------------------------------
set_name: test 

image_id_duplication       False
annotation_id_duplication  False
----------------------------------------
