In [1]:
from copy import deepcopy
from utils import convert2coco, dataset_split, dataset_analysis
from utils import coco_to_img2annots, img2annots_to_coco, save_json_file
from utils import check_image_id_duplication, check_annotation_id_duplication

In [2]:
def split_by_num_objects(annotations, num_objects_split=120):
    """Partition a COCO annotation dict into two dicts by per-image object count.

    The input is regrouped per image with ``coco_to_img2annots``. For every
    image, the values of its ``'num_objects'`` mapping are summed; images whose
    total is greater than or equal to ``num_objects_split`` go to the first
    group, all remaining images go to the second group. Both groups are
    converted back with ``img2annots_to_coco``.

    Args:
        annotations: Annotation structure accepted by ``coco_to_img2annots``.
        num_objects_split: Threshold on the per-image object total (inclusive
            for the first group).

    Returns:
        A 2-tuple ``(at_or_above_threshold, below_threshold)`` of the values
        returned by ``img2annots_to_coco`` for each group.
    """
    grouped = coco_to_img2annots(annotations)

    # The first container reuses the 'type' / 'categories' objects of the
    # regrouped input; the second starts as an independent deep copy of it.
    crowded = {
        'type': grouped['type'],
        'categories': grouped['categories'],
        'img2annots': {},
    }
    sparse = deepcopy(crowded)

    for image_key, record in grouped['img2annots'].items():
        # presumably keyed by category with per-category counts -- TODO confirm
        total_objects = sum(record['num_objects'].values())
        bucket = crowded if total_objects >= num_objects_split else sparse
        bucket['img2annots'][image_key] = record

    crowded_coco = img2annots_to_coco(crowded)
    sparse_coco = img2annots_to_coco(sparse)
    return crowded_coco, sparse_coco

def concatenate_2_coco_annotations(annotations_1, annotations_2):
    """Merge two COCO-style annotation dicts into a new, independent dict.

    The result is a deep copy of ``annotations_1`` whose ``'images'`` and
    ``'annotations'`` lists are extended with deep copies of the corresponding
    entries of ``annotations_2``. Neither input is modified, and the result
    shares no mutable objects with either input.

    Args:
        annotations_1: Dict with at least ``'images'`` and ``'annotations'``
            lists; every other key is carried over from this argument.
        annotations_2: Dict with ``'images'`` and ``'annotations'`` lists to
            append.

    Returns:
        The merged annotation dict.
    """
    ########## IMPORTANT ##########
    # Assumption: inputs have exactly the same type and categories
    # (this is not verified here; only annotations_1's values are kept).

    merged = deepcopy(annotations_1)
    for key in ('images', 'annotations'):
        # Copy the appended entries as well, so that later edits to the merged
        # result cannot leak back into annotations_2.
        merged[key].extend(deepcopy(annotations_2[key]))

    return merged

In [3]:
# Directory holding the original data, relative to the notebook's working directory.
base = './data/ori'
# Build the annotation structure from that directory; the result is the input
# to split_by_num_objects in the next cell.
annotations = convert2coco(base)

In [4]:
# Separate the images into two groups using the default threshold (120 objects):
# split_1 holds images with >= 120 objects, split_2 holds the rest.
annotations_split_1, annotations_split_2 = split_by_num_objects(annotations)

In [5]:
# Target fraction for each subset; the three values sum to 1.0.
split_dictionary = {
    'train': 0.60,
    'val': 0.20,
    'test': 0.20
}

# Split each object-count group independently with the same target fractions.
# The results are indexed by subset name ('train' / 'val' / 'test') further down.
# NOTE(review): the third argument (20000) is presumably the number of search
# iterations, given the "The best error" log lines this cell prints -- confirm
# against utils.dataset_split. No random seed is set in this notebook, so the
# resulting split may differ between runs -- TODO confirm.
ann_split_1 = dataset_split(annotations_split_1, split_dictionary, 20000)
ann_split_2 = dataset_split(annotations_split_2, split_dictionary, 20000)

The best error: 0.0011583955693151798
The best error: 0.0007074338279419634
The best error: 0.0005309330031942209
The best error: 0.0003345984785767271
The best error: 0.00015668930596868922
The best error: 4.170811961990254e-05
The best error: 4.047713685417575e-05
The best error: 3.370437759060894e-05
The best error: 2.7202927936513847e-05
The best error: 2.2908706461815647e-05
The best error: 0.0030899645694929135
The best error: 0.00299280673003211
The best error: 0.0012331221015432624
The best error: 0.0004846979143656389
The best error: 0.0004468568113179473
The best error: 0.0003680497345411491
The best error: 0.0003520994616082913
The best error: 0.00034519353350321035
The best error: 0.0002561975663368075
The best error: 8.874325999029039e-05
The best error: 4.221448566508577e-05
The best error: 3.407378795580049e-05


In [6]:
# Report image/object counts and per-subset ratios for the first group
# (images with at least the threshold number of objects).
results_split_1 = dataset_analysis(ann_split_1)

-----------------------------------
num_images          89
num_objects      16228
-----------------------------------
num_images on each set

train          54    0.607
val            18    0.202
test           17    0.191
-----------------------------------
num_objects on each set

train        9747    0.601
val          3223    0.199
test         3258    0.201
-----------------------------------
Category: 1

train        4482    0.603
val          1468    0.198
test         1481    0.199
-----------------------------------
Category: 2

train        5265    0.598
val          1755    0.199
test         1777    0.202
-----------------------------------


In [7]:
# Report image/object counts and per-subset ratios for the second group
# (images below the threshold number of objects).
results_split_2 = dataset_analysis(ann_split_2)

-----------------------------------
num_images          91
num_objects       6911
-----------------------------------
num_images on each set

train          55    0.604
val            19    0.209
test           17    0.187
-----------------------------------
num_objects on each set

train        4166    0.603
val          1380    0.200
test         1365    0.198
-----------------------------------
Category: 1

train        2361    0.604
val           779    0.199
test          767    0.196
-----------------------------------
Category: 2

train        1805    0.601
val           601    0.200
test          598    0.199
-----------------------------------


In [8]:
# Recombine the two object-count groups subset by subset, so that each final
# train/val/test set contains images from both groups.
anns = {}
for set_name in ['train', 'val', 'test']:
    anns[set_name] = concatenate_2_coco_annotations(ann_split_1[set_name], ann_split_2[set_name])
# Print the same summary for the merged subsets.
results = dataset_analysis(anns)

-----------------------------------
num_images         180
num_objects      23139
-----------------------------------
num_images on each set

train         109    0.606
val            37    0.206
test           34    0.189
-----------------------------------
num_objects on each set

train       13913    0.601
val          4603    0.199
test         4623    0.200
-----------------------------------
Category: 1

train        6843    0.604
val          2247    0.198
test         2248    0.198
-----------------------------------
Category: 2

train        7070    0.599
val          2356    0.200
test         2375    0.201
-----------------------------------


In [9]:
# Save and check duplication
# For every subset: write its annotations to data/instances_<set_name>.json,
# then print the values returned by the image-id and annotation-id
# duplication checks for that subset.
# NOTE(review): each check only receives the entries of a single subset, so
# ids shared between different subsets would not be reported here -- confirm
# whether a cross-subset check is needed.
print('-'*40)
for key, val in anns.items():
    save_json_file(val, f'data/instances_{key}.json')
    print('set_name:', key,'\n')
    print('image_id_duplication', ' '*5, check_image_id_duplication(val['images']))
    print('annotation_id_duplication', '', check_annotation_id_duplication(val['annotations']))
    print('-'*40)

----------------------------------------
set_name: train 

image_id_duplication       False
annotation_id_duplication  False
----------------------------------------
set_name: val 

image_id_duplication       False
annotation_id_duplication  False
----------------------------------------
set_name: test 

image_id_duplication       False
annotation_id_duplication  False
----------------------------------------
