In [9]:
import pandas as pd
import numpy as np

In [10]:
import os

In [11]:
# Root directory of the dataset. The default is the original local location; set the
# DEFECT_DATASET_PATH environment variable to point the notebook somewhere else
# without editing this cell. A trailing '/' is stripped because
# os.path.basename('/a/b/') returns '' and would yield an empty dataset name.
dataset_path = os.environ.get('DEFECT_DATASET_PATH',
                              '/Users/tin/data/defect-rec-multi').rstrip('/')
# The last path component is used as the dataset name inside the annotation file names.
dataset_name = os.path.basename(dataset_path)
# Annotation file for the complete dataset.
anno_file = '{}/annotations/instances_{}.json'.format(dataset_path, dataset_name)

# Annotation files for the individual train / validate / test splits.
train_anno_file = '{}/annotations/instances_{}_train.json'.format(dataset_path, dataset_name)
val_anno_file = '{}/annotations/instances_{}_validate.json'.format(dataset_path, dataset_name)
test_anno_file = '{}/annotations/instances_{}_test.json'.format(dataset_path, dataset_name)

In [12]:
import json

In [13]:
# Parse the full annotation JSON into a dict. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it to the garbage collector.
with open(anno_file, 'r') as fi:
    data = json.load(fi)

In [14]:
# Show the top-level keys of the loaded annotation dict.
data.keys()

dict_keys(['categories', 'images', 'annotations'])

In [15]:
# Tabulate the 'images' and 'annotations' record lists as DataFrames.
images, annotations = (pd.DataFrame(data[section])
                       for section in ('images', 'annotations'))

In [16]:
# List the columns available for each image record.
images.columns

Index(['annotated', 'annotating', 'category_ids', 'coco_url', 'dataset_id',
       'date_captured', 'file_name', 'flickr_url', 'height', 'id', 'license',
       'metadata', 'path', 'width'],
      dtype='object')

In [17]:
# List the columns available for each annotation record.
annotations.columns

Index(['area', 'bbox', 'category_id', 'color', 'creator', 'dataset_id',
       'height', 'id', 'image_id', 'iscrowd', 'metadata', 'segmentation',
       'width'],
      dtype='object')

In [18]:
# Shuffle the image table once and cut it into train / validate / test partitions.
# A fixed seed makes the shuffle (and therefore the split) repeatable across runs.
SEED = 42
ratios = [0.7, 0.2, 0.1]
cumsum_ratios = np.cumsum(ratios)
# Row positions where the train and validate partitions end. The small epsilon
# compensates for float accumulation (0.7 + 0.2 == 0.8999999999999999), which
# would otherwise let int() truncate a boundary one row too early.
train_end, validate_end = (int(c * len(images) + 1e-9) for c in cumsum_ratios[:2])
shuffled_imgs = images.sample(frac=1, random_state=SEED)
# Positional slices replace np.split(), which on a DataFrame relies on the
# deprecated DataFrame.swapaxes.
train_imgs = shuffled_imgs.iloc[:train_end]
validate_imgs = shuffled_imgs.iloc[train_end:validate_end]
test_imgs = shuffled_imgs.iloc[validate_end:]

In [19]:
# Print the train partition's columns followed by its row count.
print(train_imgs.columns, len(train_imgs), sep='\n')

Index(['annotated', 'annotating', 'category_ids', 'coco_url', 'dataset_id',
       'date_captured', 'file_name', 'flickr_url', 'height', 'id', 'license',
       'metadata', 'path', 'width'],
      dtype='object')
1189


In [20]:
# Print the annotation table's columns followed by its row count.
print(annotations.columns, len(annotations), sep='\n')

Index(['area', 'bbox', 'category_id', 'color', 'creator', 'dataset_id',
       'height', 'id', 'image_id', 'iscrowd', 'metadata', 'segmentation',
       'width'],
      dtype='object')
2386


In [21]:
# (rows, columns) of the annotation table.
annotations.shape

(2386, 13)

In [22]:
# Assign each annotation to the partition whose image table contains its image_id.
annos_image_id = annotations['image_id']
train_annos = annotations.loc[annos_image_id.isin(train_imgs['id'])]
validate_annos = annotations.loc[annos_image_id.isin(validate_imgs['id'])]
test_annos = annotations.loc[annos_image_id.isin(test_imgs['id'])]

In [23]:
# Total annotation count across the three partitions, for comparison with len(annotations).
len(train_annos) + len(validate_annos) + len(test_annos)

2386

In [24]:
# Relative frequency of each category_id over all annotations.
annotations['category_id'].value_counts(normalize=True)

1    0.583822
5    0.196563
3    0.125733
2    0.093881
Name: category_id, dtype: float64

In [25]:
# Relative frequency of each category_id within the train annotations.
train_annos['category_id'].value_counts(normalize=True)

1    0.583790
5    0.198659
3    0.120658
2    0.096892
Name: category_id, dtype: float64

In [26]:
# Relative frequency of each category_id within the validate annotations.
validate_annos['category_id'].value_counts(normalize=True)

1    0.602794
5    0.191617
3    0.135729
2    0.069860
Name: category_id, dtype: float64

In [27]:
# Relative frequency of each category_id within the test annotations.
test_annos['category_id'].value_counts(normalize=True)

1    0.545082
5    0.192623
3    0.139344
2    0.122951
Name: category_id, dtype: float64

In [28]:
# Re-display the top-level keys of the source annotation dict.
data.keys()

dict_keys(['categories', 'images', 'annotations'])

In [29]:
def _as_split_dict(imgs_df, annos_df):
    """Bundle one partition's image and annotation records with the shared category list.

    Returns a dict with the keys 'images', 'categories' and 'annotations',
    where the two tables are converted to lists of per-row dicts.
    """
    return {'images': imgs_df.to_dict('records'),
            'categories': data['categories'],
            'annotations': annos_df.to_dict('records')}


train_data = _as_split_dict(train_imgs, train_annos)
validate_data = _as_split_dict(validate_imgs, validate_annos)
test_data = _as_split_dict(test_imgs, test_annos)

In [30]:
# Serialise each partition dict to its own annotation JSON file.
for out_path, split_data in ((train_anno_file, train_data),
                             (val_anno_file, validate_data),
                             (test_anno_file, test_data)):
    with open(out_path, 'w') as fo:
        json.dump(split_data, fo)

In [31]:
from pycocotools.coco import COCO

In [32]:
# Load the train annotation file through pycocotools' COCO class.
coco = COCO(train_anno_file)

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


In [33]:
# Load the validate annotation file through pycocotools' COCO class (rebinds `coco`).
coco = COCO(val_anno_file)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [34]:
# Load the test annotation file through pycocotools' COCO class (rebinds `coco`).
coco = COCO(test_anno_file)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
