In [1]:
import numpy as np
from abyss_deep_learning.coco_classes import CocoDataset
from pycocotools.coco import COCO

In [57]:
# Generate a dataset
class_dist = {"rhino": 0.10, "hippo": 0.30, "dingo": 0.5}

ds = CocoDataset()
for class_name, class_prob in class_dist.items():
    ds.add_category(class_name)
    print(class_name)
    
for _ in range(200):
    image_id = ds.add_image((100, 100), "{:d}.png".format(image_id), "url")
    for category_id, (class_name, class_prob) in enumerate(class_dist.items()):
        if np.random.binomial(1, class_prob):
            ds.add_annotation({"image_id": image_id, "category_id": category_id + 1})

dingo
rhino
hippo


In [58]:
ds.save("/data/tmp/blah2.json")
ds = COCO("/data/tmp/blah2.json")

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [59]:
import pandas as pd
df = pd.DataFrame(list(ds.anns.values()))
df = df.set_index('id')
df

Unnamed: 0_level_0,category_id,image_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3,1
2,1,4
3,1,5
4,1,6
5,3,6
6,1,7
7,3,7
8,1,8
9,3,8
10,3,9


In [60]:
### Convert dataset from csv captions to proper COCO captions
# n = df.index.max() + 1
# for ann_id in tuple(ds.anns.keys()):
#     ann = ds.anns[ann_id]
#     captions = ann['caption'].split(",")
#     ann['caption'] = captions[0]
#     for caption in captions[1:]:
#         ds.anns[n] = dict(ann)
#         ds.anns[n]['caption'] = caption
#         n += 1
# cds2 = CocoDataset.from_COCO(ds)
# cds2.save("/data/tmp/blah.json")
# ds = COCO("/data/tmp/blah.json")

In [61]:
# Captions in each image
df[["image_id", "category_id"]].groupby("image_id").count()["category_id"]

image_id
1      1
4      1
5      1
6      2
7      2
8      2
9      1
10     1
12     2
13     1
14     1
15     1
16     1
17     2
18     1
19     2
21     1
23     1
24     2
26     1
27     1
28     2
29     2
30     1
31     1
32     1
33     1
34     1
35     1
36     1
      ..
157    1
158    1
159    2
161    1
162    2
164    2
165    1
167    1
168    2
169    1
170    1
172    1
174    1
175    2
176    1
178    2
179    2
180    1
181    2
182    1
183    2
184    2
185    1
188    1
189    2
190    1
192    1
193    2
198    2
200    1
Name: category_id, Length: 147, dtype: int64

In [62]:
# Image IDs by caption
dict(df[["image_id", "category_id"]].groupby("category_id").apply(lambda x: x['image_id'].as_matrix()))

{1: array([  4,   5,   6,   7,   8,  10,  12,  13,  14,  15,  17,  19,  21,
         24,  27,  28,  29,  30,  31,  32,  33,  37,  38,  40,  42,  45,
         47,  49,  51,  52,  53,  54,  56,  57,  59,  60,  61,  62,  65,
         67,  70,  72,  73,  75,  76,  78,  79,  80,  81,  82,  83,  85,
         86,  88,  89,  91,  92,  93,  96,  97, 100, 102, 103, 104, 105,
        106, 107, 108, 109, 111, 112, 115, 116, 119, 121, 127, 129, 131,
        132, 136, 141, 142, 143, 145, 146, 147, 154, 155, 158, 159, 161,
        162, 164, 168, 169, 170, 172, 174, 176, 180, 183, 184, 188, 189,
        192, 193, 198, 200]),
 2: array([ 12,  28,  35,  36,  46,  49,  51,  52,  56,  62,  65,  69,  72,
         87,  95, 107, 126, 129, 132, 159, 165, 168, 175, 178, 179, 181,
        184]),
 3: array([  1,   6,   7,   8,   9,  16,  17,  18,  19,  23,  24,  26,  29,
         34,  44,  45,  59,  60,  63,  66,  67,  74,  77,  81,  82,  87,
         88,  89,  91,  95,  97, 101, 104, 106, 107, 111, 116, 118, 11

In [67]:
from collections import Counter
def balanced_annotation_set(coco, ann_type='caption', num_anns=None, ignore=None):
    """Return a subset of image IDs that produce the largest balanced set, approximatley."""
    ignore = ignore or []
    annotations = [ann for ann in coco.anns.values() if ann_type in ann and ann[ann_type] not in ignore]
    captions = [
        ann[ann_type] for ann in annotations
        if ann_type in ann and ann[ann_type] not in ignore]
    count_captions = Counter(captions)
    unique_captions = np.unique(captions)
    
    image_ids_for_class = dict(df[["image_id", ann_type]].groupby(ann_type).apply(lambda x: x['image_id'].as_matrix().tolist()))

    captions_in_image = { # Counts how many captions are in each image
        image_id: ([
            ann[ann_type]
            for ann in annotations if ann['image_id'] == image_id])
        for image_id in coco.getImgIds()}
    
    out = {caption: [] for caption in unique_captions}
    caption_count = {caption: 0 for caption in unique_captions}
    
    def add_to_counts(image_id):
        # Increment counts for all captions in image
        for caption in captions_in_image[image_id]:
            out[caption].append(image_id)
            caption_count[caption] += 1
        # Remove image_id from all images_in_caption
        for images in image_ids_for_class.values():
            if image_id in images:
                images.pop(images.index(image_id))
                
    target_set_size = num_anns or min(count_captions.values())
    while any([caption_count[caption] < target_set_size for caption in unique_captions]):
        least = min(out.items(), key=lambda x: len(x[1]))
        image_id = image_ids_for_class[least[0]].pop()
        add_to_counts(image_id)
    out = list(set([j
           for i in out.values()
          for j in i]))
    return out

balanced_ids = balanced_annotation_set(ds, ann_type='category_id', ignore=[4, 5])
print(balanced_ids)
balanced_anns = ds.loadAnns(ds.getAnnIds(imgIds=balanced_ids))
print(Counter([ann['category_id'] for ann in balanced_anns]))


Counter({1: 108, 3: 67, 2: 27})
[129, 132, 139, 12, 145, 182, 150, 152, 153, 155, 28, 157, 156, 159, 162, 35, 164, 165, 36, 167, 168, 174, 175, 176, 49, 178, 51, 180, 52, 181, 183, 184, 56, 179, 185, 188, 189, 62, 190, 192, 193, 65, 69, 198, 200, 72, 87, 95, 46, 107, 126]
Counter({1: 29, 2: 27, 3: 27})
