In [2]:
from pycocotools.coco import COCO
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
import pandas as pd
import yaml
%matplotlib inline

In [3]:
# Load the training and evaluation annotation files through pycocotools' COCO
# wrapper. Paths are relative to this notebook's working directory.
coco = COCO("../object_detection/train.json")
coco_eval = COCO("../object_detection/eval.json")

loading annotations into memory...
Done (t=0.15s)
creating index...
index created!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


In [4]:
# Convert the COCO lookup dicts into DataFrames; with orient='index' every dict
# key becomes a row label and every record becomes one row.
cat_df, ann_df, train_img_df, eval_img_df = (
    pd.DataFrame.from_dict(records, orient='index')
    for records in (coco.cats, coco.anns, coco.imgs, coco_eval.imgs)
)

In [5]:
# Preview the category table (columns: id, name, supercategory).
cat_df

Unnamed: 0,id,name,supercategory
1,1,Actiniaria,Anemone
2,2,Actinernus,Anemone
3,3,Actiniidae,Anemone
4,4,Actinoscyphia,Anemone
5,5,Bolocera,Anemone
...,...,...,...
286,286,Polychaeta,Worm
287,287,Polynoidae,Worm
288,288,Sabellidae,Worm
289,289,Serpulidae,Worm


In [6]:
def images_eda(image_df):
    """Print the image count and the most frequent image resolutions.

    Parameters
    ----------
    image_df : pandas.DataFrame
        Image table with ``width`` and ``height`` columns.

    Returns
    -------
    None
        The summary is printed: the number of rows, followed by the 25 most
        common (width, height) pairs with their occurrence counts.
    """
    # One (width, height) tuple per image so identical resolutions can be tallied.
    resolutions = pd.Series(tuple(zip(image_df['width'], image_df['height'])))
    print(f'Total number of images in dataset: {len(image_df)}')
    print('Top resolutions:')
    top_resolutions = resolutions.value_counts().head(25)
    print(top_resolutions)

# images_eda(train_img_df)

In [7]:
def object_eda(annotation_df, categories_df):
    """Count annotations per category and report dataset coverage.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Annotation table with a ``category_id`` column.
    categories_df : pandas.DataFrame
        Category table with ``id``, ``name`` and ``supercategory`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per category with columns ``category_id``, ``category``,
        ``supercategory`` and ``cat_id_counts``, sorted by descending count.
        Categories with no annotations keep a NaN count and sort last.

    Notes
    -----
    Prints how many categories and supercategories have at least one
    annotation, relative to the totals found in ``categories_df``.
    """
    # Name the counts explicitly: the default name given by value_counts()
    # differs between pandas versions ('category_id' before 2.0, 'count' after),
    # so relying on it breaks the sort below on newer releases.
    cat_count = annotation_df['category_id'].value_counts().rename('cat_id_counts')

    df = (
        categories_df
        .rename(columns={'id': 'category_id', 'name': 'category'})
        # Match on the category id column rather than on whatever index the
        # categories frame happens to carry.
        .join(cat_count, on='category_id')
        .sort_values('cat_id_counts', ascending=False)
        .reset_index(drop=True)
    )

    present = df.dropna()
    n_supercats_total = categories_df['supercategory'].nunique()
    print(f'There are {len(present)} of {len(df)} species present in the dataset.')
    print(f'There are {present["supercategory"].nunique()} of {n_supercats_total} semantic supercategories present.')
    return df

# Build the per-category count table and drop rows with missing values, which
# removes categories that have no count (i.e. no training annotations).
df = object_eda(ann_df, cat_df).dropna()
df

There are 133 of 290 species present in the dataset.
There are 16 of 20 semantic supercategories present.


Unnamed: 0,category_id,category,supercategory,cat_id_counts
0,160,Strongylocentrotus fragilis,Urchin,11205.0
1,37,Merluccius productus,Fish,2352.0
2,119,Rathbunaster californicus,Sea star,1528.0
3,51,Sebastes,Fish,1161.0
4,10,Metridium farcimen,Anemone,756.0
...,...,...,...,...
128,195,Parastenella,Sea fan,1.0
129,83,Nudibranchia,Gastropod,1.0
130,101,Hymenaster,Sea star,1.0
131,90,Astropecten,Sea star,1.0


In [8]:
# Summarise every supercategory by its most frequently annotated category.
# Supercategories get sequential ids in order of first appearance in `df`.
super_dict = {}
for i, supercat in enumerate(df.supercategory.unique()):
    supercat_rows = df[df.supercategory == supercat]
    counts = supercat_rows['cat_id_counts']
    nsup = counts.sum()

    # Pick the top category by position. Filtering on `counts == counts.max()`
    # and calling int() on the result fails when two categories tie for the
    # maximum, and int() on a one-element Series is deprecated in recent pandas.
    # argmax returns the first maximum, so ties resolve to the earliest row.
    top_pos = counts.to_numpy().argmax()
    most_common_loc = counts.iloc[top_pos]
    most_common = int(supercat_rows['category_id'].iloc[top_pos])
    super_dict[i] = {'supercat_id': i,
                     'supercategory': supercat,
                     'top_category_id': most_common,
                     # Share of the supercategory's annotations held by its top category.
                     'cat_id_percentage': most_common_loc / nsup}

df_super = pd.DataFrame.from_dict(super_dict, orient='index')
df_super

Unnamed: 0,supercat_id,supercategory,top_category_id,cat_id_percentage
0,0,Urchin,160,0.999643
1,1,Fish,37,0.540938
2,2,Sea star,119,0.582984
3,3,Anemone,10,0.623248
4,4,Sea cucumber,146,0.682629
5,5,Feather star,125,0.985955
6,6,Sea fan,203,0.557692
7,7,Sea pen,214,0.450808
8,8,Shrimp,259,0.454756
9,9,Soft coral,211,1.0


In [9]:
# Map supercategory id -> supercategory name. Key on the `supercat_id` column
# explicitly instead of relying on df_super's row index matching the ids.
supercat_map = df_super.set_index('supercat_id')['supercategory'].to_dict()

# Dataset description: root path, train/val image folders and class names.
dataset = {'path': '/data/dataset',
           'train': 'images/train',
           'val': 'images/val',
           'names': supercat_map}
dataset

{'path': '/data/dataset',
 'train': 'images/train',
 'val': 'images/val',
 'names': {0: 'Urchin',
  1: 'Fish',
  2: 'Sea star',
  3: 'Anemone',
  4: 'Sea cucumber',
  5: 'Feather star',
  6: 'Sea fan',
  7: 'Sea pen',
  8: 'Shrimp',
  9: 'Soft coral',
  10: 'Eel',
  11: 'Squat lobster',
  12: 'Glass sponge',
  13: 'Gastropod',
  14: 'Crab',
  15: 'Worm'}}

In [10]:
# Display the dataset description again (same content as the previous cell).
dataset

{'path': '/data/dataset',
 'train': 'images/train',
 'val': 'images/val',
 'names': {0: 'Urchin',
  1: 'Fish',
  2: 'Sea star',
  3: 'Anemone',
  4: 'Sea cucumber',
  5: 'Feather star',
  6: 'Sea fan',
  7: 'Sea pen',
  8: 'Shrimp',
  9: 'Soft coral',
  10: 'Eel',
  11: 'Squat lobster',
  12: 'Glass sponge',
  13: 'Gastropod',
  14: 'Crab',
  15: 'Worm'}}

In [11]:
# Serialise the dataset description to YAML.
# NOTE(review): the output path is absolute; confirm it exists wherever this
# notebook is run.
with open("/data/super_dataset.yaml", 'w') as yamlfile:
    # yaml.dump writes to the stream and returns None, so there is nothing
    # worth binding to a variable here.
    yaml.dump(dataset, yamlfile)
    print("Write successful")

Write successful


In [12]:
# Attach the supercategory summary columns (supercat_id, top_category_id,
# cat_id_percentage) to every category row, matching on supercategory name.
df_supertop = df.merge(df_super, on='supercategory')

In [13]:
# df_super.to_json('../supercat_key.json')

In [14]:
# Preview the training annotation table built from coco.anns.
ann_df

Unnamed: 0,id,image_id,category_id,segmentation,area,bbox,iscrowd
1,1,1,1.0,[],7869.0,"[347.0, 188.0, 129.0, 61.0]",0
2,2,2,1.0,[],8775.0,"[346.0, 191.0, 135.0, 65.0]",0
3,3,3,1.0,[],8768.0,"[343.0, 192.0, 137.0, 64.0]",0
4,4,4,88.0,[],315.0,"[623.0, 95.0, 21.0, 15.0]",0
5,5,4,1.0,[],440.0,"[361.0, 175.0, 22.0, 20.0]",0
...,...,...,...,...,...,...,...
23700,23700,5948,283.0,[],4360.0,"[698.0, 544.0, 40.0, 109.0]",0
23701,23701,5949,286.0,[],3792.0,"[532.0, 440.0, 48.0, 79.0]",0
23702,23702,5949,286.0,[],3484.0,"[609.0, 426.0, 52.0, 67.0]",0
23703,23703,5950,286.0,[],3216.0,"[148.0, 323.0, 48.0, 67.0]",0


In [15]:
# Enrich every annotation with its category / supercategory columns, then
# restore annotation-id order under a fresh 0-based index.
ann_super_df = (
    ann_df
    .merge(df_supertop, on='category_id')
    .sort_values('id')
    .reset_index(drop=True)
)

In [16]:
# Spot-check the merged columns on the first annotation.
ann_super_df.head(1)

Unnamed: 0,id,image_id,category_id,segmentation,area,bbox,iscrowd,category,supercategory,cat_id_counts,supercat_id,top_category_id,cat_id_percentage
0,1,1,1.0,[],7869.0,"[347.0, 188.0, 129.0, 61.0]",0,Actiniaria,Anemone,211.0,3,10,0.623248


In [17]:
# Keep only the COCO annotation fields and overwrite `category_id` with the
# supercategory id, so each annotation is labelled at supercategory level.
ann_out = (
    ann_super_df
    .loc[:, ['id', 'image_id', 'category_id', 'segmentation', 'area', 'bbox', 'iscrowd']]
    .assign(category_id=ann_super_df['supercat_id'])
)
ann_out

Unnamed: 0,id,image_id,category_id,segmentation,area,bbox,iscrowd
0,1,1,3,[],7869.0,"[347.0, 188.0, 129.0, 61.0]",0
1,2,2,3,[],8775.0,"[346.0, 191.0, 135.0, 65.0]",0
2,3,3,3,[],8768.0,"[343.0, 192.0, 137.0, 64.0]",0
3,4,4,2,[],315.0,"[623.0, 95.0, 21.0, 15.0]",0
4,5,4,3,[],440.0,"[361.0, 175.0, 22.0, 20.0]",0
...,...,...,...,...,...,...,...
23699,23700,5948,15,[],4360.0,"[698.0, 544.0, 40.0, 109.0]",0
23700,23701,5949,15,[],3792.0,"[532.0, 440.0, 48.0, 79.0]",0
23701,23702,5949,15,[],3484.0,"[609.0, 426.0, 52.0, 67.0]",0
23702,23703,5950,15,[],3216.0,"[148.0, 323.0, 48.0, 67.0]",0


In [18]:
# ann_out.to_json('../annotation_sup.json')

In [19]:
# ann_super_df.to_json('../master_key.json')

In [20]:
# super_map = {}
# for i, cat in enumerate(df.category_id.unique()):
#     df2 = df[df.category_id == cat]
#     # print(supercat, df[df.supercategory == supercat]['category_id'].unique())
#     super_map[cat] = df2.supercategory.values
# super_map

In [21]:
def supercat_eda(annotation_df, categories_df):
    """Tally image occurrences per supercategory.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Annotation table with ``category_id`` and ``image_id`` columns.
    categories_df : pandas.DataFrame
        Category table with ``id`` and ``supercategory`` columns.

    Returns
    -------
    tuple of (dict, dict)
        ``out`` maps each supercategory to the sum, over its categories, of the
        number of distinct images annotated with that category. An image that
        contains several categories of the same supercategory is therefore
        counted once per category. ``supermap`` maps each supercategory to the
        list of its category ids.
    """
    # Supercategory name -> ids of the categories that belong to it.
    supermap = {
        name: list(categories_df.loc[categories_df['supercategory'] == name, 'id'].unique())
        for name in categories_df['supercategory'].unique()
    }

    # Per supercategory: add up the distinct-image count of each member category.
    out = {
        name: sum(
            len(annotation_df.loc[annotation_df['category_id'] == cat_id, 'image_id'].unique())
            for cat_id in cat_ids
        )
        for name, cat_ids in supermap.items()
    }
    return out, supermap

# Display both results: per-supercategory image tallies and the
# supercategory -> category ids mapping.
supercat_eda(ann_df, cat_df)  # returns a (dict, dict) tuple

({'Anemone': 656,
  'Fish': 1937,
  'Eel': 309,
  'Gastropod': 126,
  'Sea star': 1470,
  'Feather star': 172,
  'Sea cucumber': 486,
  'Urchin': 2647,
  'Glass sponge': 120,
  'Sea fan': 196,
  'Soft coral': 111,
  'Sea pen': 316,
  'Stony coral': 0,
  'Black coral': 0,
  'Crab': 77,
  'Shrimp': 253,
  'Squat lobster': 106,
  'Barnacle': 0,
  'Sea spider': 0,
  'Worm': 19},
 {'Anemone': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
  'Fish': [11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56],
  'Eel': [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73],
  'Gastropod': [74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87],
  'Sea star': [88,
   89,
   90,
   91,
   92,
   93,
   94,
   95,
   96,
   97,
   98,
   9