In [1]:
from pycocotools.coco import COCO
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
import pandas as pd
import yaml
%matplotlib inline

In [14]:
# Root of the data volume; the per-split image directories hang off it.
DATAPATH = "/data/"
EVALPATH = DATAPATH + "eval/"
TRAINPATH = DATAPATH + "train/"

In [15]:
# Parse the COCO-format annotation files: training split first, then evaluation split.
coco, coco_eval = map(COCO, ("object_detection/train.json", "object_detection/eval.json"))

loading annotations into memory...
Done (t=0.14s)
creating index...
index created!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


In [16]:
# Depth flag for the top species, keyed by COCO category id (this dict is joined
# against the category `id` and annotation `category_id` columns further down).
# True means "shallow", i.e. found at depths < 800 m; False means deeper.
# Depths were looked up by hand via web search, so treat them as approximate;
# inline comments record the depth found or how reliable the entry is.
shallow_species = {160: True,
                   37: False,
                   119: True,
                   51: True,
                   10: True,
                   146: False, # 1087m
                   52: False,
                   88: False,
                   125: False,
                   203: False, # 927m
                   214: True, # not much data
                   1: False,
                   259: True,
                   9: False, # 1000m
                   105: True,
                   211: False,
                   133: True,
                   142: False,
                   70: False,
                   260: True,
                   274: True,
                   174: False,
                   205: False, # not much data
                   120: False,
                   219: False, # not much data
                   81: False,
                   69: False,
                   104: True,
                   218: False,
                   16: False,
                   103: True,
                   224: False,
                   228: False,
                   242: False,
                   61: True, # mostly
                   116: True,
                   255: False,
                   202: False,
                   108: False, # unknown
                   11: False
                   }

In [17]:
# Category table restricted to the species that have a depth flag in
# shallow_species, relabelled 0..n-1 so row position can serve as a class index.
cat_df = (
    pd.DataFrame.from_dict(coco.cats, orient='index')
    .join(pd.Series(shallow_species, name='shallow_species'), on='id')
    .dropna()
    .reset_index(drop=True)
)

In [18]:
cat_df

Unnamed: 0,id,name,supercategory,shallow_species
0,1,Actiniaria,Anemone,False
1,9,Liponema brevicorne,Anemone,False
2,10,Metridium farcimen,Anemone,True
3,11,Actinopterygii,Fish,False
4,16,Anoplopoma fimbria,Fish,False
5,37,Merluccius productus,Fish,False
6,51,Sebastes,Fish,True
7,52,Sebastolobus,Fish,False
8,61,Eptatretus,Eel,True
9,69,Lycodes cortezianus,Eel,False


In [19]:
# Row label -> species name, taken from the filtered category table.
cat_dict = cat_df.loc[:, 'name'].to_dict()

In [20]:
cat_dict

{0: 'Actiniaria',
 1: 'Liponema brevicorne',
 2: 'Metridium farcimen',
 3: 'Actinopterygii',
 4: 'Anoplopoma fimbria',
 5: 'Merluccius productus',
 6: 'Sebastes',
 7: 'Sebastolobus',
 8: 'Eptatretus',
 9: 'Lycodes cortezianus',
 10: 'Lycodes diapterus',
 11: 'Gastropoda',
 12: 'Asteroidea',
 13: 'Luidia foliolata',
 14: 'Mediaster',
 15: 'Mediaster aequalis',
 16: 'Myxoderma platyacanthum',
 17: 'Pterasteridae',
 18: 'Rathbunaster californicus',
 19: 'Stylasterias forreri',
 20: 'Crinoidea',
 21: 'Apostichopus leukothele',
 22: 'Pannychia',
 23: 'Psolus squamatus',
 24: 'Strongylocentrotus fragilis',
 25: 'Hexactinellida',
 26: 'Swiftia kofoidi',
 27: 'Swiftia simplex',
 28: 'Paragorgiidae',
 29: 'Heteropolypus ritteri',
 30: 'Acanthoptilum',
 31: 'Funiculina',
 32: 'Funiculina-Halipteris complex',
 33: 'Pennatulacea',
 34: 'Umbellula',
 35: 'Chionoecetes tanneri',
 36: 'Caridea',
 37: 'Pandalus platyceros',
 38: 'Pasiphaea',
 39: 'Pleuroncodes planipes'}

In [21]:
# Dataset description (root path, image sub-directories, class names) that is
# written out as YAML in the next cell.
dataset = dict(
    path='/data/dataset',
    train='images/train',
    val='images/val',
    names=cat_dict,
)

In [13]:
# Serialise the dataset description to YAML.
# yaml.dump returns None when it is handed a stream, so its result is not kept.
with open("/data/dataset.yaml", 'w') as yamlfile:
    yaml.dump(dataset, yamlfile)
# Report success only once the file has been closed (and therefore flushed).
print("Write successful")

Write successful


In [None]:
# Contiguous 0-based class index for every retained category.
cat_df['index'] = range(len(cat_df))
# Original COCO category id -> contiguous class index. Built from the two
# columns explicitly so it stays correct even if cat_df's row labels change.
map_dict = dict(zip(cat_df['id'].tolist(), cat_df['index'].tolist()))

In [None]:
# Original category ids of the species flagged as shallow.
cat_df.loc[cat_df['shallow_species'] == True, 'id'].to_list()

In [None]:
# Annotation table restricted to species with a depth flag. The original COCO
# category id is preserved in `original_category`, and `category_id` is
# replaced by the contiguous class index from map_dict.
ann_df = (
    pd.DataFrame.from_dict(coco.anns, orient='index')
    .join(pd.Series(shallow_species, name='shallow_species'), on='category_id')
    .dropna()
    .assign(
        original_category=lambda frame: frame['category_id'],
        category_id=lambda frame: frame['category_id'].map(map_dict),
    )
)
ann_df


In [None]:
# Ids of the images that still have at least one annotation after filtering.
print(len(ann_df['image_id'].unique()))
remaining_images = pd.Series(ann_df['image_id'].unique(), name='id')
remaining_images

In [None]:
# Training image metadata, kept only for images listed in remaining_images.
train_img_df = (
    pd.DataFrame.from_dict(coco.imgs, orient='index')
    .merge(remaining_images, on='id')
)
train_img_df

In [None]:
# Image metadata for the evaluation split: one row per entry of coco_eval.imgs,
# with the dict keys used as row labels.
eval_img_df = pd.DataFrame.from_dict(coco_eval.imgs, orient='index')
eval_img_df

In [None]:
# Persist the derived tables as JSON files in the working directory.
for _frame, _target in (
    (cat_df, 'category_key.json'),
    (ann_df, 'annotation.json'),
    (train_img_df, 'train_image_data.json'),
    (eval_img_df, 'eval_image_data.json'),
):
    _frame.to_json(_target)

## EDA goals
View statistics for:
- Images
  - total number of images
  - number of images by height/width
- Objects
  - Number of annotations per category
  - Supercategory/category breakdown


In [None]:
def images_eda(image_df):
    """Print the number of images and the 25 most frequent resolutions.

    `image_df` needs `width` and `height` columns; each row counts as one
    image. Nothing is returned, the summary is only printed.
    """
    resolution_counts = pd.Series(tuple(zip(image_df.width, image_df.height))).value_counts()
    print(f'Total number of images in dataset: {len(image_df)}')
    print('Top resolutions:')
    print(resolution_counts.head(25))

# `img_df` is never defined in this notebook; the image table built above is
# `train_img_df`, so summarise that one.
images_eda(train_img_df)


In [None]:
def object_eda(annotation_df, categories_df, min_count=50):
    """Summarise how many annotations each category has.

    Parameters
    ----------
    annotation_df : DataFrame
        Needs a `category_id` column whose values match the row labels of
        `categories_df` (the join below is done on the row index).
    categories_df : DataFrame
        Needs `id`, `name` and `supercategory` columns.
    min_count : int, default 50
        Only categories with strictly more annotations than this are kept.

    Returns
    -------
    DataFrame
        The category table sorted by descending annotation count, with `id`
        renamed to `category_id`, `name` renamed to `category`, and the counts
        in `cat_id_counts`.
    """
    # Name the count column explicitly: value_counts() labels it 'category_id'
    # on pandas < 2 but 'count' on pandas >= 2, so relying on the default name
    # breaks the sort below on newer versions.
    cat_count = annotation_df['category_id'].value_counts().rename('cat_id_counts').to_frame()
    df = (
        categories_df
        .join(cat_count)
        .sort_values('cat_id_counts', ascending=False)
        .reset_index(drop=True)
        .rename(columns={'id': 'category_id', 'name': 'category'})
    )
    # Categories without any annotation have a NaN count and fall out here too.
    df = df[df.cat_id_counts > min_count]
    present = df.dropna()
    print(f'There are {len(present)} of 290 species present in the dataset.')
    print(f'There are {len(present.supercategory.unique())} of 20 semantic supercategories present.')
    return df

object_eda(ann_df, cat_df).dropna()

In [None]:
def supercat_eda(annotation_df, categories_df):
    """Count images per supercategory.

    Returns a pair `(out, supermap)`: `supermap` maps each supercategory to the
    list of category ids belonging to it, and `out` maps each supercategory to
    the sum, over those ids, of the number of distinct images annotated with
    that id. An image carrying several categories of the same supercategory is
    therefore counted once per category.
    """
    supermap = {
        supercat: list(categories_df[categories_df.supercategory == supercat].id.unique())
        for supercat in categories_df.supercategory.unique()
    }
    out = {
        supercat: sum(
            len(annotation_df[annotation_df.category_id == cat_id].image_id.unique())
            for cat_id in cat_ids
        )
        for supercat, cat_ids in supermap.items()
    }
    return out, supermap

# cat_df.id holds the original COCO ids, whereas ann_df.category_id was remapped
# to the contiguous class index. Compare on the preserved original ids so the
# per-supercategory counts refer to the right species.
supercat_eda(ann_df.assign(category_id=ann_df['original_category']), cat_df)

## Search
Build search functions for:
- Images
  - list image ids with given category_id
  - show images with bbox
  - sort image lists by bbox size

In [None]:
# One wide table: annotation + category + image metadata.
ann_df2 = ann_df.rename(columns={'id': 'annotation_id'})
# ann_df.category_id holds the remapped contiguous index, not the original COCO
# id, so expose cat_df's matching 'index' column as `category_id` and keep the
# original id under the same name ann_df uses for it. Merging on both keys
# guarantees every annotation is paired with its own species.
cat_df2 = cat_df.rename(columns={'id': 'original_category', 'index': 'category_id'})
img_df2 = train_img_df.rename(columns={'id': 'image_id'})
all_df = pd.merge(ann_df2, cat_df2, on=['category_id', 'original_category'])
all_df = pd.merge(all_df, img_df2, on='image_id')

all_df

In [None]:
# shallow_species is keyed by the original COCO category id, while
# all_df.category_id holds the remapped index; join on the preserved original id
# so every row receives the flag of its own species.
all_df = all_df.join(pd.Series(shallow_species, name='shallow_species'), on='original_category').dropna()
all_df

In [None]:
# Share of images that still carry at least one retained annotation.
# NOTE(review): 5950 is presumably the total number of training images — confirm.
len(all_df['image_id'].unique()) / 5950

In [None]:
def images_by_cat(df, category_id):
    """Return the rows of `df` whose `category_id` equals the given value.

    Also prints the `cat_df` row labelled `category_id` and the number of
    matching rows.
    """
    print(f'{cat_df.loc[category_id]}')
    matches = df.loc[df['category_id'] == category_id]
    print(f'\nFound {len(matches)} images: \n\n')
    return matches

In [None]:
def get_image_with_annotation(img_id, category_id=None):
    """Look up an image's file name and its annotations.

    Parameters
    ----------
    img_id : image id to look up in `train_img_df` / `ann_df`.
    category_id : optional; when given, only annotations with this
        `category_id` are returned.

    Returns
    -------
    (filename, anns) : the image's `file_name` and the matching rows of `ann_df`.

    Raises
    ------
    KeyError
        If no row of `train_img_df` has this image id.
    """
    # The image table built in this notebook is `train_img_df`; no `img_df` exists.
    match = train_img_df.loc[train_img_df.id == img_id, 'file_name']
    if match.empty:
        raise KeyError(img_id)
    # Positional access: train_img_df is the result of a merge, so its row
    # labels are not image ids and a label lookup by img_id would miss.
    filename = match.iloc[0]
    anns = ann_df[ann_df.image_id == img_id]
    if category_id is not None:
        anns = anns[anns.category_id == category_id]
    print(filename)
    return filename, anns

In [None]:
def show_image(img_id, category_id=None):
    """Display a training image with all of its bounding boxes.

    Boxes whose `category_id` equals the `category_id` argument are outlined in
    red, every other box in blue. The image is read from TRAINPATH.
    """
    filename, anns = get_image_with_annotation(img_id)
    img = mpimg.imread(TRAINPATH + filename)

    _, ax = plt.subplots()
    ax.imshow(img)
    for _, ann in anns.iterrows():
        # bbox is unpacked as (x, y, width, height) and passed straight to Rectangle.
        x, y, width, height = ann['bbox']
        edge = 'r' if category_id == ann.category_id else 'b'
        ax.add_patch(
            patches.Rectangle((x, y), width, height, linewidth=1, edgecolor=edge, facecolor='none')
        )
    plt.show()

In [None]:
def show_image_by_category(df, category_id):
    """Display every image in `df` that has an annotation of `category_id`.

    `df` needs `category_id` and `image_id` columns. Each image is shown once,
    with the boxes of `category_id` highlighted by `show_image`.
    """
    subdf = images_by_cat(df, category_id)
    # subdf has one row per annotation; de-duplicate so an image with several
    # matching boxes is not plotted repeatedly.
    for image_id in subdf['image_id'].unique():
        show_image(image_id, category_id)

In [None]:
images_by_cat(all_df, 1)

In [None]:
_, anns = get_image_with_annotation(5)
anns

In [None]:
show_image(717, 26)

In [None]:
# 259 is an original COCO category id; the tables use the remapped contiguous
# index, so translate it through map_dict before filtering.
show_image_by_category(all_df, map_dict[259])

In [None]:
# all_df.to_json(DATAPATH + 'y_clean.json')

## Submission

In [None]:
import glob
import os

def remap(cat_lst, mapper, shallow):
    """Translate predicted class indices and derive the out-of-sample score.

    Parameters
    ----------
    cat_lst : list of predicted class indices for one image.
    mapper : dict mapping a class index to the id written to the submission.
    shallow : collection of class indices flagged as shallow species.

    Returns
    -------
    (out, osd) : `out` is the list of mapped ids as strings (an index missing
        from `mapper` becomes the string 'None'); `osd` is
        0.9 when `cat_lst` is empty,
        0.1 when at least one prediction is a shallow species,
        0.5 otherwise.
    """
    out = [str(mapper.get(i)) for i in cat_lst]
    if not cat_lst:
        osd = 0.9
    elif any(i in shallow for i in cat_lst):
        # One shallow detection is enough, wherever it sits in the list.
        osd = 0.1
    else:
        osd = 0.5
    return out, osd


# Build the submission table: one row per prediction label file.
out = {}
# Sorted so the row order of the CSV does not depend on directory listing order.
filelist = sorted(glob.glob('runs/detect/predict/labels/*.txt'))
# Class indices of the species flagged as shallow.
shallow = cat_df[cat_df.shallow_species == True]['index'].to_list()
# Row label of cat_df -> original category id.
mapper = cat_df[['id', 'index']].to_dict()['id']

for i, file in enumerate(filelist):
    with open(file, 'r') as f:
        # The first whitespace-separated token of each line is the predicted
        # class index; blank lines are skipped instead of crashing int().
        cats = [int(line.split()[0]) for line in f if line.strip()]

    cats, osd = remap(cats, mapper, shallow)
    cats = ' '.join(cats)
    if len(cats) == 0:
        # NOTE(review): fallback category when nothing was detected; 160 is
        # presumably the most frequent species — confirm.
        cats = '160'

    out[i] = {'id': os.path.splitext(os.path.basename(file))[0], 'categories': cats, 'osd': osd}

# Passing the columns explicitly keeps the header even when no label file exists.
df = pd.DataFrame(list(out.values()), columns=['id', 'categories', 'osd'])
df.to_csv('submission_1.csv', index=False)