In [190]:
import csv
import glob
import json
import urllib
import urllib.request

import numpy as np
import pandas as pd
import requests

In [4]:
def getResponse(url):
    """Fetch ``url`` and return the response body decoded as JSON.

    Parameters
    ----------
    url : str
        Address handed directly to ``urllib.request.urlopen``.

    Returns
    -------
    object
        Whatever ``json.loads`` produces for the response body.

    Raises
    ------
    RuntimeError
        If the response status code is anything other than 200. (Previously
        this path printed a message and then failed with an
        ``UnboundLocalError`` because no result had been assigned.)
    """
    # The context manager closes the connection even if reading or parsing fails.
    with urllib.request.urlopen(url) as operUrl:
        status = operUrl.getcode()
        if status != 200:
            raise RuntimeError(f"Error receiving data {status}")
        return json.loads(operUrl.read())

In [80]:
# Structure of the label map:
#     source : dest
#
# When the map is used, the target label is obtained with
#     label = label_map.get(source)
# and a result of None means the source concept is rejected.
label_map = dict()

## Steps For Creating Dictionary ##

* Determine current mapping concept
* Get a concept and all of its descendants
* For each exclusion concept, get it and all of its descendants
* Delete those taxa from the current list of taxa
* Repeat for all exclusion concepts
* Repeat for all high level concepts of current mapping

Reference https://docs.google.com/document/d/1CcZ4xmHUslIQjabQjeNOHj2ysXOhThjTZYBCbG885hw/edit?usp=sharing

In [177]:
# Mapping for current taxa: the high-level label that every taxon fetched
# below is assigned in label_map.
current_high_level_label = "Ray"

In [179]:
# Concept whose subtree is being mapped to the current high-level label
current_concept = 'Torpediniformes'
# Query the phylogeny/taxa endpoint for the concept and its descendants
json_data_taxa = getResponse(f'http://dsg.mbari.org/kb/v1/phylogeny/taxa/{current_concept}')

In [119]:
# Fetch the terms that must be excluded from the current taxa search.
# Reference https://docs.google.com/document/d/1CcZ4xmHUslIQjabQjeNOHj2ysXOhThjTZYBCbG885hw/edit?usp=sharing
current_exclusion_concept = 'Brisingida'
current_exclusion = getResponse(f'http://dsg.mbari.org/kb/v1/phylogeny/taxa/{current_exclusion_concept}')

In [120]:
# Match the exclusion list to the items in the current search.
#
# An entry is pruned when both its name and its rank equal those of some
# exclusion element. The indices are gathered by walking json_data_taxa once,
# so elements_to_prune is strictly ascending and free of duplicates. Looping
# over the exclusion list on the outside produced indices in exclusion order
# (and repeated them when the exclusion list had duplicates), which makes a
# reverse-order deletion remove the wrong entries.
# Assumes name and rank are hashable values (e.g. strings or None) -- TODO confirm.
excluded_keys = {
    (element.get('name'), element.get('rank')) for element in current_exclusion
}
elements_to_prune = [
    i
    for i, entry in enumerate(json_data_taxa)
    if (entry.get('name'), entry.get('rank')) in excluded_keys
]

In [121]:
# Delete the elements from the highest index down, so that removing one entry
# never shifts the position of an index that is still waiting to be deleted.
# The index list is de-duplicated and sorted here: merely reversing it is only
# correct when it is already ascending and unique, which is not guaranteed by
# the way it is collected.
for idx in sorted(set(elements_to_prune), reverse=True):
    del json_data_taxa[idx]

In [180]:
# Apply mapping: point every remaining taxon name at the current high-level label
label_map.update(
    {taxon.get('name'): current_high_level_label for taxon in json_data_taxa}
)

In [163]:
# List the keys currently mapped to 'Barnacle'; enable the deletion line only
# if those entries really are mistakes.
keys = list(label_map)  # snapshot, so deleting inside the loop stays safe
for key in keys:
    if label_map.get(key) != 'Barnacle':
        continue
    print(key)
    #del label_map[key]

In [181]:
# Persist the label map as JSON.
# out_path is an empty placeholder here; open('') fails, so set a real
# destination path before running this cell.
out_path = ''
with open(out_path, mode='w') as mapping_file:
    json.dump(label_map, mapping_file)

In [140]:
# Round-trip check: load the file that was just written
with open(out_path) as mapping_file:
    test_file = json.load(mapping_file)

### Generate Train and Val splits on the data downloaded from FathomNet ###

In [146]:
# First you need to download all the images and metadata in FathomNet that you want to use into a directory

# glob returns matches in arbitrary order, so the list is sorted to give every
# run the same file order (the train/val assignment below walks this list).
json_files = sorted(glob.glob('/mnt/md0/Projects/Fathomnet/Data_Files/2021-06-04-Download/*.json'))

In [None]:
# Generate a train and val split
#
# Loop over media and gather annotations. Each media file, together with all
# of its accepted bounding boxes, is randomly assigned to train or val.
# Output rows are: media_file, x1, y1, x2, y2, label.
# NOTE: np.random is not seeded in this cell, so the split differs between runs.
TRAIN_FRACTION = 0.85  # probability that a media file goes to the train split

# Both CSV files are opened in a single `with` so they are flushed and closed
# even if a file fails to parse part-way through. newline='' is what the csv
# module expects for files it writes to.
with open('/mnt/md0/Projects/Fathomnet/Training_Files/2021-06-29-Detectron/train_v2.csv', 'w', newline='') as train_file, \
        open('/mnt/md0/Projects/Fathomnet/Training_Files/2021-06-29-Detectron/val_v2.csv', 'w', newline='') as val_file:
    train_csv = csv.writer(train_file, delimiter=',')
    val_csv = csv.writer(val_file, delimiter=',')

    for i, ann_file in enumerate(json_files):
        if i % 100 == 0:
            print(f'File {i} of {len(json_files)}')
        # The image is expected next to its metadata file, with a .png extension
        media_file = ann_file.split('.json')[0] + '.png'
        with open(ann_file, 'r') as fp:
            anns = json.load(fp)

        rows = []
        # A missing 'boundingBoxes' entry is treated as "no annotations"
        for ann in anns.get('boundingBoxes') or []:
            try:
                row = [
                    media_file,
                    int(ann.get('x')),
                    int(ann.get('y')),
                    int(ann.get('x') + ann.get('width')),
                    int(ann.get('y') + ann.get('height')),
                    label_map.get(ann.get('concept')),
                ]
            except Exception:
                # Best effort: report and skip malformed annotations, but let
                # KeyboardInterrupt / SystemExit through (a bare except would not).
                print("Bad Row")
                print(ann)
                continue

            # Reject boxes with zero or negative width/height
            if row[1] >= row[3] or row[2] >= row[4]:
                print('bad dimensions')
                print(ann)
                continue

            # Concepts without an entry in label_map are dropped silently
            if row[5] is not None:
                rows.append(row)

        # One draw per media file keeps all boxes of an image in the same split
        target_csv = train_csv if np.random.random() < TRAIN_FRACTION else val_csv
        target_csv.writerows(rows)

In [156]:
# Map each high-level label to its integer category id; the id is simply the
# label's position in the list below (0 through 19).
category_dict = {
    name: category_id
    for category_id, name in enumerate([
        "Anemone",
        "Fish",
        "Eel",
        "Gastropod",
        "Sea star",
        "Feather star",
        "Sea cucumber",
        "Urchin",
        "Glass sponge",
        "Sea fan",
        "Soft coral",
        "Sea pen",
        "Stony coral",
        "Ray",
        "Crab",
        "Shrimp",
        "Squat lobster",
        "Flatfish",
        "Sea spider",
        "Worm",
    ])
}

In [157]:
# Label names in the insertion order of category_dict
species = list(category_dict)

In [3]:
# Load the train split; the CSV has no header row, so column names are supplied here
train_file_df = pd.read_csv(
    '/mnt/md0/Projects/Fathomnet/Training_Files/2021-06-29-Detectron/train_file_v2.csv',
    header=None,
    names=['filename', 'x1', 'y1', 'x2', 'y2', 'label'],
)

In [5]:
# Number of bounding-box rows per label in the train split
train_file_df['label'].value_counts()

Urchin           25568
Fish             23199
Sea cucumber     21470
Anemone          17173
Sea star         13767
Sea fan          12077
Sea pen           9198
Glass sponge      7940
Crab              7001
Shrimp            4954
Worm              4503
Gastropod         3853
Flatfish          3846
Soft coral        3612
Ray               2930
Feather star      2899
Squat lobster     2641
Eel               2371
Stony coral        318
Sea spider         210
Name: label, dtype: int64

In [6]:
# Load the val split; the CSV has no header row, so column names are supplied here
val_file_df = pd.read_csv(
    '/mnt/md0/Projects/Fathomnet/Training_Files/2021-06-29-Detectron/val_file_v2.csv',
    header=None,
    names=['filename', 'x1', 'y1', 'x2', 'y2', 'label'],
)

In [7]:
# Number of bounding-box rows per label in the val split
val_file_df['label'].value_counts()

Urchin           4173
Sea cucumber     3754
Fish             3742
Anemone          2924
Sea star         2317
Sea fan          2097
Sea pen          1652
Glass sponge     1443
Crab             1200
Gastropod         800
Shrimp            772
Worm              752
Flatfish          608
Soft coral        555
Squat lobster     530
Ray               476
Feather star      405
Eel               358
Stony coral        48
Sea spider         41
Name: label, dtype: int64

In [196]:
# Append empty url / id / uuid columns (in that order) to be filled in afterwards
train_file_df = train_file_df.assign(url='', id='', uuid='')

In [197]:
# Attach provenance (url, id, uuid) from each image's JSON metadata file, then
# reduce the filename column to the bare file name.
#
# The metadata file is read once per distinct image path instead of once per
# bounding-box row, and columns are addressed by name rather than by position.
# train_file_df is only modified after every metadata file has been read, so a
# missing or corrupt file leaves the frame untouched and the cell can be re-run.
metadata_by_path = {}
for media_path in train_file_df['filename'].unique():
    # The metadata file sits next to the image, with .json instead of .png
    with open(media_path.split('.png')[0] + '.json', 'r') as json_file:
        a = json.load(json_file)
    metadata_by_path[media_path] = (a.get('url'), a.get('id'), a.get('uuid'))

for position, column in enumerate(('url', 'id', 'uuid')):
    # dtype=object keeps the values exactly as they were parsed from JSON
    train_file_df[column] = pd.Series(
        [metadata_by_path[media_path][position] for media_path in train_file_df['filename']],
        index=train_file_df.index,
        dtype=object,
    )

# Drop the directory part last: the full path was needed above to find the JSON
train_file_df['filename'] = [
    media_path.split('/')[-1] for media_path in train_file_df['filename']
]

In [47]:
# Forget what this is for, but probably fixing some sort of file naming issue
# NOTE(review): this unconditionally overwrites the filename of row 0 with a
# fixed absolute path. Presumably a one-off manual repair; confirm it is still
# needed before running the notebook top to bottom.
train_file_df.iat[0,0] = '/mnt/md0/Projects/Fathomnet/Data_Files/2021-06-04-Download/Beringraja-rhina03_13_12_16.png'

In [198]:
# Write new train file with url, id, and uuid fields, for later use and provenance
# NOTE(review): the frame written here is train_file_df, but the destination
# file name starts with "val_file". Confirm which of the two is intended
# before running, so a validation file is not overwritten with training rows.
train_file_df.to_csv('/mnt/md0/Projects/Fathomnet/Training_Files/2021-06-29-Detectron/val_file_v3_df.csv',index=False)

In [187]:
# Dict of uuid -> number of rows carrying that uuid in the train frame
uniq = train_file_df['uuid'].value_counts().to_dict()

In [43]:
# Dict of uuid -> number of rows carrying that uuid in the val frame
# NOTE(review): needs a 'uuid' column, which the six-column val_file_v2.csv
# read does not provide; presumably val_file_df is expected to come from
# val_file_v2_df.csv at this point -- confirm.
uniq_val = val_file_df['uuid'].value_counts().to_dict()

In [45]:
# Distinct uuids seen in the val frame
unique_val_uuids = list(uniq_val)

In [42]:
# Distinct uuids seen in the train frame
unique_train_uuids = list(uniq)

In [46]:
# Extend unique_train_uuids (in place) with every val uuid it does not already
# contain, so that afterwards it holds the union of train and val uuids.
# A set mirrors the list for constant-time membership checks; scanning the
# list itself for each val uuid is quadratic in the number of uuids.
known_uuids = set(unique_train_uuids)
for key in unique_val_uuids:
    if key not in known_uuids:
        known_uuids.add(key)
        unique_train_uuids.append(key)

In [188]:
# Number of bounding-box rows per uuid in the train frame
train_file_df['uuid'].value_counts()

d320585b-d8db-4e78-a8d4-9fb05655bcf4    98
065f33b7-84ed-456f-b76f-d913707dc517    91
35ce1c47-d045-48fd-b122-4566429a36f0    89
8868298a-5a85-40d6-a36f-a8ff91661a03    86
46ffba91-4ce9-451b-a707-f0f19ea92f3e    83
                                        ..
12181d1f-7de1-446e-82b4-8a36b7109ee9     1
6b1a73d1-af78-4601-aad3-4268667ab802     1
19b66c91-4f4e-481f-b967-13ea9839f35f     1
d15607f3-ed4d-4cf8-8faa-ba54c3e149fd     1
245ff1b3-be7b-46a3-b4bf-83a6ed192fec     1
Name: uuid, Length: 27882, dtype: int64

In [73]:
def _first_column_by_uuid(frame):
    """Return {uuid: first-column value of the first row with that uuid}."""
    first_rows = frame.drop_duplicates(subset='uuid', keep='first')
    return dict(zip(first_rows['uuid'], first_rows.iloc[:, 0]))


# For every uuid collect one file name: taken from the train frame when the
# uuid occurs there, otherwise from the val frame. The lookup tables are built
# once, instead of filtering a whole DataFrame per uuid, and the fallback is an
# explicit membership test rather than a bare `except` that would also hide
# unrelated errors. A uuid present in neither frame raises KeyError.
train_fname_by_uuid = _first_column_by_uuid(train_file_df)
val_fname_by_uuid = None  # built lazily, only if some uuid is missing from train

fnames = []
for image_uuid in unique_train_uuids:
    if image_uuid in train_fname_by_uuid:
        fnames.append(train_fname_by_uuid[image_uuid])
        continue
    if val_fname_by_uuid is None:
        val_fname_by_uuid = _first_column_by_uuid(val_file_df)
    fnames.append(val_fname_by_uuid[image_uuid])

In [78]:
# Replace the in-memory label map with the one stored on disk
with open('/mnt/md0/Projects/Fathomnet/Training_Files/2021-06-29-Detectron/benthic_label_map.json') as mapping_file:
    label_map = json.load(mapping_file)

In [70]:
# Reload the val frame from the "_df" CSV; no names are passed, so the column
# names are taken from the file's header row.
val_file_df = pd.read_csv('/mnt/md0/Projects/Fathomnet/Training_Files/2021-06-29-Detectron/val_file_v2_df.csv')

In [79]:
# Look for things to remove or rename based on low population of labels:
# print the first concept that maps to 'Black coral', then stop.
for concept, label in label_map.items():
    if label == 'Black coral':
        print(concept)
        break

Alternatipathes
