In [15]:
import ultralytics
import pycocowriter.coco2yolo
import os
import urllib.request
import json
import pyworms
import pycocowriter.cocomerge
import shutil
from hierarchical_loss import worms_utils, viz_utils

In [2]:
# When True, the cells below re-query WoRMS, rebuild the hierarchy files and rerun the
# COCO -> YOLO conversion, overwriting the cached JSON files.  When False, the cached
# JSON files are loaded instead and the conversion is skipped.
REFRESH = True

In [3]:
# Remote location of the gfisher annotations JSON; it is fetched once into GFISHER_ANNOTATIONS.
GFISHER_DATA_URL = "https://storage.googleapis.com/nmfs_odp_hq/nodd_tools/datasets/gfisher/annotations_worms.json"

In [4]:
# Ultralytics default settings.  See https://docs.ultralytics.com/quickstart/#ultralytics-settings
# The root directory where datasets are expected to be stored, and where ultralytics will download curated datasets
DATASETS = ultralytics.settings['datasets_dir']

# Everything this notebook reads or writes for the dataset lives under <datasets_dir>/gfisher.
DATA = os.path.join(DATASETS, 'gfisher')
# Raw annotations exactly as downloaded from GFISHER_DATA_URL.
RAW_DATA = os.path.join(DATA, 'raw_data')
# Cached WoRMS lookups and the hierarchy derived from them.
HIERARCHY_DATA = os.path.join(DATA, 'hierarchy_data')
GFISHER_ANNOTATIONS = os.path.join(RAW_DATA, "annotations.json")
# NOTE(review): not referenced by any other cell visible in this notebook.
IMAGES_PATH = os.path.join(DATA, 'images')
# category name -> first WoRMS record returned for that name
WORMS_RECORDS = os.path.join(HIERARCHY_DATA, 'worms_records.json')
# category name -> WoRMS tree for that record's AphiaID
WORMS_TREES = os.path.join(HIERARCHY_DATA, 'worms_trees.json')
# scientific name -> AphiaID, for every node seen in the trees
NAME_APHIAID_MAP = os.path.join(HIERARCHY_DATA, 'worms_name_id_map.json')
# child scientific name -> parent scientific name
HIERARCHY = os.path.join(HIERARCHY_DATA, 'hierarchy.json')
# in a separate folder because of ultralytics being picky about folder contents
GFISHER_HIERARCHICAL_ANNOTATIONS = os.path.join(DATA, "hierarchical_annotations.json")

# Find model configurations.  These are in the repository https://github.com/csbrown-noaa/hierarchical_yolo.
# If you aren't running this from the cloned repo, you will need to go acquire these and change GFISHER_MODELS to reflect the location of the model config files.
# NOTE: this is a relative path, so it is resolved against the notebook's current working directory.
GFISHER_MODELS = '../gfisher_data_hierarchical_model/models'
YOLO_MODEL_YAML = os.path.join(GFISHER_MODELS, 'hierarchical_gfisher_yolov8.yaml')
# Destination that the dataset YAML is copied to at the end of this notebook.
YOLO_DATASET_YAML = os.path.join(GFISHER_MODELS, 'hierarchical_gfisher.yaml')

# Get the GFISHER data

In [5]:
# Create the dataset directory tree.  makedirs(exist_ok=True) also creates any missing
# parent (e.g. a datasets_dir that does not exist yet) and is a no-op when the directory
# is already there, so the cell is safe to re-run.
for directory in (DATA, RAW_DATA, HIERARCHY_DATA):
    os.makedirs(directory, exist_ok=True)

# Download the annotations only if they are not already present.  The file is fetched
# under a temporary name and renamed on success, so an interrupted download is never
# mistaken for a complete annotations file on the next run.
if not os.path.exists(GFISHER_ANNOTATIONS):
    partial_download = GFISHER_ANNOTATIONS + '.part'
    try:
        urllib.request.urlretrieve(GFISHER_DATA_URL, partial_download)
        os.replace(partial_download, GFISHER_ANNOTATIONS)
    finally:
        # Only still present if the download or the rename failed.
        if os.path.exists(partial_download):
            os.remove(partial_download)

# Expand the categories to support a hierarchical structure

In [6]:
# Parse the downloaded annotations file into memory.
with open(GFISHER_ANNOTATIONS) as annotations_file:
    gfisher_coco = json.load(annotations_file)

In [7]:
# Baseline sizes before the hierarchy is added; compare with the counts reported
# further down to check that new categories were added.
[len(gfisher_coco[section]) for section in ('images', 'annotations', 'categories')]

[231631, 640070, 123]

In [8]:
# Names of the categories present in the original annotations, in file order.
gfisher_categories = [category_entry['name'] for category_entry in gfisher_coco['categories']]

## Get the higher-level categories

In [10]:
# Build (or reload) the mapping: category name -> first WoRMS record returned for that name.
if REFRESH:
    worms_records = {}
    unmatched_categories = []
    for category in gfisher_categories:
        matches = pyworms.aphiaRecordsByName(category)
        if not matches:
            # No usable result for this name.  Collect it rather than failing with an
            # opaque TypeError/IndexError on `[0]`, so every bad name is reported at once.
            unmatched_categories.append(category)
            continue
        worms_records[category] = matches[0]
    if unmatched_categories:
        raise LookupError(
            "No WoRMS record found for categories: " + ", ".join(map(repr, unmatched_categories))
        )
    # Only cache a complete result.
    with open(WORMS_RECORDS, 'w') as f:
        json.dump(worms_records, f)
else:
    with open(WORMS_RECORDS, 'r') as f:
        worms_records = json.load(f)

In [11]:
# Build (or reload) the mapping: category name -> WoRMS tree for that category's AphiaID.
if not REFRESH:
    with open(WORMS_TREES) as cache_file:
        worms_trees = json.load(cache_file)
else:
    worms_trees = {}
    for category_name, worms_record in worms_records.items():
        worms_trees[category_name] = worms_utils.get_WORMS_tree(worms_record['AphiaID'])
    with open(WORMS_TREES, 'w') as cache_file:
        json.dump(worms_trees, cache_file)

In [17]:
# Flatten the per-category trees into two lookups (or reload them from cache):
#   name_id_map:      scientific name -> AphiaID
#   childparent_tree: child scientific name -> parent scientific name
if not REFRESH:
    with open(NAME_APHIAID_MAP) as cache_file:
        name_id_map = json.load(cache_file)
    with open(HIERARCHY) as cache_file:
        childparent_tree = json.load(cache_file)
else:
    name_id_map = {}
    childparent_tree = {}
    for root in worms_trees.values():
        node, parent_name = root, None
        # Follow the 'child' links from the root until a falsy child ends the chain.
        while node:
            current_name = node['scientificname']
            name_id_map[current_name] = node['AphiaID']
            if parent_name:
                childparent_tree[current_name] = parent_name
            parent_name = current_name
            node = node['child']
    with open(NAME_APHIAID_MAP, 'w') as cache_file:
        json.dump(name_id_map, cache_file)
    with open(HIERARCHY, 'w') as cache_file:
        json.dump(childparent_tree, cache_file)

In [18]:
# Show the child -> parent mapping.  NOTE(review): this displays the entire dict; the tree
# rendering in the next cell conveys the same information more readably.
childparent_tree

{'Animalia': 'Biota',
 'Chordata': 'Animalia',
 'Vertebrata': 'Chordata',
 'Gnathostomata': 'Vertebrata',
 'Osteichthyes': 'Gnathostomata',
 'Actinopterygii': 'Osteichthyes',
 'Actinopteri': 'Actinopterygii',
 'Teleostei': 'Actinopteri',
 'Perciformes': 'Teleostei',
 'Percoidei': 'Perciformes',
 'Epinephelidae': 'Percoidei',
 'Mycteroperca': 'Epinephelidae',
 'Mycteroperca microlepis': 'Mycteroperca',
 'Serranidae': 'Percoidei',
 'Serranus': 'Serranidae',
 'Serranus phoebe': 'Serranus',
 'Carangiformes': 'Teleostei',
 'Carangidae': 'Carangiformes',
 'Seriola': 'Carangidae',
 'Seriola fasciata': 'Seriola',
 'Eupercaria incertae sedis': 'Teleostei',
 'Lutjanidae': 'Eupercaria incertae sedis',
 'Rhomboplites': 'Lutjanidae',
 'Rhomboplites aurorubens': 'Rhomboplites',
 'Lutjanus': 'Lutjanidae',
 'Lutjanus synagris': 'Lutjanus',
 'Callionymiformes': 'Teleostei',
 'Callionymidae': 'Callionymiformes',
 'Epinephelus': 'Epinephelidae',
 'Epinephelus morio': 'Epinephelus',
 'Labridae': 'Eupercar

In [19]:
# Render the child -> parent mapping as an indented tree.
viz_utils.viz_tree(childparent_tree).show()

Biota : 
└── Animalia : 
    ├── Arthropoda : 
    │   └── Crustacea : 
    │       └── Multicrustacea : 
    │           └── Malacostraca : 
    │               └── Eumalacostraca : 
    │                   └── Eucarida : 
    │                       └── Decapoda : 
    │                           └── Pleocyemata : 
    │                               ├── Anomura : 
    │                               └── Brachyura : 
    │                                   └── Eubrachyura : 
    │                                       └── Heterotremata : 
    │                                           └── Majoidea : 
    │                                               └── Epialtidae : 
    │                                                   └── Pisinae : 
    │                                                       └── Stenocionops : 
    │                                                           └── Stenocionops spinimanus : 
    └── Chordata : 
        └── Vertebrata : 
            └── Gnathostoma

## Create the appropriate data structure

In [20]:
# One category entry per scientific name in the hierarchy, with ids assigned by position.
coco_categories = [
    {'name': scientific_name, 'id': position}
    for position, scientific_name in enumerate(name_id_map)
]

# A COCO-shaped dict that carries only the categories; every other section is empty.
new_coco = {section: [] for section in ('images', 'licenses', 'annotations')}
new_coco['categories'] = coco_categories

In [21]:
# Merge the hierarchy categories into the original annotations, then run the collapse and
# reindex passes in that order, and write the result to disk.
cocomerge = pycocowriter.cocomerge
gfisher_expanded_category_coco = cocomerge.coco_merge(gfisher_coco, new_coco)
for category_pass in (cocomerge.coco_collapse_categories, cocomerge.coco_reindex_categories):
    gfisher_expanded_category_coco = category_pass(gfisher_expanded_category_coco)

with open(GFISHER_HIERARCHICAL_ANNOTATIONS, 'w') as annotations_out:
    json.dump(gfisher_expanded_category_coco, annotations_out)

In [22]:
# Sanity check: list the top-level sections present in the merged annotations.
gfisher_expanded_category_coco.keys()

dict_keys(['annotations', 'images', 'info', 'categories', 'licenses'])

In [23]:
# Sizes after the merge; compare with the baseline counts above to check that new
# categories were added.
[len(gfisher_expanded_category_coco[section]) for section in ('images', 'annotations', 'categories')]

[231631, 640070, 260]

In [24]:
# Show the merged categories in alphabetical order by name.
sorted(
    gfisher_expanded_category_coco['categories'],
    key=lambda category_entry: category_entry['name'],
)

[{'name': 'Acanthuriformes', 'id': 1},
 {'name': 'Actinopteri', 'id': 2},
 {'name': 'Actinopterygii', 'id': 3},
 {'name': 'Albula', 'id': 4},
 {'name': 'Albula vulpes', 'id': 5},
 {'name': 'Albulidae', 'id': 6},
 {'name': 'Albuliformes', 'id': 7},
 {'name': 'Albulinae', 'id': 8},
 {'name': 'Alectis', 'id': 9},
 {'name': 'Alectis ciliaris', 'id': 10},
 {'name': 'Anguilliformes', 'id': 11},
 {'name': 'Animalia', 'id': 12},
 {'name': 'Anisotremus', 'id': 13},
 {'name': 'Anisotremus virginicus', 'id': 14},
 {'name': 'Anomura', 'id': 15},
 {'name': 'Anthiadidae', 'id': 16},
 {'name': 'Archosargus', 'id': 17},
 {'name': 'Archosargus probatocephalus', 'id': 18},
 {'name': 'Arthropoda', 'id': 19},
 {'name': 'Aulopiformes', 'id': 20},
 {'name': 'Aulopoidei', 'id': 21},
 {'name': 'Balistes', 'id': 22},
 {'name': 'Balistes capriscus', 'id': 23},
 {'name': 'Balistes vetula', 'id': 24},
 {'name': 'Balistidae', 'id': 25},
 {'name': 'Balistoidei', 'id': 26},
 {'name': 'Biota', 'id': 27},
 {'name': 'B

In [25]:
# Every category name present in the merged annotations.
all_categories = {category_entry['name'] for category_entry in gfisher_expanded_category_coco['categories']}

In [26]:
# Every name that appears in the hierarchy, whether as a child or as a parent.
childparent_tree_categories = set(childparent_tree).union(childparent_tree.values())

In [27]:
# Categories in the annotations that the hierarchy does not cover.
all_categories.difference(childparent_tree_categories)

set()

# Convert to YOLO

In [29]:
# Run the COCO -> YOLO conversion, passing DATA for both arguments.  Skipped when REFRESH is False.
# NOTE(review): the saved output of this cell contains one "downloaded i/N images" line per
# image, which floods the notebook and trips the IOPub data rate limit.  Consider running the
# cell under %%capture or otherwise silencing stdout — TODO confirm which library prints these.
if REFRESH:
    pycocowriter.coco2yolo.coco2yolo(DATA, DATA)

[KAnnotations /home/noaa_brown/datasets/gfisher/hierarchical_annotations.json: 100% ━━━━━━━━━━━━ 231631/231631 5.7Kit/s 41.0s<0.0s
COCO data converted successfully.
Results saved to /home/noaa_brown/gfisher_data_hierarchical_model/notebooks/coco_converted
loading annotations into memory...
Done (t=2.23s)
creating index...
index created!
loading annotations into memory...
Done (t=2.18s)
creating index...
index created!
downloaded 0/231631 images (t=0.0s)
downloaded 1/231631 images (t=0.0s)
downloaded 2/231631 images (t=0.0s)
downloaded 3/231631 images (t=0.0s)
downloaded 4/231631 images (t=0.0s)
downloaded 5/231631 images (t=0.0s)
downloaded 6/231631 images (t=0.0s)
downloaded 7/231631 images (t=0.0s)
downloaded 8/231631 images (t=0.0s)
downloaded 9/231631 images (t=0.0s)
downloaded 10/231631 images (t=0.0s)
downloaded 11/231631 images (t=0.0s)
downloaded 12/231631 images (t=0.0s)
downloaded 13/231631 images (t=0.0s)
downloaded 14/231631 images (t=0.0s)
downloaded 15/231631 images (t=0

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



downloaded 154995/231631 images (t=0.0s)
downloaded 154996/231631 images (t=0.0s)
downloaded 154997/231631 images (t=0.0s)
downloaded 154998/231631 images (t=0.0s)
downloaded 154999/231631 images (t=0.0s)
downloaded 155000/231631 images (t=0.0s)
downloaded 155001/231631 images (t=0.0s)
downloaded 155002/231631 images (t=0.0s)
downloaded 155003/231631 images (t=0.0s)
downloaded 155004/231631 images (t=0.0s)
downloaded 155005/231631 images (t=0.0s)
downloaded 155006/231631 images (t=0.0s)
downloaded 155007/231631 images (t=0.0s)
downloaded 155008/231631 images (t=0.0s)
downloaded 155009/231631 images (t=0.0s)
downloaded 155010/231631 images (t=0.0s)
downloaded 155011/231631 images (t=0.0s)
downloaded 155012/231631 images (t=0.0s)
downloaded 155013/231631 images (t=0.0s)
downloaded 155014/231631 images (t=0.0s)
downloaded 155015/231631 images (t=0.0s)
downloaded 155016/231631 images (t=0.0s)
downloaded 155017/231631 images (t=0.0s)
downloaded 155018/231631 images (t=0.0s)
downloaded 15501

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [30]:
# Copy DATA/train.yaml to the dataset YAML path kept alongside the model configs.
# (Presumably train.yaml is produced by the conversion step above — TODO confirm.)
dataset_yaml_source = os.path.join(DATA, 'train.yaml')
shutil.copyfile(dataset_yaml_source, YOLO_DATASET_YAML)

'../gfisher_data_hierarchical_model/models/hierarchical_gfisher.yaml'