In [1]:
import json
import os
import shutil
import subprocess
import tempfile
import uuid

import pycocowriter.coco
import pycocowriter.cocomerge
from pynoddgcs.connect import GCS

In [2]:
# Name of the storage bucket; used to build the gs:// listing URL and passed
# to the GCS client for every download and upload.
SOURCE_BUCKET = "nmfs_odp_hq"
# Directory (object prefix) inside the bucket whose sub-directories are listed,
# and under which the merged annotations.json is uploaded.
SOURCE_DIR = "nodd_tools/datasets/gfisher"

In [3]:
class TempPath:
    """Common base for the path-based context managers below.

    It only stores the filesystem path; subclasses decide what happens to
    that path when the ``with`` block is entered and exited.
    """

    def __init__(self, path):
        # Kept verbatim; subclasses read it in __enter__/__exit__.
        self.path = path

class TempDir(TempPath):
    """Context manager that creates a directory on entry and deletes it on exit.

    The directory is created with ``os.mkdir``, so the parent must already
    exist and the path itself must not (``FileExistsError`` otherwise).
    On exit the directory and everything inside it is removed.
    """

    def __enter__(self):
        os.mkdir(self.path)
        # Hand the path (not the manager) to the ``as`` target.
        return self.path

    def __exit__(self, type, value, traceback):
        # Needs ``shutil`` from the notebook's import cell; without that
        # import this line raised NameError and left the directory behind.
        # Returns None, so an exception from the with-body still propagates.
        shutil.rmtree(self.path)

class TempFile(TempPath):
    """Context manager that yields a file path and deletes that file on exit.

    Entering does not create anything; the with-body may or may not write
    the file. Cleanup tolerates the file not being there.
    """

    def __enter__(self):
        return self.path

    def __exit__(self, type, value, traceback):
        try:
            os.unlink(self.path)
        except FileNotFoundError:
            # The file was never written (or is already gone): nothing to clean up.
            pass

In [4]:
def get_relative_bucket_path(full_bucket_path, bucket):
    """Return the part of a ``gs://<bucket>/...`` URL that follows the bucket.

    The leading ``gs://<bucket>/`` is dropped purely by length: the function
    does not verify that ``full_bucket_path`` actually starts with it.
    """
    prefix_length = len(f"gs://{bucket}/")
    return full_bucket_path[prefix_length:]

def get_filename_from_bucket_path(full_bucket_path):
    """Return everything after the last ``/`` of a bucket path.

    A path without any ``/`` is returned unchanged; a path ending in ``/``
    yields an empty string.
    """
    _, _, filename = full_bucket_path.rpartition('/')
    return filename

In [5]:
# List the contents of the dataset directory with the gsutil CLI and keep its
# stdout as text. check=True makes a failing `gsutil ls` (missing CLI
# credentials, wrong path, ...) raise CalledProcessError right here instead of
# silently producing an empty listing that breaks later cells.
file_output = subprocess.run(
    ["gsutil", "ls", "gs://" + SOURCE_BUCKET + "/" + SOURCE_DIR],
    capture_output=True,
    text=True,
    check=True,
).stdout

In [6]:
# One entry per line of the listing, with surrounding whitespace removed.
# The trailing newline of the output leaves an empty string at the end.
files = [line.strip() for line in file_output.split("\n")]
files[0]

'gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-047b/'

In [7]:
# Listing entries that end in '/' are treated as sub-directories; each one is
# assumed to contain an annotations.json (TODO confirm for every directory).
annotation_files = [
    directory + "annotations.json"
    for directory in files
    if directory.endswith('/')
]
annotation_files[0], annotation_files[-1]

('gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-047b/annotations.json',
 'gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/SC4-camera5_03-21-21_23-33-49.000/annotations.json')

In [8]:
# Dataset-level metadata handed to the merge step as its `info` argument.
coco_info = pycocowriter.coco.COCOInfo(
    description="GFISHER human annotations using VIAME, 2021-2024",
    contributor="CScott Brown (scott.brown@noaa.gov)",
    year="2025",
    version="0.1",
)

In [9]:
# Client used below to download the per-directory annotation files from
# SOURCE_BUCKET and to upload the merged result back to it.
client = GCS()

In [10]:
# Download every per-directory annotations.json, merge them into one COCO
# document, and upload the result to the top of SOURCE_DIR.
with tempfile.TemporaryDirectory() as input_dir:
    # Keep the merged output inside the scratch directory as well, so nothing
    # is written to (or deleted from) the current working directory. The
    # downloads get hex-only names, so this name cannot collide with them.
    coco_filename = os.path.join(input_dir, "merged_annotations.json")

    # download all of the files
    # every remote file is called annotations.json, so give each one a unique
    # random name locally
    destination_files = [
        os.path.join(input_dir, uuid.uuid4().hex + ".json")
        for _ in annotation_files
    ]
    for annotation_file, destination_file in zip(annotation_files, destination_files):
        print(annotation_file + " -> " + destination_file)
        client.download(SOURCE_BUCKET, get_relative_bucket_path(annotation_file, SOURCE_BUCKET), destination_file)

    # load them
    cocos = []
    for destination_file in destination_files:
        with open(destination_file, 'r') as f:
            cocos.append(json.load(f))
    # merge them
    merged_coco = pycocowriter.cocomerge.coco_merge(*cocos, info = coco_info)
    # dump them
    with open(coco_filename, 'w') as f:
        json.dump(merged_coco, f)
    # upload them; the file must still exist here, so this stays inside the with-block
    client.upload(SOURCE_BUCKET, coco_filename, SOURCE_DIR + "/annotations.json")

gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-047b/annotations.json -> /tmp/tmppti4pn9r/4a48c4c7bae442e6b0448e77e98ee4df.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-055b/annotations.json -> /tmp/tmppti4pn9r/00d80e0d5f814a2fb70f49541282d9c4.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-070b/annotations.json -> /tmp/tmppti4pn9r/aab6e8d6dd5e4198a84a13005042f193.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_084d/annotations.json -> /tmp/tmppti4pn9r/5a4b5fc3f2a449029f8e1e74a78fd4c2.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_086e/annotations.json -> /tmp/tmppti4pn9r/a7c10da017f54906b2ee08a2de9aba69.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_112a/annotations.json -> /tmp/tmppti4pn9r/15e029512b33492798fbf5e9102be551.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_147a/annotations.json -> /tmp/tmppti4pn9r/81722926da684517a5ea54c556d44809.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_152b/ann