In [36]:
import json
import os
import shutil
import subprocess
import tempfile
import uuid

import pycocowriter.coco
import pycocowriter.cocomerge
from pynoddgcs.connect import GCS

In [3]:
# Name of the Google Cloud Storage bucket that holds the dataset (no "gs://" prefix).
SOURCE_BUCKET = "nmfs_odp_hq"
# Path inside the bucket whose contents are listed with `gsutil ls`
# (no leading or trailing slash).
SOURCE_DIR = "nodd_tools/datasets/gfisher"

In [15]:
class TempPath:
    """Base class for context managers that are bound to one filesystem path.

    Subclasses provide ``__enter__``/``__exit__``; this class only stores the
    path they operate on.
    """

    def __init__(self, path):
        # Nothing is created or checked here; the path is simply remembered.
        self.path = path

class TempDir(TempPath):
    """Context manager that creates a directory at ``self.path`` and deletes it on exit.

    ``__enter__`` creates the directory with ``os.mkdir`` (so the parent must
    exist and the directory itself must not) and returns the path.
    ``__exit__`` removes the directory and everything in it with
    ``shutil.rmtree``. It returns ``None``, so an exception raised inside the
    ``with`` body still propagates after cleanup.
    """

    def __enter__(self):
        os.mkdir(self.path)
        return self.path

    def __exit__(self, type, value, traceback):
        # Relies on `shutil` being imported at module level; without that
        # import this line raises NameError and the directory is leaked.
        shutil.rmtree(self.path)

class TempFile(TempPath):
    """Context manager that deletes the file at ``self.path`` on exit.

    Entering does not create the file; it only hands back the path so the
    ``with`` body can write to it. On exit the file is removed, and a file
    that was never created is not treated as an error.
    """

    def __enter__(self):
        return self.path

    def __exit__(self, type, value, traceback):
        target = self.path
        try:
            os.remove(target)
        except FileNotFoundError:
            # The body never wrote the file, so there is nothing to clean up.
            pass

In [48]:
def get_relative_bucket_path(full_bucket_path, bucket):
    """Return the part of a ``gs://<bucket>/...`` URL that follows the bucket name.

    The prefix is removed purely by length: as many leading characters as
    ``"gs://" + bucket + "/"`` contains are dropped, without checking that
    ``full_bucket_path`` actually starts with that prefix.
    """
    prefix = "".join(("gs://", bucket, "/"))
    prefix_length = len(prefix)
    return full_bucket_path[prefix_length:]

def get_filename_from_bucket_path(full_bucket_path):
    """Return whatever follows the last ``/`` in ``full_bucket_path``.

    A path without any ``/`` is returned unchanged; a path ending in ``/``
    yields the empty string.
    """
    _, _, filename = full_bucket_path.rpartition('/')
    return filename

In [56]:
# List the entries directly under the source path in the bucket and keep the
# raw text that gsutil prints.
# check=True turns a non-zero gsutil exit (bad credentials, wrong path, ...)
# into a CalledProcessError here, instead of leaving an empty listing that
# only fails later in an unrelated cell.
file_output = subprocess.run(
    ["gsutil", "ls", "gs://" + SOURCE_BUCKET + "/" + SOURCE_DIR],
    capture_output=True,
    text=True,
    check=True,
).stdout

In [8]:
# One entry per line of the gsutil listing, with surrounding whitespace removed.
# Blank lines (e.g. after the final newline) are kept as empty strings.
files = [line.strip() for line in file_output.split("\n")]
files[0]

In [58]:
# Only entries ending in "/" are kept; each one is extended with the fixed
# file name "annotations.json".
annotation_files = [
    entry + "annotations.json"
    for entry in files
    if entry.endswith('/')
]
annotation_files[0], annotation_files[-1]

('gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-047b/annotations.json',
 'gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/SC4-camera5_03-21-21_23-33-49.000/annotations.json')

In [59]:
# Dataset-level metadata; handed to coco_merge as `info` when the merged
# annotation file is built.
# NOTE(review): `year` is passed as a string here -- confirm that COCOInfo
# expects a str rather than an int.
coco_info = pycocowriter.coco.COCOInfo(
    description = "GFISHER human annotations using VIAME, 2021-2024",
    contributor = "CScott Brown (scott.brown@noaa.gov)",
    year = "2025",
    version = "0.1"
)

In [60]:
# Google Cloud Storage helper used for the download and upload calls in this notebook.
client = GCS()

In [61]:
# Interactive look at the GCS helper's API; exploratory only, the merge does not depend on it.
help(client)

Help on GCS in module pynoddgcs.connect object:

class GCS(builtins.object)
 |  Helper class for uploading and download files from Google Cloud Storage
 |
 |  Methods defined here:
 |
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |
 |  authenticate(self)
 |      Get user's default credentials on this machine.
 |
 |      Returns nothing, sets the values into object attributes
 |
 |  check_auth(self)
 |      Authenticates the user if not already authenticated.
 |      See the `authenticate` method.
 |
 |  download(self, bucket, source, destination=None)
 |      Downloads a file from GCS, handling both public and private buckets.
 |
 |      Parameters
 |      ----------
 |      bucket: str
 |          the bucket name
 |      source: str
 |          the file path within the bucket
 |      destination: str | None
 |          the file path to which we would like to download the file.
 |          default to source path within current working direct

In [65]:
# Download every per-directory annotations.json, merge them into a single COCO
# document and upload the result to SOURCE_DIR/annotations.json in the bucket.
#
# All local files, including the merged output, live inside one throwaway
# directory. Writing the merged file to a relative "annotations.json" would
# overwrite, and on exit delete, any file of that name already present in the
# current working directory.
with tempfile.TemporaryDirectory() as input_dir:
    # The ".json" downloads below use random hex names, so this fixed name
    # cannot collide with them.
    coco_filename = os.path.join(input_dir, "merged_annotations.json")

    # download all of the files
    # every remote file is called annotations.json, so give each one a unique
    # random name locally
    destination_files = [
        os.path.join(input_dir, uuid.uuid4().hex + ".json")
        for _ in annotation_files
    ]
    for annotation_file, destination_file in zip(annotation_files, destination_files):
        print(annotation_file + " -> " + destination_file)
        client.download(
            SOURCE_BUCKET,
            get_relative_bucket_path(annotation_file, SOURCE_BUCKET),
            destination_file,
        )

    # load them
    cocos = []
    for destination_file in destination_files:
        with open(destination_file, 'r') as f:
            cocos.append(json.load(f))

    # merge them
    merged_coco = pycocowriter.cocomerge.coco_merge(*cocos, info = coco_info)

    # dump them
    with open(coco_filename, 'w') as f:
        json.dump(merged_coco, f)

    # upload them (must happen before the temporary directory is removed)
    client.upload(SOURCE_BUCKET, coco_filename, SOURCE_DIR + "/annotations.json")

gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-047b/annotations.json -> /tmp/tmpn7oykvu7/4335fea09d1b40a98c1fc0c91064e90e.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-055b/annotations.json -> /tmp/tmpn7oykvu7/e13fbe29a5174b5bb1e127a9182ea4a1.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD-070b/annotations.json -> /tmp/tmpn7oykvu7/ccebc12de7814024bdf54df8386e31f3.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_084d/annotations.json -> /tmp/tmpn7oykvu7/350308ea709e4c0bae3621c45f050d93.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_086e/annotations.json -> /tmp/tmpn7oykvu7/e6820c942e9f4dbb843f9b6a43085b08.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_112a/annotations.json -> /tmp/tmpn7oykvu7/2c455c255fca4260982f1d8dc5677e9d.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_147a/annotations.json -> /tmp/tmpn7oykvu7/2f03a984af2c402bb3926008d8d8c68e.json
gs://nmfs_odp_hq/nodd_tools/datasets/gfisher/2021_NCD_152b/ann