In [1]:
from viame2coco.viame2coco import viame2coco
import pycocowriter
import pynoddgcs.connect
from google.cloud import storage
from pynoddgcs.connect import GCS
from pynoddgcs.publish import NODDCOCODataset
import subprocess
import functools
import shutil
import tempfile
import os
import contextlib
import csv
import logging

#logging.basicConfig(level=logging.INFO)

In [2]:
class TempPath():
    """Base class that simply remembers the path a context manager works on."""

    def __init__(self, path):
        self.path = path


class TempDir(TempPath):
    """Context manager: create ``path`` as a directory on entry, delete it on exit."""

    def __enter__(self):
        target = self.path
        os.mkdir(target)
        return target

    def __exit__(self, type, value, traceback):
        # Remove the directory and everything that was written into it.
        shutil.rmtree(self.path)


class TempFile(TempPath):
    """Context manager: hand out ``path`` on entry, delete the file on exit.

    The file itself is never created here; the body of the ``with`` block is
    expected to write it (or not).
    """

    def __enter__(self):
        return self.path

    def __exit__(self, type, value, traceback):
        # A file that was never created is not an error.
        with contextlib.suppress(FileNotFoundError):
            os.remove(self.path)

In [3]:
# Bucket and folder that are listed with `gsutil ls` below and from which the
# annotation CSVs and their videos are downloaded.
SOURCE_BUCKET = "nmfs_odp_sefsc"
SOURCE_DIR = "PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training"
# Bucket and base folder passed to NODDCOCODataset for publishing; each video
# gets its own sub-folder named DESTINATION_DIR/<video root name>.
DESTINATION_BUCKET = "nmfs_odp_hq"
DESTINATION_DIR = "nodd_tools/datasets/gfisher"

In [4]:
def get_relative_bucket_path(full_bucket_path, bucket):
    """Return the object path of ``full_bucket_path`` relative to ``bucket``.

    Parameters:
        full_bucket_path: a ``gs://<bucket>/<object path>`` URL.
        bucket: the bucket name, without the ``gs://`` scheme.

    Returns:
        The part of ``full_bucket_path`` after ``gs://<bucket>/``. If the path
        does not start with that prefix it is returned unchanged.
    """
    # NOTE: str.strip() must not be used here. It treats its argument as a
    # *set of characters* to remove from both ends, so it also eats leading and
    # trailing characters of the object path that happen to occur in the
    # prefix (e.g. 'docs/specs' would collapse to '').
    prefix = 'gs://' + bucket + '/'
    if full_bucket_path.startswith(prefix):
        return full_bucket_path[len(prefix):]
    return full_bucket_path

def get_filename_from_bucket_path(full_bucket_path):
    """Return the last '/'-separated component of ``full_bucket_path``."""
    _, _, leaf = full_bucket_path.rpartition('/')
    return leaf

In [5]:
# List the contents of the source folder with gsutil and keep its stdout.
# The URL is passed as a single argv element, so the spaces in SOURCE_DIR
# need no quoting.
file_output = subprocess.run(
    ["gsutil", "ls", "gs://" + SOURCE_BUCKET + "/" + SOURCE_DIR],
    capture_output=True,
    text=True,
    # Fail loudly (CalledProcessError) when gsutil exits non-zero instead of
    # silently continuing with an empty listing.
    check=True,
).stdout

In [6]:
# One entry per line of the gsutil listing, with surrounding whitespace removed.
files = [line.strip() for line in file_output.split("\n")]

In [7]:
# Annotation files are the listing entries ending in '.csv'.
# Materialised as a list (not a one-shot `filter` iterator) so that cells
# consuming it can be re-run without silently seeing an exhausted iterator.
csv_files = [path for path in files if path.endswith('.csv')]

In [8]:
# Base names (no folder, no '.csv' extension) of all annotation files.
# removesuffix() drops exactly the extension; str.strip('.csv') would instead
# strip any of the characters '.', 'c', 's', 'v' from BOTH ends of the name.
csv_files_leaves = {path.split('/')[-1].removesuffix('.csv') for path in csv_files}

In [9]:
# Map each annotation base name to [csv path, other (video) path].
# A slot stays None when no matching file is found in the listing.
csv_video_pairs = {}
for file in files:
    # Base name = last path component with its final '.extension' removed.
    root_name = file.rpartition('/')[2].rpartition('.')[0]
    if root_name not in csv_files_leaves:
        continue
    slot = 0 if file.endswith('.csv') else 1
    csv_video_pairs.setdefault(root_name, [None, None])[slot] = file

In [10]:
# Show the discovered {root name: [csv path, video path]} pairs.
csv_video_pairs

{'2021_NCD-047b': ['gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training/2021_NCD-047b.csv',
  'gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training/2021_NCD-047b.mp4'],
 '2021_NCD-055b': ['gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training/2021_NCD-055b.csv',
  'gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training/2021_NCD-055b.mp4'],
 '2021_NCD-070b': ['gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training/2021_NCD-070b.csv',
  'gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training/2021_NCD-070b.mp4'],
 '2021_NCD_084d': ['gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training/2021_NCD_084d.csv',
  'gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Training/2021_NCD_084d.mp4'],
 '2021_NCD_086e': ['gs://nmfs_odp_sefsc/PEMD/Gulf of Mexico Reef Fish Annotated Library/For_Trai

In [11]:
# GCS helper from pynoddgcs.connect; used below via client.download(...) to
# fetch the source CSV and video files.
client = GCS()

In [12]:
# Load the names of videos that were already processed in earlier runs
# (one name per row, first column). A missing file means nothing is done yet.
try:
    # newline='' is what the csv module expects for files it reads or writes.
    with open('completed_videos.csv', 'r', newline='') as f:
        # Skip blank rows: csv.reader yields [] for them and row[0] would
        # raise IndexError.
        completed = [row[0] for row in csv.reader(f) if row]
except FileNotFoundError:
    completed = []

## TODO: Why is this one file full of empty rows? Until that is understood, it is marked as completed below so that it is skipped.

In [13]:
# Force-mark this entry as completed (only once) so it is excluded from the
# remaining work.
for skipped_video in ('SC2-camera3_03-22-21_16-59-34.000NOFISH',):
    if skipped_video not in completed:
        completed.append(skipped_video)

In [14]:
# Names that fail during the conversion loop are collected here.
failed = []
# Snapshot of the completed names for fast membership tests.
set_completed = {*completed}
completed

['SC2-camera3_03-22-21_16-59-34.000NOFISH',
 '2021_NCD-047b',
 '2021_NCD-055b',
 '2021_NCD-070b',
 '2021_NCD_084d',
 '2021_NCD_086e',
 '2021_NCD_112a',
 '2021_NCD_147a',
 '2021_NCD_152b',
 '2021_NCD_153a',
 '2021_NCN-062a',
 '2021_NCN_042a',
 '2021_NCN_042a2',
 '2021_NCN_048b',
 '2021_NCN_070b',
 '2021_NCO-002b',
 '2021_NCO-004e2',
 '2021_NCO-025d',
 '2021_NCO-073d',
 '2021_NCO-073d2',
 '2021_NCO-091a',
 '2021_NCO-100a',
 '2021_NCO_004e',
 '761901008_cam1_1',
 '761901008_cam1_2',
 '761901008_cam1_3',
 '761901008_cam1_4',
 '761901008_cam1_5',
 '761901019_cam4',
 '761901033_cam1_1',
 '761901033_cam1_2',
 '761901033_cam3',
 '761901033_cam5_1',
 '761901033_cam5_2',
 '761901066_cam3',
 '761901183_cam1_1',
 '761901183_cam1_2',
 '761901183_cam1_3',
 '761901183_cam1_4',
 '761901183_cam1_5',
 '761901183_cam1_6',
 '761901183_cam4_1',
 '761901183_cam4_2',
 '761901183_cam4_3',
 '761901183_cam5_1',
 '761901183_cam5_2',
 '761901183_cam5_3',
 '761901183_cam5_4',
 '761901231_Cam2',
 '761901231_Cam3',


In [15]:
# Root names that have a CSV in the listing but are not yet marked completed.
to_do = set(csv_video_pairs).difference(completed)
to_do

set()

In [16]:
%%capture data_upload_log
# For every pair that is not yet completed: download the CSV and the video,
# convert them with viame2coco, write the result to annotations.json and hand
# it to NODDCOCODataset(...).upload() for DESTINATION_DIR/<video_root_name>.
DESCRIPTION = 'VIAME-sourced annotations for {}'
MIN_CONFIDENCE = 0 # these data are all human-reviewed!

# Refresh the snapshot from the live `completed` list. Filtering on a snapshot
# taken in an earlier cell would re-process videos finished by a previous
# (possibly interrupted) run of this cell.
# Tip: to try the pipeline on a few videos only, restrict csv_video_pairs to
# those keys before building the work list.
set_completed = set(completed)
incomplete_csv_video_pairs = {k: v for k, v in csv_video_pairs.items() if k not in set_completed}

for video_root_name, (csv_bucketfile, video_bucketfile) in incomplete_csv_video_pairs.items():

    try:
        # Slots in csv_video_pairs start out as None. Report a missing
        # CSV/video explicitly instead of failing later with an opaque
        # AttributeError on None.
        if csv_bucketfile is None or video_bucketfile is None:
            raise FileNotFoundError(
                'incomplete csv/video pair for {}: csv={}, video={}'.format(
                    video_root_name, csv_bucketfile, video_bucketfile
                )
            )
        # All scratch space is temporary. While inside the block the working
        # directory is output_dir, so the relative names 'annotations.json'
        # and '.' below both resolve inside output_dir.
        with (
                tempfile.TemporaryDirectory() as input_dir,
                tempfile.TemporaryDirectory() as output_root,
                TempDir(os.path.join(output_root, video_root_name)) as output_dir,
                contextlib.chdir(output_dir),
                TempFile('annotations.json') as coco_filename
            ):
            print('processing {}'.format(video_root_name))
            csv_filename = os.path.join(input_dir, get_filename_from_bucket_path(csv_bucketfile))
            video_filename = os.path.join(input_dir, get_filename_from_bucket_path(video_bucketfile))
            print('downloading {} and {}'.format(csv_bucketfile, video_bucketfile))
            client.download(SOURCE_BUCKET, get_relative_bucket_path(csv_bucketfile, SOURCE_BUCKET), csv_filename)
            client.download(SOURCE_BUCKET, get_relative_bucket_path(video_bucketfile, SOURCE_BUCKET), video_filename)
            print('converting {} to COCO'.format(video_root_name))
            description = DESCRIPTION.format(video_root_name)
            cocodata = viame2coco(
                csv_filename,
                description,
                video_file = video_filename,
                video_frame_outfile_dir = '.',
                min_confidence = MIN_CONFIDENCE
            )
            cocodata.to_json(coco_filename)
            destination_dir = DESTINATION_DIR + '/' + video_root_name
            coco_nodd_dataset = NODDCOCODataset(coco_filename, destination_dir, DESTINATION_BUCKET)
            coco_nodd_dataset.upload()
        # Only reached when conversion, upload and cleanup all succeeded.
        completed.append(video_root_name)
    except KeyboardInterrupt:
        # Manual stop: remember the interrupted video and leave the loop
        # without raising, so later cells can still persist `completed`.
        print("kb interrupt!")
        failed.append(video_root_name)
        break
    except Exception:
        # Fail fast on any other error, keeping the original traceback.
        failed.append(video_root_name)
        raise

In [17]:
# Show the completed names after the conversion loop.
completed

['SC2-camera3_03-22-21_16-59-34.000NOFISH',
 '2021_NCD-047b',
 '2021_NCD-055b',
 '2021_NCD-070b',
 '2021_NCD_084d',
 '2021_NCD_086e',
 '2021_NCD_112a',
 '2021_NCD_147a',
 '2021_NCD_152b',
 '2021_NCD_153a',
 '2021_NCN-062a',
 '2021_NCN_042a',
 '2021_NCN_042a2',
 '2021_NCN_048b',
 '2021_NCN_070b',
 '2021_NCO-002b',
 '2021_NCO-004e2',
 '2021_NCO-025d',
 '2021_NCO-073d',
 '2021_NCO-073d2',
 '2021_NCO-091a',
 '2021_NCO-100a',
 '2021_NCO_004e',
 '761901008_cam1_1',
 '761901008_cam1_2',
 '761901008_cam1_3',
 '761901008_cam1_4',
 '761901008_cam1_5',
 '761901019_cam4',
 '761901033_cam1_1',
 '761901033_cam1_2',
 '761901033_cam3',
 '761901033_cam5_1',
 '761901033_cam5_2',
 '761901066_cam3',
 '761901183_cam1_1',
 '761901183_cam1_2',
 '761901183_cam1_3',
 '761901183_cam1_4',
 '761901183_cam1_5',
 '761901183_cam1_6',
 '761901183_cam4_1',
 '761901183_cam4_2',
 '761901183_cam4_3',
 '761901183_cam5_1',
 '761901183_cam5_2',
 '761901183_cam5_3',
 '761901183_cam5_4',
 '761901231_Cam2',
 '761901231_Cam3',


In [19]:
# Persist the completed names, one per row, for the next run of this notebook.
# newline='' is required by the csv module for files it writes; without it,
# platforms that translate newlines end up with an extra blank row after
# every record.
with open('completed_videos.csv', 'w', newline='') as f:
    csv.writer(f).writerows([name] for name in completed)

In [20]:
# Show the names that were recorded as failed by the conversion loop.
failed

[]

In [20]:
# Print the completed names, one per line.
print(*completed, sep='\n')

2021_NCD-047b
2021_NCD-055b
2021_NCD-070b
2021_NCD_084d
2021_NCD_086e
2021_NCD_112a
2021_NCD_147a
2021_NCD_152b
2021_NCD_153a
2021_NCN-062a
2021_NCN_042a
2021_NCN_042a2
2021_NCN_048b
2021_NCN_070b
2021_NCO-002b
2021_NCO-004e2
2021_NCO-025d
2021_NCO-073d
2021_NCO-073d2
2021_NCO-091a
2021_NCO-100a
2021_NCO_004e
761901008_cam1_1
761901008_cam1_2
761901008_cam1_3
761901008_cam1_4
761901008_cam1_5
761901019_cam4
761901033_cam1_1
761901033_cam1_2
761901033_cam3
761901033_cam5_1
761901033_cam5_2
761901066_cam3
761901183_cam1_1
761901183_cam1_2
761901183_cam1_3
761901183_cam1_4
761901183_cam1_5
761901183_cam1_6
761901183_cam4_1
761901183_cam4_2
761901183_cam4_3
761901183_cam5_1
761901183_cam5_2
761901183_cam5_3
761901183_cam5_4
761901231_Cam2
761901231_Cam3
761901231_Cam5
761901309_cam1_1
761901309_cam1_2
761901309_cam1_3
761901309_cam1_4
761901309_cam2_1
761901309_cam2_2
761901309_cam2_3
761901309_cam3_1
761901309_cam3_2
761901309_cam3_3
761901309_cam3_4
761901309_cam3_5
761901309_cam4_1
7619