In [18]:
import glob
import jsonlines
from pathlib import Path
import natsort
import json
from tqdm import tqdm
import pandas as pd

In [2]:
# Local dataset layout: every path below is derived from one root directory,
# so relocating the dataset only requires editing `dataset_root`.
dataset_root = Path("/media/data/warp_data/marine_detection/imerit/whoi-rsi-fish-detection-datasets-22122023")
# The images directory is the dataset root itself (kept as a plain string).
images_dir = str(dataset_root)
# Manifest opened with jsonlines further down (kept as a plain string).
manifest_path = str(dataset_root / "28102023_manifest.json")
# COCO annotation file (a Path object).
coco_json_path = dataset_root / "coco.json"

# Walk the manifest to recover each clip's directory and its Labelbox identifiers.
# The Labelbox GLOBAL_KEY is the clip's relative directory path suffixed with its aws_id;
# the external id uses only the final directory component with the same suffix.
replace_root = "s3://whoi-rsi-fish-detection/datasets/imerit_26102023_clips/"
ordered_video_dirs = []
ordered_global_keys = []
ordered_videos_with_aws_ids = []
labelbox_external_id_to_global_key = {}
video_name_to_global_key = {}
with jsonlines.open(manifest_path) as manifest:
    # The aws_id is the zero-based position of the entry within the manifest.
    for aws_id, entry in enumerate(manifest):
        # Strip the S3 prefix so the remaining path is relative to the clips root.
        relative_path = Path(entry['source-ref'].replace(replace_root, ""))
        clip_dir = relative_path.parent
        suffix = f"_aws{aws_id}.mp4"

        # `.stem` keeps the last path component minus any trailing ".ext".
        external_id = clip_dir.stem + suffix
        global_key = str(clip_dir) + suffix

        ordered_video_dirs.append(clip_dir)
        ordered_videos_with_aws_ids.append(external_id)
        ordered_global_keys.append(global_key)
        labelbox_external_id_to_global_key[external_id] = global_key
        # Entries sharing a parent directory overwrite each other here (last one wins).
        video_name_to_global_key[str(clip_dir)] = global_key
        
# Parse COCO json
# Decode explicitly as UTF-8 (the standard JSON interchange encoding) instead of
# relying on the platform's locale-dependent default text encoding.
with open(coco_json_path, "r", encoding="utf-8") as f:
    coco = json.load(f)

In [37]:
# Setup references
# Filename fragments used to guess the recording site. Order matters: the first
# site with any matching fragment wins, so earlier entries take precedence.
# NOTE(review): short fragments such as "ya", "co" and "br" are plain substring
# matches and may hit unrelated filenames; confirm against the real clip names.
_SITE_FRAGMENTS = (
    ("Yawzi", ("yaw", "yz", "ya")),
    ("Tektite", ("tek", "tk")),
    ("Joels Shoal", ("joel", "js")),
    ("Cocoloba", ("co",)),
    ("Booby Rock", ("boo", "br")),
)


def _infer_site(filename):
    """Return the first site whose fragment occurs in ``filename`` (case-insensitive), else None."""
    lowered = filename.lower()
    for site, fragments in _SITE_FRAGMENTS:
        if any(fragment in lowered for fragment in fragments):
            return site
    return None


# Per-video statistics, keyed by integer video-sequence id.
filestats = {}
for vid_seq in coco['video_sequences']:
    # Named `seq_id` rather than `id` so the builtin `id()` is not shadowed for
    # the rest of the kernel session. Cast to int so the keys match the
    # `int(obj['video_seq_id'])` lookups used when tallying object tracks.
    seq_id = int(vid_seq['id'])
    filename = vid_seq['file_name']

    site = _infer_site(filename)
    if site is None:
        print(filename, "has no inferred site")

    # Counters start at zero and are filled in by the statistics pass.
    filestats[seq_id] = {
        'filename': filename,
        'annot_count': 0,
        'fish_count': 0,
        'inferred_site': site,
    }

# Tally object tracks (counted as fish) and their bounding-box annotations,
# both per video and across the whole dataset.
total_objs = 0
total_annot = 0
for track in coco['object_tracks']:
    # Each track points at its video; the id is cast to int to index filestats.
    video_stats = filestats[int(track['video_seq_id'])]
    video_stats['fish_count'] += 1
    total_objs += 1

    # One annotation per bounding-box id listed on the track.
    box_count = len(track['bbox_id_list'])
    video_stats['annot_count'] += box_count
    total_annot += box_count


In [36]:
# Export to CSV format
# Build the frame in one shot from the per-video records. This avoids the
# quadratic row-by-row pd.concat, does not assume a video with id 0 exists,
# and gives each row a distinct index (0..n-1) instead of a repeated 0.
# Column order follows the key order of the per-video dicts.
df = pd.DataFrame(list(filestats.values()))
df.to_csv("imerit_stats.csv")