# EgoExo Manifest Generator

Generates a set of EgoExo manifest files from the given s3 directory using smart defaults.

### Pre-requisites:
- Jupyter kernel with boto3 and pandas installed
- You have run `aws configure` and set up your AWS keys
- (Highly recommended) Recorded files are already uploaded to s3
- ROOT_DIR has the structure defined in [Ego-Exo Transfer Guidelines](https://docs.google.com/spreadsheets/d/1Gn_aMgOlTBc9SaPjbrwmKeokaoQB-yTA9UaWhEuMuUc/edit#gid=1965268626)  (e.g. \<root\>/data/\<capture_id\>/..., \<root\>/objects/..., \<root\>/additional_data/...)

### How To Use:
1. Set the variables in the cell below.
2. Run all. This will output manifest files to `<OUTPUT_DIRECTORY>/manifest_<MANIFEST_VERSION>/`.
3. Edit the .csv files to add additional metadata and ensure correctness.
4. Continue the instructions in the [Post-Capture Processing Doc](https://docs.google.com/document/d/18DzHt5YrF-mxx7vy9khtoFRzr5gK1NnxfbDS37V2yWA/edit#heading=h.frx7gmruf771)

The following files are generated:
- capture_metadata.csv
- take_metadata.csv
- video_metadata.csv
- video_component_file.csv
- object_metadata.csv
- participant_metadata.csv
- physical_setting.csv
- extra_data.csv

Note: It is **safe** to modify manifest files after generation and re-run.
This notebook will retain existing rows, and only add new ones that didn't exist.

Note: Most cells in this notebook are uneditable for safety. Set
`"editable": true` in a cell's metadata if you want to customize it.

In [None]:
# User configuration: set these values before running the rest of the notebook.
ROOT_DIR = "<full s3 path to root data directory>" # e.g. s3://<university_bucket>/egoexo/
WALKAROUND_CAM_NAMES = ["mobile"] # Exo Cameras that will always have walkarounds
EGO_HAS_WALKAROUND = True # True if Ego Cameras all have walkarounds (this should be the case)
DEFAULT_GOPRO_MODEL_TYPE = "hero10" # The GoPro Model used for captures. Get ID from Doc: https://tinyurl.com/ymr9mb45.
OUTPUT_DIRECTORY = "<full path to output directory here>" # e.g. /Users/<username>/egoexo/
MANIFEST_VERSION = "v1" # Manifests are written to <OUTPUT_DIRECTORY>/manifest_<MANIFEST_VERSION>/

In [None]:
# Imports
import os
import boto3
import json
import pandas as pd

In [None]:
# Utilities
# Shared boto3 S3 resource handle; ls() uses it to enumerate objects in a bucket.
# NOTE(review): presumably relies on the credentials set up via `aws configure` - confirm.
s3 = boto3.resource("s3")

def is_s3_path(path):
    """Return True when ``path`` is an S3 URI, i.e. it begins with ``s3://``."""
    s3_scheme = 's3://'
    return path.startswith(s3_scheme)

def split_s3_path(s3_path):
    """Split an S3 URI into a ``(bucket, key)`` tuple.

    Surrounding whitespace and the ``s3://`` scheme are removed first. The
    bucket is everything up to the first ``/``; the key is everything after
    it (an empty string when the URI names only a bucket).
    """
    without_scheme = s3_path.strip().replace("s3://", "")
    bucket, _sep, key = without_scheme.partition("/")
    return bucket, key

def ls(path):
    """List the immediate, non-hidden children of an S3 prefix or local directory.

    Args:
        path: Either an ``s3://bucket/prefix`` URI or a local filesystem path.

    Returns:
        A sorted list of child names (first path component below ``path``).
        Names starting with ``.`` are omitted. For S3, objects whose own file
        name starts with ``.`` are ignored entirely, so a folder containing
        only hidden files is not listed. Returns ``None`` when ``path`` is a
        local regular file.
    """
    if is_s3_path(path):
        bucket, key = split_s3_path(path)
        # An empty key means the bucket root: keep the prefix empty, since a
        # bare "/" prefix would match (almost) nothing.
        if key and not key.endswith("/"):
            key = key + "/"
        children = set()
        for obj in s3.Bucket(bucket).objects.filter(Prefix=key):
            if os.path.basename(obj.key).startswith('.'):
                continue
            # Strip only the leading prefix. str.replace() would also delete
            # later occurrences of the same text and corrupt the child name.
            child = obj.key[len(key):].split('/')[0]
            # An empty child is the zero-byte "folder" marker object whose key
            # equals the prefix itself; it is not a real entry.
            if child and not child.startswith('.'):
                children.add(child)
        return sorted(children)
    if os.path.isfile(path):
        return None
    return sorted(f for f in os.listdir(path) if not os.path.basename(f).startswith('.'))

def is_video(path):
    """Return True if the file name ends in ``.mp4`` or ``.vrs`` (case-insensitive)."""
    lowered_name = os.path.basename(path).lower()
    return lowered_name.endswith((".mp4", ".vrs"))

def is_image(path):
    """Return True if the file extension is a known image type (case-insensitive)."""
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.tif', '.tiff', '.raw')
    lowered_name = os.path.basename(path).lower()
    return os.path.splitext(lowered_name)[1] in image_extensions

def is_csv(path):
    """Return True if the file name ends in ``.csv`` (case-insensitive)."""
    csv_suffix = ".csv"
    lowered_name = os.path.basename(path).lower()
    return lowered_name.endswith(csv_suffix)

def is_yaml(path):
    """Return True if the file extension is ``.yml`` or ``.yaml`` (case-insensitive)."""
    yaml_extensions = ('.yml', '.yaml')
    lowered_name = os.path.basename(path).lower()
    return os.path.splitext(lowered_name)[1] in yaml_extensions

def is_media(path):
    """Return True if ``path`` names a video (see is_video) or an image (see is_image)."""
    if is_video(path):
        return True
    return is_image(path)

## Generate capture_metadata.csv

In [None]:
# Every non-hidden entry directly under <ROOT_DIR>/data is a candidate capture ID.
capture_ids = ls(os.path.join(ROOT_DIR, "data"))
capture_ids

In [None]:
# Keep only the captures whose folder contains an "ego" and/or an "exo" entry.
clean_capture_ids = [
    candidate
    for candidate in capture_ids
    if not {'ego', 'exo'}.isdisjoint(ls(os.path.join(ROOT_DIR, "data", candidate)))
]
clean_capture_ids

In [None]:
# Build one capture_metadata row per capture that has ego and/or exo data.
capture_metadata_records = []
for capture_id in clean_capture_ids:
    root = os.path.join(ROOT_DIR, "data", capture_id)
    root_ls = ls(root)

    # Count every video file found under ego/<camera>/ and exo/<camera>/.
    video_count = 0
    for camera_group in ("ego", "exo"):
        if camera_group not in root_ls:
            continue
        for cam in ls(os.path.join(root, camera_group)):
            video_count += sum(
                1 for x in ls(os.path.join(root, camera_group, cam)) if is_video(x)
            )

    capture_metadata_records.append(
        {
            "university_capture_id": capture_id,
            # Only an S3 location can be filled in automatically.
            "university_video_folder_path": root if is_s3_path(ROOT_DIR) else "<fill me in>",
            "number_videos": video_count,
            "number_takes": "<fill me in>",
            # NOTE(review): this stores the full path under `root` although the
            # column is named "relative" - confirm which form the manifest expects.
            "post_surveys_relative_path": os.path.join(root, "post_surveys.csv") if "post_surveys.csv" in root_ls else "<fill me in>",
            "physical_setting_id": "<fill me in>",
            "start_date_recorded_utc": "<fill me in>",
            "additional_metadata": None,
        }
    )

capture_metadata = pd.DataFrame.from_records(capture_metadata_records)
capture_metadata

## Generate take_metadata.csv

In [None]:
# Seed a single placeholder take (take_id 0) for each capture; the remaining
# fields are defaults to be edited in the generated CSV.
take_metadata_records = [
    {
        "university_capture_id": capture,
        "take_id": 0,
        "scenario_id": "<fill me in>",
        "is_narrated": False,
        "is_dropped": False,
        "take_start_seconds_aria": None,
        "object_ids": [],
        "recording_participant_id": None,
        "additional_metadata": None,
    }
    for capture in clean_capture_ids
]

take_metadata = pd.DataFrame.from_records(take_metadata_records)
take_metadata

## Generate video_metadata.csv

In [None]:
# Build one video_metadata row per camera folder that contains at least one video.
video_metadata_records = []
for capture_id in clean_capture_ids:
    root = os.path.join(ROOT_DIR, "data", capture_id)
    root_ls = ls(root)

    # Ego cameras are emitted before exo cameras for each capture.
    for camera_group in ("ego", "exo"):
        if camera_group not in root_ls:
            continue
        is_ego = camera_group == "ego"
        for cam in ls(os.path.join(root, camera_group)):
            cam_root = os.path.join(root, camera_group, cam)
            cam_ls = [x for x in ls(cam_root) if is_video(x)]
            # Skip any 'camera' with 0 videos. This drops extraneous entries
            # that are not real camera folders.
            if not cam_ls:
                continue
            # Lower-case before comparing so the check agrees with is_video(),
            # which accepts the extension in any case (e.g. "REC.VRS").
            is_aria = cam_ls[0].lower().endswith('.vrs')
            video_metadata_records.append(
                {
                    "university_capture_id": capture_id,
                    "university_video_id": cam,
                    "number_video_components": len(cam_ls),
                    "is_ego": is_ego,
                    "has_walkaround": EGO_HAS_WALKAROUND if is_ego else cam in WALKAROUND_CAM_NAMES,
                    "is_redacted": False,
                    "includes_audio": True,
                    "device_type": "aria" if is_aria else DEFAULT_GOPRO_MODEL_TYPE,
                    "device_id": cam,
                    "video_device_settings": None,
                    "additional_metadata": None,
                }
            )

video_metadata = pd.DataFrame.from_records(video_metadata_records)
video_metadata

## Generate video_component_file.csv

In [None]:
# Build one video_component_file row per video file, indexed by its sorted
# position within its camera folder.
video_component_file_records = []
for capture_id in clean_capture_ids:
    root = os.path.join(ROOT_DIR, "data", capture_id)
    root_ls = ls(root)

    # Ego cameras are emitted before exo cameras for each capture.
    for camera_group in ("ego", "exo"):
        if camera_group not in root_ls:
            continue
        for cam in sorted(ls(os.path.join(root, camera_group))):
            cam_root = os.path.join(root, camera_group, cam)
            video_files = sorted(x for x in ls(cam_root) if is_video(x))
            for i, filename in enumerate(video_files):
                video_component_file_records.append(
                    {
                        "university_capture_id": capture_id,
                        "university_video_id": cam,
                        # Path is relative to the capture folder.
                        "video_component_relative_path": f"{camera_group}/{cam}/{filename}",
                        "component_index": i,
                        "is_redacted": False,
                    }
                )

video_component_file = pd.DataFrame.from_records(video_component_file_records)
video_component_file

## Generate object_metadata.csv

In [None]:
# Build one object_metadata row per entry found directly under <ROOT_DIR>/objects.
object_file_records = []
object_files = list(ls(os.path.join(ROOT_DIR, "objects")))

for object_file in object_files:
    file_stem = os.path.splitext(object_file)[0]
    # Entries that are not media files are written with a trailing "/".
    path_suffix = "" if is_media(object_file) else "/"
    object_record = {
        "university_object_id": file_stem,
        # Object name is the stem with its last "_<suffix>" removed, if any.
        "object_name": file_stem.rsplit("_", 1)[0],
        "object_relative_path": os.path.join("objects", object_file) + path_suffix,
        "physical_setting_id": "<fill me in>",
        "additional_metadata": None,
    }
    object_file_records.append(object_record)

object_metadata = pd.DataFrame.from_records(object_file_records)
object_metadata

## Generate participant_metadata.csv

In [None]:
# Single template row; the JSON columns end in "..." to show that more keys
# are expected to be filled in by hand.
pre_survey_template = json.dumps({"recording_location": "<fill me in>"}) + "..."
participant_info_template = json.dumps({"age_range": "<fill me in>"}) + "..."

participant_records = [
    {
        "participant_id": "participant_0",
        "scenario_id": "<fill me in>",
        "collection_date": "<fill me in with YYYY-MM-DD>",
        "pre_survey_data": pre_survey_template,
        "participant_metadata": participant_info_template,
    },
]

participant_metadata = pd.DataFrame.from_records(participant_records)
participant_metadata

## Generate physical_setting.csv

In [None]:
# Placeholder file; please fill out csv on your own
# A single template row so the CSV is created with the expected columns.
physical_setting_records = [
    {
        "setting_id": "<fill me in>",
        "name": "<fill me in>",
    },
]

physical_setting = pd.DataFrame.from_records(physical_setting_records)
physical_setting

## Generate extra_data.csv

In [None]:
# Placeholder file; please fill out csv on your own
# A single template row so the CSV is created with the expected columns.
extra_data_records = [
    {
        "extra_data_id": "<fill me in>",
        "university_capture_id": "<fill me in>",
        "take_id": "<fill me in>",
        "annotation_data": "<fill me in>",
    },
]

extra_data = pd.DataFrame.from_records(extra_data_records)
extra_data

## Write new manifest files to Output Directory

In [None]:
manifest_dir = os.path.join(OUTPUT_DIRECTORY, f"manifest_{MANIFEST_VERSION}")

# Every manifest file to write, together with the columns that uniquely
# identify a row in it. A generated row is only added to an existing file
# when no row with the same primary-key values is already present.
file_dfs = {
    "capture_metadata.csv": {
        "data": capture_metadata,
        "primary_keys": ["university_capture_id"]
    },
    "take_metadata.csv": {
        "data": take_metadata,
        "primary_keys": ["university_capture_id", "take_id"]
    },
    "video_metadata.csv": {
        "data": video_metadata,
        "primary_keys": ["university_capture_id", "university_video_id"]
    },
    "video_component_file.csv": {
        "data": video_component_file,
        "primary_keys": ["university_capture_id", "university_video_id", "component_index", "video_component_relative_path"]
    },
    "object_metadata.csv": {
        "data": object_metadata,
        "primary_keys": ["university_object_id"]
    },
    "participant_metadata.csv": {
        "data": participant_metadata,
        "primary_keys": ["participant_id"]
    },
    "physical_setting.csv": {
        "data": physical_setting,
        "primary_keys": ["setting_id"]
    },
    "extra_data.csv": {
        "data": extra_data,
        "primary_keys": ["university_capture_id", "take_id", "annotation_data"]
    }
}


def _row_keys(frame, key_columns):
    """Return one tuple of stringified primary-key values per row of ``frame``."""
    return [tuple(values) for values in frame[key_columns].astype(str).to_numpy()]


os.makedirs(manifest_dir, exist_ok=True)
for filename, spec in file_dfs.items():
    filepath = os.path.join(manifest_dir, filename)
    data, primary_keys = spec["data"], spec["primary_keys"]

    preexisting_data = None
    if os.path.isfile(filepath):
        try:
            # Read key columns as text so IDs keep their exact spelling
            # (e.g. leading zeros) and compare cleanly with generated values.
            preexisting_data = pd.read_csv(
                filepath, dtype={key: str for key in primary_keys}
            )
        except pd.errors.EmptyDataError:
            # Empty files are equivalent to missing files: fall through and
            # write the freshly generated data instead of skipping the file.
            preexisting_data = None

    if preexisting_data is not None:
        if data.empty:
            new_rows = data
        else:
            # Keep only generated rows whose primary key is not already in
            # the file. Comparing stringified keys avoids the dtype-mismatch
            # error pd.merge raises for text vs. integer key columns, and
            # leaves the dtypes of the non-key columns untouched.
            existing_keys = set(_row_keys(preexisting_data, primary_keys))
            is_new = pd.Series(
                [key not in existing_keys for key in _row_keys(data, primary_keys)],
                index=data.index,
                dtype=bool,
            )
            new_rows = data[is_new]
        data = pd.concat([preexisting_data, new_rows])
        print(f"Retained pre-existing data for {filename}")

    # Let pandas open the file itself so newline and encoding handling are
    # correct on every platform.
    data.to_csv(filepath, index=False)

print(f"Successfully wrote {len(file_dfs)} files to {manifest_dir}")