# Ego4D Annotation Transformation
This notebook helps you transform Ego4D annotations into different variants, e.g. with scaled-down bounding boxes (bboxes).

## Prerequisites
1. Use the [Ego4D CLI](https://ego4d-data.org/docs/start-here/) to download the annotations dataset.
2. Install the packages this notebook uses from `requirements.txt`.

## **Useful Links:**

[Ego4D Docs - Start Here!](https://ego4d-data.org/docs/start-here/#Download-The-CLI)

[Data Overview](https://ego4d-data.org/docs/data-overview/)

[Official Ego4D Site](https://ego4d-data.org/)

In [None]:
# Set your options here

import os

# Replace with the full path to the --output_directory you pass to the cli
CLI_OUTPUT_DIR = "/Users/<userid>/ego4d"
VERSION = "v1"

# Paths derived from the CLI output directory
METADATA_PATH = os.path.join(CLI_OUTPUT_DIR, "ego4d.json")
ANNOTATIONS_PATH = os.path.join(CLI_OUTPUT_DIR, VERSION, "annotations")

# Fail fast when the metadata file or the annotation manifest is missing
assert os.path.exists(METADATA_PATH), (
    f"Metadata doesn't exist at {METADATA_PATH}. Is the CLI_OUTPUT_DIR right? Do you satisfy the pre-requisites?"
)
assert os.path.exists(os.path.join(ANNOTATIONS_PATH, "manifest.csv")), (
    "Annotation metadata doesn't exist. Did you download it with the CLI?"
)

In [None]:
# Imports and consts
import random
import simplejson as json

from collections import namedtuple
from typing import List

# JSON annotation files, grouped by filename prefix.
# Deliberately not listed (non-JSON or taxonomy files):
#   fho_lta_taxonomy.json, manifest.csv, manifest.ver,
#   narration_noun_taxonomy.csv, narration_verb_taxonomy.csv
annotation_files = [
    # av
    "av_train.json", "av_val.json",
    # fho
    "fho_hands_train.json", "fho_hands_val.json",
    "fho_lta_train.json", "fho_lta_val.json",
    "fho_scod_train.json", "fho_scod_val.json",
    "fho_sta_train.json", "fho_sta_val.json",
    # moments
    "moments_train.json", "moments_val.json",
    # narration
    "narration.json",
    # nlq / vq
    "nlq_train.json", "nlq_val.json",
    "vq_train.json", "vq_val.json",
]

def load_json_from_path(path):
    """Read the JSON file at `path` and return the parsed object.

    The file is opened explicitly as UTF-8 (the encoding JSON interchange
    uses) so the result does not depend on the platform's default locale
    encoding.
    """
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)

# Load Metadata Into Memory
First load video metadata into a dict. This is useful for mappers/selectors that need video resolution or other info.

In [None]:
# Load ego4D video metadata into a dictionary for easy indexing
meta = load_json_from_path(METADATA_PATH)


def _flatten_video_record(video):
    """Merge a video's nested 'video_metadata' dict into its top-level fields.

    Top-level entries come first (minus 'video_metadata' itself); entries of
    'video_metadata' follow and win on key collisions.
    """
    top_level = {k: v for k, v in video.items() if k != 'video_metadata'}
    return {**top_level, **video['video_metadata']}


# video_uid -> flattened metadata record
metadata = {
    video['video_uid']: _flatten_video_record(video)
    for video in meta['videos']
}

print(len(metadata))
print(f"Keys Accessible in Metadata: {list(metadata[list(metadata)[0]])}")

# Define JSON Transformation Utils
We transform the annotation jsons with a selector/mapper architecture. The selector gets a json object and decides whether it needs to be changed. If it does, then the mapper takes in that object and returns a transformed one to take its place.

We call each selector/mapper pair a 'Transform'.

We input an ordered list of Transforms to be applied; only the first matching Transform for each object is used, and the object returned by its mapper is not traversed any further.

In [None]:
# Applies a list of Transforms to the given json object
# using a recursive DFS
#
# Each Transform is a selector and a mapper. The selector
# identifies json objects that should be transformed, and
# the mapper transforms them.
#
# Selectors have type: (obj: any, context: dict) -> bool
# Mappers have type: (obj: any, context: dict) -> any
#
# Context fields are passed into the selector/mapper for
# all children once they're seen. See wiki for schema
# structure and add any context fields you need to use
# below.

# The typename matches the bound name so repr() and pickling refer to `Transform`.
Transform = namedtuple('Transform', 'selector mapper')
context_fields = ['video_uid', 'video_id', 'clip_uid']

def _apply_transforms(obj, transforms, context=None):
    """Return a transformed copy of `obj`, walking dicts and lists depth-first.

    Args:
        obj: Any parsed-JSON value (dict, list or scalar).
        transforms: Ordered iterable of (selector, mapper) pairs. The first
            pair whose selector accepts `obj` is applied; its mapper's result
            is returned as-is and is not traversed further.
        context: Dict of values inherited from ancestors; None means empty.

    Returns:
        The mapper result for a selected object; otherwise a new dict/list
        with every child processed recursively; otherwise `obj` unchanged.

    Context seen by selectors/mappers:
        * any of `context_fields` present on an ancestor dict (the nearest
          ancestor wins),
        * 'key': the dict key under which the current value (or the list
          containing it) was found.
    """
    context = context or {}

    # Selectors see the object itself before any descent; first match wins.
    for transform in transforms:
        selector, mapper = transform
        if selector(obj, context):
            return mapper(obj, context)

    if type(obj) is dict:
        # Context fields are propagated down to their children in
        # the context object, so mappers/selectors can use them
        context = {
            **context,
            **{
                k: obj[k]
                for k in context_fields
                    if k in obj
            }
        }

        # Each value gets its own context copy carrying the key it lives under.
        return {
            k: _apply_transforms(v, transforms, {**context, 'key': k})
            for k, v in obj.items()
        }
    elif type(obj) is list:
        # List items share the list's context (including its 'key').
        return [
            _apply_transforms(v, transforms, context)
            for v in obj
        ]
    # Scalars and other types that no selector matched are returned untouched.
    return obj

def transform_annotations(input_path: str, output_path: str, transforms: List[Transform]):
    """Load a JSON annotation file, apply `transforms`, and write the result.

    Args:
        input_path: Path of the JSON file to read.
        output_path: Path the transformed JSON is written to (opened with 'w').
        transforms: Ordered Transforms handed to `_apply_transforms`.

    Progress messages are printed before each stage.
    """
    print(f"\nloading {input_path}...")
    source_annotations = load_json_from_path(input_path)

    print("transforming...")
    result_annotations = _apply_transforms(source_annotations, transforms)

    print(f"writing {output_path}...")
    with open(output_path, 'w') as out_file:
        json.dump(result_annotations, out_file)

    print("done.")
    

# Define Transforms
Here are a few sample Transforms.

In [None]:
def scale_ratio(video_uid, new_height):
    """Return `new_height` divided by the video's 'display_resolution_height'.

    The original height is looked up in the module-level `metadata` dict
    under `video_uid`.
    """
    target_height = float(new_height)
    source_height = metadata[video_uid]['display_resolution_height']
    return target_height / source_height

# dummy example, increments all floats/ints by 1
# bool is a subclass of int in Python, so booleans are excluded explicitly;
# otherwise JSON true/false would be rewritten to 2/1.
increment_nums = Transform(
    lambda obj, ctx: isinstance(obj, (int, float)) and not isinstance(obj, bool), # Selector
    lambda obj, ctx: obj + 1 # Mapper
)

# scale bboxes - works for the schemas of av, fho_scod, and vq
def scale_bboxes(new_height):
    """Build a Transform that rescales bbox dicts to a video height of `new_height`.

    Selector: plain dicts that contain all of 'x', 'y', 'width' and 'height'.
    Mapper: returns a copy of the dict with those four values multiplied by
    `scale_ratio(ctx['video_uid'], new_height)`; every other key is kept.

    The mapper raises KeyError when no 'video_uid' has been seen on an
    ancestor object.
    """
    bbox_keys = ('x', 'y', 'width', 'height')

    def _is_bbox(obj, ctx):
        return type(obj) is dict and all(key in obj for key in bbox_keys)

    def _rescale(obj, ctx):
        # Compute the ratio once per box instead of once per coordinate.
        ratio = scale_ratio(ctx['video_uid'], new_height)
        return {
            **obj, # av has extra properties to retain, e.g. person id
            **{key: obj[key] * ratio for key in bbox_keys},
        }

    return Transform(_is_bbox, _rescale)

# scale bboxes - works for the schema of fho_sta
def fho_sta_scale_boxes(new_height):
    """Build a Transform that rescales list-valued 'box' entries to `new_height`.

    Selector: plain lists stored under the dict key 'box'.
    Mapper: returns a new list with every element multiplied by
    `scale_ratio(ctx['video_id'], new_height)`.

    The mapper raises KeyError for a non-empty box when no 'video_id' has
    been seen on an ancestor object.
    """
    def _is_box(obj, ctx):
        return type(obj) is list and ctx.get('key') == 'box'

    def _rescale(obj, ctx):
        # An empty box needs no ratio, so skip the metadata lookup entirely.
        if not obj:
            return []
        # Compute the ratio once per box instead of once per point.
        ratio = scale_ratio(ctx['video_id'], new_height)
        return [point * ratio for point in obj]

    return Transform(_is_box, _rescale)

# Execute transformations and output new annotation files
Now we apply these Transforms to the original annotation files and create new ones.

In [None]:
# Each entry of transform_jobs is a 3-tuple:
#    ( input file (in annotations directory), output path, list of Transforms to apply )
transform_jobs = [
    # dict-style bboxes
    ('av_train.json', 'av_train_height-540.json', [scale_bboxes(540)]),
    ('av_val.json', 'av_val_height-540.json', [scale_bboxes(540)]),
    ('fho_hands_train.json', 'fho_hands_train_height-540.json', [scale_bboxes(540)]),
    ('fho_hands_val.json', 'fho_hands_val_height-540.json', [scale_bboxes(540)]),
    ('fho_scod_train.json', 'fho_scod_train_height-540.json', [scale_bboxes(540)]),
    ('fho_scod_val.json', 'fho_scod_val_height-540.json', [scale_bboxes(540)]),
    ('vq_train.json', 'vq_train_height-540.json', [scale_bboxes(540)]),
    ('vq_val.json', 'vq_val_height-540.json', [scale_bboxes(540)]),

    # list-style 'box' entries
    ('fho_sta_train.json', 'fho_sta_train_height-540.json', [fho_sta_scale_boxes(540)]),
    ('fho_sta_val.json', 'fho_sta_val_height-540.json', [fho_sta_scale_boxes(540)]),
]

# Inputs are resolved against ANNOTATIONS_PATH; output paths are used as given.
for input_name, output_path, job_transforms in transform_jobs:
    transform_annotations(
        os.path.join(ANNOTATIONS_PATH, input_name),
        output_path,
        job_transforms,
    )

# Test Outputs

Always validate the annotation output before using it. We look at a specific json path to verify that the bboxes have been scaled appropriately, then do a quick 'deep diff' to catch high-level changes.

In [None]:
# Manual json path checks

old_av = load_json_from_path(os.path.join(ANNOTATIONS_PATH, 'av_train.json'))
new_av = load_json_from_path('av_train_height-540.json')


def f(x):
    """Pick one sample track entry: videos[0] -> clips[0] -> persons[1] -> tracking_paths[0] -> track[0]."""
    return x['videos'][0]['clips'][0]['persons'][1]['tracking_paths'][0]['track'][0]


# Show the video the sample belongs to, then the entry before and after the transform
print(old_av['videos'][0]['video_uid'])
print(f(old_av), "\n", f(new_av))
# len(old_av['videos'][0]['clips'][0]['persons'][0]['tracking_paths'])

In [None]:
# Deepdiffs
# 'No change' results are not reliable when using the max_diffs arg. 'Changed' results are.
# max_diffs arg is usually required to finish diffing in a reasonable amount of time

from deepdiff import DeepDiff
from pprint import pprint, pformat

def print_json_file_diff(a, b, print_limit=5000):
    """Print a truncated DeepDiff between the JSON files at paths `a` and `b`.

    Args:
        a: Path of the first JSON file.
        b: Path of the second JSON file.
        print_limit: Maximum number of characters of the formatted diff to print.

    The diff is computed with max_diffs=500, and a progress message is
    printed before each stage.
    """
    print(f"loading {a}, {b}...")
    left = load_json_from_path(a)
    right = load_json_from_path(b)

    print("diffing...")
    diff = DeepDiff(left, right, max_diffs=500)

    print("pformatting...")
    rendered = pformat(diff)

    print("printing...")
    print(rendered[:print_limit])
    
# Diff every original annotation file against its transformed counterpart
for input_name, output_path, _job_transforms in transform_jobs:
    print_json_file_diff(os.path.join(ANNOTATIONS_PATH, input_name), output_path)