# Introduction

Expert Commentary is an annotation task completed by ~50 experts across a large portion of the dataset. Some takes are annotated multiple times by different experts.

The annotation task required experts to provide:
- Audio commentary at timepoints (of their choosing) with optional drawing overlays
- Proficiency score (with a plain-text reason) for the performance of the camera wearer/participant

For audio commentary, we have transcribed each commentary with Whisper (large-v2). This notebook shows how to use the annotations, with some basic analysis and visualization (including how to use the drawing data).

## Data Setup

In [None]:
import json
import os
import random
from collections import defaultdict

import torch
import numpy as np
import pandas as pd

import spacy

from tqdm import tqdm

In [None]:
# Root directory of your local Ego-Exo4D download.
# NOTE: changeme to your download path
egoexo_root = "/dataset/placeholder/dir/"
# egoexo_root = "/large_experiments/egoexo/v2/"

# Annotations are expected in the "annotations" sub-directory of the root.
egoexo_annotation_root = os.path.join(egoexo_root, "annotations")

# Fail early, with a hint, if either location is missing.
assert os.path.exists(
    egoexo_root
), "please make sure you have downloaded egoexo or check your path"
assert os.path.exists(
    egoexo_annotation_root
), "please download annotations with --parts annotations or check your path"

In [None]:
# Load the take metadata. The context manager closes the file handle as soon
# as parsing is done instead of leaving it open until garbage collection.
with open(os.path.join(egoexo_root, "takes.json")) as takes_file:
    takes = json.load(takes_file)

# Index the takes by name so annotations (which carry a "take_name") can be
# joined back to their take metadata.
takes_by_name = {t["take_name"]: t for t in takes}

In [None]:
# Location of the transcribed expert commentary for the train split.
expert_commentary_transc_path = os.path.join(egoexo_annotation_root, "expert_commentary_train.json")
assert os.path.exists(expert_commentary_transc_path), "please re-download egoexo's annotations (--parts annotations) to get expert commentary"

# Parse the file inside a context manager so the handle is closed right away.
with open(expert_commentary_transc_path) as ecs_file:
    ecs = json.load(ecs_file)

# Mapping whose values are lists of annotations (one list per key); the keys
# are used as take UIDs elsewhere in this notebook.
all_anns = ecs["annotations"]

In [None]:
# Flatten the annotations into one record per commentary entry. Each record
# carries the take name and commentary identifier of its parent annotation,
# followed by every field of the "commentary_data" entry (entry fields win on
# a key clash, because they are unpacked last).
#
# Only the values of `all_anns` are needed, and the loop variables are named
# so that they do not overwrite the notebook-level `take_uid` / `ann` that a
# later cell selects (which matters when this cell is re-run out of order).
all_transc = []
for take_anns in all_anns.values():
    for take_ann in take_anns:
        all_transc.extend(
            {
                "take": take_ann["take_name"],
                "commentary": take_ann["commentary"],
                **transc,
            }
            for transc in take_ann["commentary_data"]
        )
len(all_transc)

In [None]:
# Partition the records on their "error" field: entries with a truthy value
# are set aside, the remaining ones are used for the analysis below.
all_transc_succ = [entry for entry in all_transc if not entry["error"]]
errors = [entry for entry in all_transc if entry["error"]]
len(all_transc), len(all_transc_succ), len(errors)

## Basic Usage & Analysis on Transcriptions

In [None]:
# Toggle for the spaCy-based analysis in the next cell.
# set me to True, be warned this will take ~20minutes
run_spacy_analysis = False

In [None]:
# Load the spaCy English pipeline used for POS tagging and sentence splitting.
nlp = spacy.load("en_core_web_md")

# Per-commentary statistics: each list receives one entry per successfully
# transcribed commentary, and only when `run_spacy_analysis` is True.
stats = {
    "num_nouns": [],
    "num_verbs": [],
    "num_sents": [],
    "num_words": [],
    "words_per_sentence": [],
}

# Frequency of each noun / verb surface form across all commentaries.
noun_counts = defaultdict(int)
verb_counts = defaultdict(int)
if run_spacy_analysis:
    for transc in tqdm(all_transc_succ):
        doc = nlp(transc["text"])
        num_sents = len(list(doc.sents))
        num_words = len(doc)  # number of spaCy tokens in the commentary
        words_per_sentence = num_words / num_sents if num_sents > 0 else None

        # bucket the tokens by their coarse part-of-speech tag
        toks_by_class = defaultdict(list)
        for tok in doc:
            toks_by_class[tok.pos_].append(tok)

        # common and proper nouns are counted together
        noun_toks = toks_by_class["NOUN"] + toks_by_class["PROPN"]
        # NOTE: the tag is "VERB" (singular). A misspelled key such as "VERBS"
        # would not raise: the defaultdict silently returns [] and the verb
        # count would always be 0.
        verb_toks = toks_by_class["VERB"]

        for tok in noun_toks:
            noun_counts[tok.text] += 1
        for tok in verb_toks:
            # "'s" tokens tagged as verbs are left out of the frequency table
            if tok.text == "'s":
                continue
            verb_counts[tok.text] += 1

        stats["num_nouns"].append(len(noun_toks))
        stats["num_verbs"].append(len(verb_toks))
        stats["num_sents"].append(num_sents)
        stats["num_words"].append(num_words)
        stats["words_per_sentence"].append(words_per_sentence)

# most frequent first
noun_counts_sorted = sorted(noun_counts.items(), key=lambda x: -x[1])
verb_counts_sorted = sorted(verb_counts.items(), key=lambda x: -x[1])

# number of distinct annotations / takes among the successful transcriptions
num_anns = len({x["commentary"] for x in all_transc_succ})
num_takes = len({x["take"] for x in all_transc_succ})

# group the commentary entries by the annotation they belong to
comms_per_ann = defaultdict(list)
for x in all_transc_succ:
    comms_per_ann[x["commentary"]].append(x)

comms_per_ann_arr = np.array([len(xs) for xs in comms_per_ann.values()])

# commentaries per minute of take, for annotations whose take is in takes.json
comm_per_min = []
for comm, xs in comms_per_ann.items():
    tn = xs[0]["take"]
    if tn not in takes_by_name:
        continue
    take = takes_by_name[tn]
    take_min = take["duration_sec"] / 60
    num_comms = len(xs)
    comm_per_min.append(num_comms / take_min)

comms_per_min = np.array(comm_per_min)

stats_df = pd.DataFrame(stats)

In [None]:
# Count the distinct experts. The last "/"-separated component of each
# annotation's "commentary" value is used as the expert identifier
# (presumably the value has the form ".../<expert>" — TODO confirm).
num_experts = len(
    {
        ann["commentary"].split("/")[-1]
        for anns in all_anns.values()
        for ann in anns
    }
)

In [None]:
# Summary of the dataset statistics computed above. The sentence / word /
# noun / verb lines are only filled in when `run_spacy_analysis` was enabled.
summary = f"""
# Annotations = {num_anns}
# Takes Annotated = {num_takes}
# Commentaries = {len(all_transc_succ)}
Avg Commentaries per Annotation = {comms_per_ann_arr.mean():.3f} (std dev = {comms_per_ann_arr.std():.3f})
# Sentences = {stats_df.num_sents.sum()}
Avg Sentences per Commentary = {stats_df.num_sents.mean():.3f} (std dev = {stats_df.num_sents.std():.3f})
# Words = {stats_df.num_words.sum()}
Avg Words per Sentence = {stats_df.words_per_sentence.mean():.3f} (std dev = {stats_df.words_per_sentence.std():.3f})
# Unique Nouns = {len(noun_counts_sorted)}
# Unique Verbs = {len(verb_counts_sorted)}
Average Commentaries per Minute = {comms_per_min.mean():.3f}
# Experts = {num_experts}
"""
print(summary)

In [None]:
# Show the 150 most frequent nouns with their counts.
for noun, freq in noun_counts_sorted[:150]:
    print(f"{noun} : {freq}")

In [None]:
# Show the 150 most frequent verbs with their counts.
for verb, freq in verb_counts_sorted[:150]:
    print(f"{verb} : {freq}")

## Proficiency Score & Path Drawing for an Annotation

In [None]:
# Helper that returns the drawn paths of a commentary at a given time.
from ego4d.egoexo.expert_commentary import get_paths_for_commentary_time

# The video reader, OpenCV and PIL are only imported when visualization is
# requested. The cells below that read frames and draw paths use these names,
# so they need this flag to be True.
needs_visualization = True # do you want to visualize the path drawing?
if needs_visualization:
    from ego4d.research.readers import TorchAudioStreamReader
    import cv2
    from PIL import Image

In [None]:
# Keys of `all_anns`, i.e. every take that has at least an annotation entry.
takes_with_comm = list(all_anns)

In [None]:
# UIDs of the takes whose metadata points at a best-exo video file.
takes_with_vid = {
    take["take_uid"]
    for take in takes
    if take["frame_aligned_videos"]["best_exo"]["0"]["relative_path"] is not None
}
# Only takes that have BOTH expert commentary and a best-exo video can be
# visualized, so sample from the intersection. Sampling from every commented
# take could pick one without a video path and break the cells below.
takes_to_sample = set(takes_with_comm) & takes_with_vid
assert takes_to_sample, "no take has both expert commentary and a best-exo video"

# `random` needs a sequence (sets are rejected on Python >= 3.11); sorting
# also gives the candidates a stable order.
take_uid = random.choice(sorted(takes_to_sample))
annotator_idx = random.randrange(len(all_anns[take_uid]))

# here are example (take, annotator) pairs where there are cleared out paths
# take_uid, annotator_idx = ('6d258ba3-363e-4a40-b739-2b1b6e13fa8a', 1)
# take_uid, annotator_idx = ('3043fd07-a52a-4adc-9a19-12e7e1c29df4', 2)

# the annotation (one expert's commentary for the take) used from here on
ann = all_anns[take_uid][annotator_idx]
annotator_idx

In [None]:
# Raw annotation export (proficiency and drawing events) for the selected
# annotation. Prefer the copy shipped with the dataset annotations; fall back
# to the internal export location this notebook was originally written
# against only when the dataset copy is missing and the internal one exists.
released_data_path = os.path.join(egoexo_annotation_root, "expert_commentary/", ann["commentary"], "data.json")
internal_data_path = os.path.join("/checkpoint/miguelmartin/expert_commentary/exports/240207/data/", ann["commentary"], "data.json")

data_path = released_data_path
if not os.path.exists(released_data_path) and os.path.exists(internal_data_path):
    data_path = internal_data_path

# the context manager closes the file handle once the JSON is parsed
with open(data_path) as data_file:
    data = json.load(data_file)

In [None]:
# Rating of the performance (why & a 1-10 score), as given by the expert for
# the selected annotation. Left as the last expression of the cell so the
# notebook displays it.
data["proficiency"]

In [None]:
# Resolve the on-disk location of the best-exo video for the take that the
# selected annotation belongs to.
take = takes_by_name[ann["take_name"]]
take_dir = take["root_dir"]
best_exo_rel_path = take["frame_aligned_videos"]["best_exo"]["0"]["relative_path"]
best_exo_video_path = os.path.join(egoexo_root, take_dir, best_exo_rel_path)

# The video is a separate download; point the user at the right command.
assert os.path.exists(best_exo_video_path), f"""
please download collages for this take (via `--parts take`), use `--uid {take['take_uid']}` to just download this take
"""
best_exo_video_path

In [None]:
# Open the best-exo video for frame access. The reader is indexed by an
# integer further below and yields a dict with a "video" entry.
# NOTE(review): the comments on individual arguments are assumptions from the
# argument names — confirm against TorchAudioStreamReader's documentation.
video_reader = TorchAudioStreamReader(
    best_exo_video_path,
    # no resizing / cropping / normalization is requested
    resize=None,
    crop=None,
    mean=None,
    std=None,
    # presumably one frame per index, no frame skipping — TODO confirm
    frame_window_size=1,
    stride=1,
    # presumably -1 selects CPU decoding — TODO confirm
    gpu_idx=-1,
    # presumably frames come back as (time, height, width, channel); the
    # drawing code below unpacks a frame as (h, w, c)
    axis_order="thwc",
    uint8_scale=True,
)

In [None]:
def draw_paths(paths, img, color=(255, 0, 0), thickness=2):
    """Draw each path segment as a straight line on a copy of `img`.

    Args:
        paths: iterable of dicts with "from" and "to" entries, each holding
            "x" and "y" coordinates. The coordinates are multiplied by the
            image width / height, i.e. they are treated as fractions of the
            image size rather than pixel positions.
        img: image array whose first two dimensions are (height, width); a
            trailing channel dimension is optional. It is not modified.
        color: line colour passed to `cv2.line`, interpreted in the channel
            order of `img`. Defaults to (255, 0, 0).
        thickness: line thickness in pixels passed to `cv2.line`.

    Returns:
        A copy of `img` with one line drawn per path.
    """
    canvas = img.copy()
    # only height and width are needed, so 2-D (single channel) images work too
    height, width = img.shape[:2]
    to_pixel = int  # TODO: could round to closest integer
    for path in paths:
        # NOTE: points are scaled to pixel coordinates here
        from_pt = (
            to_pixel(path["from"]["x"] * width),
            to_pixel(path["from"]["y"] * height),
        )
        to_pt = (
            to_pixel(path["to"]["x"] * width),
            to_pixel(path["to"]["y"] * height),
        )
        canvas = cv2.line(canvas, from_pt, to_pt, color, thickness)
    return canvas

In [None]:
# Set `comm_idx` to an integer to inspect a specific commentary; leave it as
# None to use the first commentary that has drawing events (if none has any,
# the last commentary ends up selected and a warning is printed).
comm_idx = None
if comm_idx is not None:
    comm = data["annotations"][comm_idx]
else:
    for comm_idx, comm in enumerate(data["annotations"]):
        if len(comm["events"]) > 0:
            break
if len(comm["events"]) == 0:
    print("WARN: no draw events associated to this commentary")

# sample `num_pts` evenly spaced timestamps from [0, comm_dur]
# (np.linspace includes both end points)
num_pts = 9
comm_dur = comm["duration_approx"]
comm_ts = np.linspace(0, comm_dur, num_pts)

# paths that are present at each sampled timestamp
comm_paths_per_t = [get_paths_for_commentary_time(comm, ts) for ts in comm_ts]

# number of paths at each sampled timestamp
comm_num_paths_per_t = list(map(len, comm_paths_per_t))

{ts: n_paths for ts, n_paths in zip(comm_ts, comm_num_paths_per_t)}

In [None]:
# Fetch the video frame at the commentary's video time and render, for every
# sampled timestamp, the paths present at that time on top of that frame.
comm_video_t = comm["video_time"]
# NOTE(review): the factor 30 looks like a frames-per-second constant — confirm
frame_idx = int(comm_video_t * 30)
frame = video_reader[frame_idx]["video"][0]
img = frame.numpy()
img_per_t = [Image.fromarray(draw_paths(ps, img)) for ps in comm_paths_per_t]

In [None]:
# rendering for the first sampled timestamp
img_per_t[0]

In [None]:
# rendering for the sixth sampled timestamp (index 5)
img_per_t[5]

In [None]:
# rendering for the last sampled timestamp
img_per_t[-1]