# Atomic Actions Descriptions Tutorial

In this tutorial, we will:
1. Read the annotation file
2. Compute aggregate stats on the atomic action descriptions
3. Embed the descriptions with Sentence Transformers (https://www.sbert.net/)
4. Perform clustering on these embeddings to find common phrases using sklearn
5. Visualize the embeddings with Dimension Reduction (t-SNE)

In [None]:
import json
import time
import os
import traceback
import random
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Root directory of the downloaded dataset release; every metadata and
# annotation path used in this notebook is resolved relative to it.
RELEASE_DIR = "/path/placeholder"  # NOTE: changeme
assert os.path.exists(RELEASE_DIR), "change RELEASE_DIR to where you downloaded the dataset to"

# Metadata name -> path of its JSON file. The loading loop below replaces each
# path with the parsed JSON content, so `egoexo` ends up holding the data itself.
egoexo = {
    "takes": os.path.join(RELEASE_DIR, "takes.json"),
    "takes_dropped": os.path.join(RELEASE_DIR, "takes_dropped.json"),
    "captures": os.path.join(RELEASE_DIR, "captures.json"),
    "physical_setting": os.path.join(RELEASE_DIR, "physical_setting.json"),
    "participants": os.path.join(RELEASE_DIR, "participants.json"),
    "visual_objects": os.path.join(RELEASE_DIR, "visual_objects.json"),
}

# Category names keyed by task-id bucket: a take's task_id is floored to a
# multiple of 1000 before it is looked up here.
TASK_ID_CAT = {
    0: "Unknown",
    1000: "Cooking",
    2000: "Health",
    4000: "Bike Repair",
    5000: "Music",
    6000: "Basketball",
    7000: "Rock Climbing",
    8000: "Soccer",
    9000: "Dance",
}

# Iterate over a snapshot of the items because the dict's values are replaced in the loop.
for k, v in list(egoexo.items()):
    # The context manager closes each file as soon as it has been parsed,
    # instead of leaving the handle open until garbage collection.
    with open(v) as f:
        egoexo[k] = json.load(f)

# Dropped takes are appended so that they can also be looked up by take_uid.
takes = egoexo["takes"] + egoexo["takes_dropped"]
captures = egoexo["captures"]
takes_by_uid = {x["take_uid"]: x for x in takes}

In [None]:
# Directory inside the release from which the annotation JSON file is loaded.
annotation_dir = os.path.join(RELEASE_DIR, "annotations/")

In [None]:
# Parse the atomic-descriptions annotation file; the context manager makes sure
# the file handle is closed once the JSON has been read.
with open(os.path.join(annotation_dir, "atomic_descriptions_latest.json")) as f:
    atomic_anns = json.load(f)

# take_uid -> list of annotation records, each carrying a "descriptions" list.
anns = atomic_anns["annotations"]

## Read the Annotation File

In [None]:
# Flatten the annotations into one list of (take_uid, description) pairs,
# visiting takes, their annotation records and each record's descriptions in order.
all_descs = []
for _take_uid, _take_anns in anns.items():
    for _ann in _take_anns:
        all_descs.extend((_take_uid, _desc) for _desc in _ann["descriptions"])

# Preview the first two pairs.
all_descs[0:2]

In [None]:
# Every annotation record of every take, as one flat list (take association is dropped).
all_anns = [y for xs in anns.values() for y in xs]

In [None]:
# Group the descriptions by the task category of the take they belong to.
# Along the way this also collects (for stats):
# 1. the coverage per task: number of takes that received exactly N annotations
# 2. the density of annotations: descriptions per minute of take duration
#
# Every container additionally has an "all" entry that aggregates over categories.
coverage_by_task = defaultdict(lambda: defaultdict(int))
desc_per_minute = defaultdict(list)
desc_by_task = defaultdict(list)
for take_uid, vs in anns.items():
    take = takes_by_uid.get(take_uid)
    if take is None:
        # No metadata is available for this take (e.g. redacted), so it cannot
        # be assigned to a category and is left out of all statistics.
        continue

    if take["task_id"] is None:
        task = "Dropped"
    else:
        # Floor the task id to its thousands bucket to get the category. A bucket
        # that is not listed in TASK_ID_CAT is reported as "Unknown" instead of
        # aborting the whole pass with a KeyError.
        bucket = int(take["task_id"]) // 1000 * 1000
        task = TASK_ID_CAT.get(bucket, TASK_ID_CAT[0])

    # get all the annotated descriptions by their task
    for x in vs:
        desc_by_task[task].extend(x["descriptions"])
        desc_by_task["all"].extend(x["descriptions"])

    # compute coverage
    coverage_by_task[task][len(vs)] += 1
    coverage_by_task["all"][len(vs)] += 1

    # compute density; takes without a positive duration are skipped because a
    # per-minute rate is undefined for them (and would divide by zero)
    duration_min = (take["duration_sec"] or 0) / 60
    if duration_min > 0:
        for y in vs:
            narrs_per_min = len(y["descriptions"]) / duration_min
            desc_per_minute[task].append(narrs_per_min)
            desc_per_minute["all"].append(narrs_per_min)

# Show the overall coverage histogram and the categories that were found.
dict(coverage_by_task["all"]), coverage_by_task.keys()

In [None]:
# Preview the first three descriptions of the aggregated "all" group.
desc_by_task["all"][0:3]

## Compute & Display Statistics

In [None]:
# spaCy is optional: it is only needed for the noun/verb statistics.
# PREFER_SPACY is the user's switch; HAS_SPACY records whether the model was
# actually loaded and is what the later cells check.
PREFER_SPACY = False  # if you want stats on nouns/verbs
HAS_SPACY = False

if PREFER_SPACY:
    try:
        import spacy
        nlp = spacy.load("en_core_web_md")
    except Exception:
        # Best effort only: report the failure and carry on without spaCy.
        print(f"WARN: spacy could not be loaded. This is not necessary to run the notebook. :\n{traceback.format_exc()}")
    else:
        print("spacy loaded!", flush=True)
        HAS_SPACY = True

In [None]:
def create_desc_stats():
    """Return a fresh stats record holding one empty list per tracked metric."""
    metric_names = (
        "num_nouns",
        "num_verbs",
        "num_sents",
        "num_words",
        "words_per_sentence",
    )
    return {name: [] for name in metric_names}

# Unique nouns & verbs by category (category -> token text -> count), plus
# per-category lists of per-description statistics. All three also carry an
# "all" entry aggregating over categories. They stay empty when spaCy is unavailable.
noun_counts = defaultdict(lambda: defaultdict(int))
verb_counts = defaultdict(lambda: defaultdict(int))
desc_stats = defaultdict(create_desc_stats)

if HAS_SPACY:
    print("Processing with spacy. WARN: this will take ~15 minutes")
    for take_uid, x in tqdm(all_descs):
        take = takes_by_uid.get(take_uid)
        if take is None:
            # No metadata for this take (e.g. redacted): skip its descriptions.
            continue

        # Key the counts by category *name*, like coverage_by_task / desc_by_task,
        # so that the per-category lookups of the stats table find them.
        if take["task_id"] is None:
            task = "Dropped"
        else:
            bucket = int(take["task_id"]) // 1000 * 1000
            task = TASK_ID_CAT.get(bucket, TASK_ID_CAT[0])

        doc = nlp(x["text"])
        num_sents = len(list(doc.sents))
        num_words = len(doc)
        # None marks descriptions for which no sentence was detected.
        words_per_sentence = num_words / num_sents if num_sents > 0 else None

        # group the tokens by their class (noun, verb, etc.)
        toks_by_class = defaultdict(list)
        for tok in doc:
            toks_by_class[tok.pos_].append(tok)

        # Proper nouns are counted together with common nouns. The verb tag is
        # "VERB" (singular), the same key the verb-frequency loop relies on.
        nouns = toks_by_class["NOUN"] + toks_by_class["PROPN"]
        verbs = toks_by_class["VERB"]
        num_nouns = len(nouns)
        num_verbs = len(verbs)

        for part in (task, "all"):
            for tok in nouns:
                noun_counts[part][tok.text] += 1
            for tok in verbs:
                # a clitic "'s" tagged as a verb is not counted as a distinct verb
                if tok.text == "'s":
                    continue
                verb_counts[part][tok.text] += 1

            part_stats = desc_stats[part]
            part_stats["num_nouns"].append(num_nouns)
            part_stats["num_verbs"].append(num_verbs)
            part_stats["num_sents"].append(num_sents)
            part_stats["num_words"].append(num_words)
            part_stats["words_per_sentence"].append(words_per_sentence)

In [None]:
# Table of the aggregated ("all") per-description statistics, one column per metric.
# It has no rows when the spaCy pass was skipped.
desc_stats_df = pd.DataFrame(desc_stats["all"])

In [None]:
# Show the first (take_uid, description) pair as a sample.
all_descs[0]

In [None]:
# Headline numbers used by the summary printout further down.
# Word frequencies are ordered from most to least frequent; ties keep their
# original insertion order.
noun_counts_sorted = sorted(noun_counts["all"].items(), key=lambda kv: kv[1], reverse=True)
verb_counts_sorted = sorted(verb_counts["all"].items(), key=lambda kv: kv[1], reverse=True)

# Number of descriptions in each annotation record.
desc_per_ann = np.array([len(ann["descriptions"]) for ann in all_anns])

# Number of takes that have an entry in the annotation file.
num_takes_covered = len(anns)

In [None]:
# Print the dataset-level summary. It is only shown when the spaCy pass ran,
# because the sentence/word/noun/verb figures come from that pass.
# "# Descriptions" counts individual descriptions (len(all_descs)); len(anns)
# would be the number of takes, which is already reported as "Takes Annotated".
if HAS_SPACY:
    print(f"""
      # Annotations = {len(all_anns)}
      # Takes Annotated = {num_takes_covered}
      Unique Annotations Per Take = {dict(coverage_by_task["all"])}
      # Descriptions = {len(all_descs)}
      Avg Narrations per Annotation = {desc_per_ann.mean():.3f} (std dev = {desc_per_ann.std():.3f})
      # Sentences = {desc_stats_df.num_sents.sum()}
      Avg Sentences per Description = {desc_stats_df.num_sents.mean():.3f} (std dev = {desc_stats_df.num_sents.std():.3f})
      # Words = {desc_stats_df.num_words.sum()}
      Avg Words per Sentence = {desc_stats_df.words_per_sentence.mean():.3f} (std dev = {desc_stats_df.words_per_sentence.std():.3f})
      # Unique Nouns = {len(noun_counts_sorted)}
      # Unique Verbs = {len(verb_counts_sorted)}
      """)

In [None]:
def _takes_with_coverage(coverage, min_annotations):
    """Count the takes that received at least `min_annotations` annotations.

    `coverage` maps an exact number of annotations to the number of takes that
    received exactly that many, so the cumulative figure is the sum over all
    buckets at or above the minimum.
    """
    return sum(
        num_takes
        for num_annotations, num_takes in coverage.items()
        if num_annotations >= min_annotations
    )


# Per-category summary table, one row per category (including the "all" aggregate).
stats = {
    "Category": [],
    "Takes >= 1x Coverage": [],
    "Takes >= 2x Coverage": [],
    "Number of Descriptions": [],
    "Descriptions Per Minute": [],
    "Unique Nouns": [],
    "Unique Verbs": [],
}
for cat in coverage_by_task:
    dpm = np.array(desc_per_minute[cat])
    stats["Category"].append(cat)
    # The ">=" columns are cumulative: a take annotated twice also counts as
    # covered at least once.
    stats["Takes >= 1x Coverage"].append(_takes_with_coverage(coverage_by_task[cat], 1))
    stats["Takes >= 2x Coverage"].append(_takes_with_coverage(coverage_by_task[cat], 2))
    stats["Descriptions Per Minute"].append(f"{dpm.mean():.3f} (+- {dpm.std():.3f})")
    stats["Number of Descriptions"].append(len(desc_by_task[cat]))
    # The noun/verb counts are only populated when the spaCy pass ran; otherwise they are 0.
    stats["Unique Nouns"].append(len(noun_counts[cat]))
    stats["Unique Verbs"].append(len(verb_counts[cat]))

stats_df = pd.DataFrame(stats)
stats_df

## Embed the Descriptions

In [None]:
from sentence_transformers import SentenceTransformer, util

# device=None lets sentence-transformers pick the device itself (a GPU when one
# can be used, otherwise the CPU), so the notebook also runs on machines without CUDA.
embedder = SentenceTransformer('all-mpnet-base-v2', device=None)

def txt_simm(txt1, txt2):
    """Return the dot-product score between the embeddings of two texts.

    `txt1` is encoded as passed in; `txt2` is wrapped in a one-element list
    before encoding. The result is whatever `util.dot_score` returns for the
    two encodings.
    """
    return util.dot_score(embedder.encode(txt1), embedder.encode([txt2]))

In [None]:
# Category name -> embeddings of that category's description texts, in the
# same order as desc_by_task[category].
#
# NOTE:
# "all" repeats every description that is already embedded under its own
# category; the redundancy is accepted because encoding is fast enough.
embs_by_task = {}
_progress = tqdm(desc_by_task.items(), total=len(desc_by_task))
for task, _task_descs in _progress:
    embs_by_task[task] = embedder.encode([_desc["text"] for _desc in _task_descs])

## Cluster the Descriptions

In [None]:
from sklearn.cluster import AgglomerativeClustering, KMeans

In [None]:
def cluster_task(task, num_clusters_or_threshold, cluster_type="kmeans"):
    """Cluster one task's description embeddings and print a sample of every cluster.

    Args:
        task: key into `embs_by_task` / `desc_by_task` selecting what to cluster.
        num_clusters_or_threshold: number of clusters when `cluster_type` is
            "kmeans"; otherwise the distance threshold handed to
            agglomerative clustering.
        cluster_type: "kmeans" selects KMeans; any other value selects
            AgglomerativeClustering.

    Prints, per cluster in ascending id order, up to five randomly sampled
    description texts followed by the cluster size. Returns nothing.
    """
    task_embs = embs_by_task[task]
    task_descs = desc_by_task[task]

    # Follows the sentence-transformers clustering example:
    # https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/clustering/kmeans.py
    clustering_model = (
        KMeans(n_clusters=num_clusters_or_threshold)
        if cluster_type == "kmeans"
        else AgglomerativeClustering(n_clusters=None, distance_threshold=num_clusters_or_threshold)
    )
    clustering_model.fit(task_embs)

    # labels_[i] is the cluster assigned to the i-th embedding / description.
    texts_by_cluster = defaultdict(list)
    for sentence_idx, label in enumerate(clustering_model.labels_):
        texts_by_cluster[label].append(task_descs[sentence_idx]["text"])

    for label in sorted(texts_by_cluster):
        members = texts_by_cluster[label]
        print("Cluster ", label)
        print(random.sample(members, min(len(members), 5)), len(members))
        print("")

In [None]:
# Cluster the "Soccer" descriptions into 20 k-means clusters; the commented line
# below is the agglomerative alternative with a distance threshold of 1.5.
cluster_task("Soccer", 20)
# cluster_task("Soccer", 1.5, "aggl")

In [None]:
# cluster_task("Cooking", 20)
# cluster_task("Cooking", 1.5, "aggl")

In [None]:
# cluster_task("Bike Repair", 30)

In [None]:
# cluster_task("Health", 20)

In [None]:
# cluster_task("Basketball", 1.25, "aggl")

## Embedding Visualization & Dimension Reduction

In [None]:
import plotly.express as px
import plotly.graph_objects as go

from PIL import Image
from sklearn import preprocessing

# PREFER_SKLEARN is the user's switch; USING_SKLEARN records which TSNE
# implementation actually ended up being imported.
USING_SKLEARN = True
PREFER_SKLEARN = True

if PREFER_SKLEARN:
    from sklearn.manifold import TSNE
    print("Using sklearn")
else:
    try:
        # https://pypi.org/project/tsne-torch/
        # https://github.com/CannyLab/tsne-cuda/blob/master/INSTALL.md
        from tsnecuda import TSNE
        USING_SKLEARN = False
        print("Using CannyLab's tsnecuda")
    except Exception:
        # Best-effort fallback: if tsnecuda cannot be imported for any reason,
        # report why and use sklearn's implementation instead.
        from sklearn.manifold import TSNE
        print(f"WARN: sklearn TSNE is used and not preferred: {traceback.format_exc()}")

In [None]:
# Pick the task category to work on and fetch its embeddings and descriptions.
task = "Soccer"  # NOTE: changeme to what you want to cluster
embs = embs_by_task[task]
descs = desc_by_task[task]

In [None]:
# Reduce the selected task's embeddings to 2-D with t-SNE and time the run.
t1 = time.time()
X = embs_by_task[task]
# Rescale every embedding dimension to [-1, 1] before running t-SNE.
X_norm = preprocessing.MinMaxScaler(feature_range=(-1, 1)).fit_transform(X)
# sklearn's TSNE rejects a perplexity that is not smaller than the number of
# samples, so the preferred value is capped for small categories.
perplexity = min(300.0, float(len(X) - 1))  # NOTE: adjust me
X_tsne = TSNE(
    n_components=2,
    verbose=1,
    n_iter=5000,
    perplexity=perplexity,
).fit_transform(X_norm)
t2 = time.time()

In [None]:
# Wall-clock seconds taken by the normalization + t-SNE cell above.
t2 - t1

In [None]:
# Column-oriented data for plotting: the 2-D t-SNE coordinates of each point
# together with the text of the description it was computed from.
xys = X_tsne.tolist()
_x_coords = []
_y_coords = []
for _x_val, _y_val in xys:
    _x_coords.append(_x_val)
    _y_coords.append(_y_val)

data_df = {
    "x": _x_coords,
    "y": _y_coords,
    "description": [_desc["text"] for _desc in descs],
}

In [None]:
# Scatter plot of the 2-D points; the description text is included in each point's hover data.
px.scatter(data_df, x="x", y="y", color=None, hover_data=["description"])