In [None]:
import random
import pprint
import pyarrow.parquet as pq
from datasets import load_dataset
from utils.visualize import plot_tree_timeline
import matplotlib.pyplot as plt

### Streamed loading from Hugging Face

In [None]:
# Open the preview parquet shards on the Hugging Face Hub in streaming mode,
# so samples are pulled on demand while iterating.
dataset = load_dataset(
    "parquet",
    # Plain string literal: there is nothing to interpolate, so no f-prefix.
    data_files="hf://datasets/facebook/Action100M-preview/data/*.parquet",
    streaming=True,
)
it = iter(dataset["train"])

# Fetch the first streamed sample and print it, preserving its key order.
sample = next(it)
pprint.pprint(sample, width=128, sort_dicts=False)

### Loading from local parquet

In [None]:
PARQUET_FILE = "./data/examples.parquet"
pf = pq.ParquetFile(PARQUET_FILE)

# Pick a random row group and read only the three columns used below.
rg_idx = random.randrange(pf.num_row_groups)
table = pf.read_row_group(rg_idx, columns=["video_uid", "metadata", "nodes"])
print(f"Loaded {PARQUET_FILE} row_group={rg_idx} with {table.num_rows} rows")

# Pick a random row from that row group and turn it into a per-row dict.
# `to_pydict()` would return {column: [value]} (each value wrapped in a
# one-element list), which breaks code that indexes `sample['nodes']` and
# `sample['video_uid']` directly; `to_pylist()[0]` yields {column: value}.
row_idx = random.randrange(table.num_rows)
sample = table.slice(row_idx, 1).to_pylist()[0]
pprint.pprint(sample, width=128, sort_dicts=False)

### Visualize the hierarchical structure of the action annotations

In [None]:
# Print summary statistics for the current sample's node tree, then plot it.
nodes = sample['nodes']
node_count = len(nodes)

print(f"\nTotal nodes in Tree-of-Captions: {node_count}")
# Depth is the largest 'level' value plus one (levels look zero-based — TODO confirm).
tree_depth = max(node['level'] for node in nodes) + 1
print(f"Tree depth: {tree_depth} levels")

# Keep only the nodes whose 'gpt' entry is present and not None.
nodes_with_annotation = list(
    filter(lambda node: node.get('gpt') is not None, nodes)
)
print(f"\nTotal nodes with structured annotation: {len(nodes_with_annotation)}/{node_count}")

print(f"Video link: https://www.youtube.com/watch?v={sample['video_uid']}")
fig = plot_tree_timeline(nodes)
plt.show()