In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
import matplotlib.pyplot as plt
import polars as pl
from pathlib import Path

import coola
from coola.utils import str_mapping

In [None]:
# Global plotting configuration: "bmh" style sheet and a wide default figure.
plt.style.use("bmh")
plt.rcParams.update({"figure.figsize": (16, 5)})

In [None]:
from arctix.dataset import ego4d
from arctix.dataset.ego4d import Column

In [None]:
# Location used to store/load the raw data: the "ego4d" sub-directory of the
# directory given by the ARCTIX_DATA_PATH environment variable.
data_path = Path(os.environ["ARCTIX_DATA_PATH"]) / "ego4d"

## Load raw data

You can use the function `arctix.dataset.ego4d.fetch_data` to load the raw data as a `polars.DataFrame`.
Note that if the data has not already been downloaded to the dataset path, `fetch_data` downloads it automatically.

In [None]:
# Load the "train" split; `fetch_data` returns the data together with its metadata.
data_raw, metadata_raw = ego4d.fetch_data(data_path, split="train")
data_raw

In [None]:
# Summary statistics of the raw data.
data_raw.describe()

In [None]:
# Print the raw metadata, formatted with coola's `str_mapping`.
print(str_mapping(metadata_raw))

## Analyze raw data

In [None]:
# Number of rows per clip: group on the clip id column and count each group.
group_len = (
    data_raw
    .group_by(Column.CLIP_ID)
    .len()
)
group_len

In [None]:
# Histogram of the number of rows per clip (the "len" column of `group_len`).
seq_len = group_len["len"]
max_len = seq_len.max()
min_len = seq_len.min()
nbins = max_len - min_len + 1

fig, ax = plt.subplots()
# The lengths are integers, so use one unit-wide bin per possible length with
# edges at k - 0.5 and k + 0.5. Using [min_len, max_len] as the range would make
# the bins narrower than 1, so the bars would drift relative to the integer ticks.
_ = ax.hist(seq_len.to_numpy(), bins=nbins, range=(min_len - 0.5, max_len + 0.5))

_ = ax.set_title("distribution of sequence length")
_ = ax.set_xlabel("sequence length")
_ = ax.set_ylabel("count")

In [None]:
# Histogram of the verb column, using as many bins as there are distinct values.
col = Column.VERB
verb_frame = data_raw.select(pl.col(col))
num_actions = verb_frame.n_unique()

fig, ax = plt.subplots()
_ = ax.hist(verb_frame, bins=num_actions)
_ = ax.set(
    title="distribution of actions",
    xlabel="action",
    ylabel="count",
)
# Rotate the x tick labels by 90 degrees.
_ = ax.tick_params(axis="x", labelrotation=90)

## Prepare data

In [None]:
# Derive the prepared data and metadata from their raw counterparts.
data, metadata = ego4d.prepare_data(data_raw, metadata_raw)
data

In [None]:
# Print the metadata returned by `prepare_data`, formatted with `str_mapping`.
print(str_mapping(metadata))

In [None]:
# Convert the prepared data with `ego4d.to_array`
# (presumably a mapping of arrays, given how it is printed below; TODO confirm).
arrays = ego4d.to_array(data)

In [None]:
# Print coola's summary of `arrays`.
print(coola.summary(arrays))

In [None]:
# Print the content of `arrays`, formatted with `str_mapping`.
print(str_mapping(arrays))

In [None]:
# Convert the prepared data with `ego4d.to_list`; the result exposes `.items()`,
# which the following cell iterates over.
data_list = ego4d.to_list(data)

In [None]:
# Print each key of `data_list` with the first three entries of its value.
for key, value in data_list.items():
    print(key, value[:3])