In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
import matplotlib.pyplot as plt
import polars as pl
from pathlib import Path

import coola
from coola.utils import str_mapping
from collections import Counter

In [None]:
# Notebook-wide plotting defaults: apply the "bmh" style sheet first, then
# override the default figure size so it is not reset by the style sheet.
plt.style.use("bmh")
plt.rcParams.update({"figure.figsize": (16, 5)})

In [None]:
from arctix.dataset import epic_kitchen_100
from arctix.dataset.epic_kitchen_100 import Column
from arctix.utils.ngram import find_seq_ngrams, plot_ngrams

In [None]:
# Directory used to store/load the raw data: the "epic_kitchen_100" folder
# under the root given by the ARCTIX_DATA_PATH environment variable.
# A KeyError is raised if the variable is not set.
data_path = Path(os.environ["ARCTIX_DATA_PATH"]) / "epic_kitchen_100"

In [None]:
# Disabled: optional polars display settings (table columns/rows, table width,
# list cell length). Uncomment the line below to apply them.
# config = pl.Config.set_tbl_cols(-1).set_tbl_rows(10).set_tbl_width_chars(100).set_fmt_table_cell_list_len(100)

## Load raw data

In [None]:
# Fetch the "train" split from data_path; the call returns the raw data
# together with its metadata.
data_raw, metadata_raw = epic_kitchen_100.fetch_data(data_path, split="train")
data_raw

In [None]:
# Summary statistics of the raw data.
data_raw.describe()

In [None]:
# Keep only the rows whose "all_nouns" list has more than one entry.
has_several_nouns = pl.col("all_nouns").list.len() > 1
data_raw.filter(has_several_nouns)

In [None]:
# Print the raw metadata, formatted with coola's str_mapping.
print(str_mapping(metadata_raw))

## Analyze raw data

In [None]:
# Count the rows of each video; the count lands in the "len" column and is
# used below as the sequence length of the video.
group_len = data_raw.group_by([Column.VIDEO_ID]).agg(pl.len())
group_len

In [None]:
# Histogram of the per-video sequence lengths.
# nbins equals the number of integer values in [min_len, max_len].
seq_lengths = group_len.select(pl.col("len"))
max_len = seq_lengths.max().item()
min_len = seq_lengths.min().item()
nbins = max_len - min_len + 1
print(f"min={min_len}  max={max_len}  nbins={nbins}")

fig, ax = plt.subplots()
_ = ax.hist(seq_lengths, bins=nbins, range=[min_len, max_len])
_ = ax.set(
    title="distribution of sequence length",
    xlabel="sequence length",
    ylabel="count",
)

In [None]:
# Histogram of the verb column, using as many bins as there are distinct values.
col = Column.VERB
actions = data_raw.select(pl.col(col))
num_actions = actions.n_unique()

fig, ax = plt.subplots()
_ = ax.hist(actions, bins=num_actions)
_ = ax.set(title="distribution of actions", xlabel="action", ylabel="count")
# Rotate the x tick labels by 90 degrees.
_ = ax.tick_params(axis="x", labelrotation=90)

## Prepare data

In [None]:
# Derive the prepared data and metadata from the raw data and raw metadata.
data, metadata = epic_kitchen_100.prepare_data(data_raw, metadata_raw)
data

In [None]:
# Print the prepared metadata, formatted with coola's str_mapping.
print(str_mapping(metadata))

In [None]:
# Convert the prepared data with to_array.
# NOTE(review): presumably a mapping of arrays — confirm against arctix.
arrays = epic_kitchen_100.to_array(data)

In [None]:
# Print coola's summary of the converted arrays.
print(coola.summary(arrays))

In [None]:
# Print the converted arrays themselves, formatted with str_mapping.
print(str_mapping(arrays))

In [None]:
# Convert the prepared data with to_list; the result is accessed by key
# and iterated with .items() further down.
data_list = epic_kitchen_100.to_list(data)

In [None]:
# Preview: print each key followed by the first three elements of its value.
for key, value in data_list.items():
    preview = value[:3]
    print(key, preview)

## Analyze n-grams

In [None]:
# Compute the n-grams with n=2 over the "verb" entry of data_list, then show
# the ten most frequent ones.
# NOTE(review): `ngrams` is reused by the plotting cell below; this assumes
# find_seq_ngrams returns a re-iterable collection (not a one-shot iterator
# that Counter would exhaust) — confirm.
verb_sequences = data_list["verb"]
ngrams = find_seq_ngrams(verb_sequences, n=2)
counter = Counter(ngrams)
counter.most_common(10)

In [None]:
# Plot the n-grams on a dedicated square figure (16x16), larger than the
# notebook default.
fig, ax = plt.subplots(figsize=(16, 16))
plot_ngrams(ngrams, ax=ax)