In [None]:
# IPython magic: render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [None]:
import os
import math
import matplotlib.pyplot as plt
import polars as pl
import numpy as np
import seaborn as sns
from pathlib import Path

import coola
from coola.utils import str_mapping, repr_mapping

In [None]:
# Global plotting defaults for this notebook: the "bmh" style sheet and a
# wide (16 x 5 inch) default figure size.
plt.style.use("bmh")
plt.rcParams.update({"figure.figsize": (16, 5)})

In [None]:
from arctix.dataset.multithumos import Column, fetch_data, prepare_data, to_array_data
from arctix.utils.masking import convert_sequences_to_array, generate_mask_from_lengths

In [None]:
# Dataset directory: the "multithumos" sub-folder of the root directory given
# by the ARCTIX_DATA_PATH environment variable (KeyError if it is not set).
data_path = Path(os.environ['ARCTIX_DATA_PATH']) / "multithumos"

## Load raw data

In [None]:
# Fetch the raw MultiTHUMOS data from `data_path` with arctix's `fetch_data`.
# NOTE(review): the result is used below through the polars DataFrame API
# (`describe`, `group_by`, `select`) - presumably a `pl.DataFrame`; confirm.
data_raw = fetch_data(data_path)
# Last expression of the cell: display the raw data.
data_raw

In [None]:
# Display summary statistics of the raw data.
data_raw.describe()

## Analyze raw data

In [None]:
# Count the rows of the raw data that belong to each video; the per-video
# count is stored in a column named "len".
group_len = data_raw.group_by(Column.VIDEO).agg(pl.len())
group_len

In [None]:
# Histogram of the number of rows per video ("sequence length").
seq_lengths = group_len.get_column('len')
max_len = seq_lengths.max()
min_len = seq_lengths.min()
nbins = max_len - min_len + 1
print(f'min={min_len}  max={max_len}  nbins={nbins}')

# One unit-width bin per integer length, centred on that integer
# (edges at k - 0.5 and k + 0.5). Asking for `nbins` equal bins over
# [min_len, max_len] instead makes every bin slightly narrower than 1, so the
# bars drift away from the integer they represent. Explicit edges also handle
# the case min_len == max_len without relying on automatic range expansion.
bin_edges = np.arange(min_len - 0.5, max_len + 1.5)

fig, ax = plt.subplots()
# Pass a 1-D array so matplotlib sees a single dataset.
_ = ax.hist(seq_lengths.to_numpy(), bins=bin_edges)

_ = ax.set_title('distribution of sequence length')
_ = ax.set_xlabel('sequence length')
_ = ax.set_ylabel('count')

In [None]:
# Histogram of how often each action occurs in the raw data, using one bin
# per distinct action value.
actions = data_raw.select(Column.ACTION)
num_actions = actions.n_unique()

fig, ax = plt.subplots()
_ = ax.hist(actions, bins=num_actions)
_ = ax.set(
    title='distribution of actions',
    xlabel='action',
    ylabel='count',
)
# Rotate the x tick labels so that they do not overlap.
ax.tick_params(axis="x", labelrotation=90)

## Prepare data

In [None]:
# Run arctix's `prepare_data` on the raw data; it returns a pair that is
# unpacked into the prepared data and its associated metadata.
data, metadata = prepare_data(data_raw)
# Last expression of the cell: display the prepared data.
data

In [None]:
# Print the metadata as formatted by coola's `str_mapping`.
# NOTE(review): presumably `metadata` is a mapping - confirm against `prepare_data`.
print(str_mapping(metadata))

In [None]:
# Convert the prepared data with arctix's `to_array_data`.
# NOTE(review): presumably this yields a mapping of arrays - confirm.
arrays = to_array_data(data)
# Print two views of the result: coola's summary, then the `str_mapping` rendering.
print(coola.summary(arrays))
print(str_mapping(arrays))