In [1]:
import h5py
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from collections import defaultdict
from aging.organization.paths import FOLDERS
from aging.size_norm.data import clean
from aging.video import write_movie_av
from scipy.ndimage import rotate

In [2]:
def make_slice(pos, pad, half_width=40):
    """Return a slice of ``2 * half_width`` elements centred on ``int(pos) + pad``.

    Parameters
    ----------
    pos : number
        Coordinate along one axis; truncated with ``int()`` before use.
    pad : int
        Offset added to the truncated coordinate.
    half_width : int, optional
        Half of the slice length. Defaults to 40, i.e. an 80-element slice.

    Returns
    -------
    slice
        ``slice(c - half_width, c + half_width)`` with ``c = int(pos) + pad``.
        No bounds checking is done here; a negative start would be
        interpreted relative to the end of the indexed axis.
    """
    center_pos = int(pos) + pad
    return slice(center_pos - half_width, center_pos + half_width)


def is_ok_position(pos, pad, max_pos, half_width=40):
    """Check that a window around ``int(pos) + pad`` lies strictly inside an axis.

    Parameters
    ----------
    pos : number
        Coordinate along one axis; truncated with ``int()`` before use.
    pad : int
        Offset added to the truncated coordinate.
    max_pos : int
        Length of the axis the window must fit in.
    half_width : int, optional
        Half of the window length. Defaults to 40.

    Returns
    -------
    bool
        True when ``c - half_width > 0`` and ``c + half_width < max_pos`` with
        ``c = int(pos) + pad``. Both comparisons are strict, so a window that
        touches either edge of the axis is rejected.
    """
    center_pos = int(pos) + pad
    lower_ok = (center_pos - half_width) > 0
    upper_ok = (center_pos + half_width) < max_pos
    return lower_ok and upper_ok

In [3]:
# Pipeline version; formatted as a zero-padded two-digit number in the paths below.
version = 7
# Directory shared by both input files, spelled out once so that the two paths
# cannot drift apart when the location changes.
all_data_pca_dir = f"/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}/all_data_pca"
# HDF5 file read per session uuid for syllable labels.
syllable_path = f"{all_data_pca_dir}/syllables.h5"
# HDF5 file read under "scores_idx"/<uuid> for the frame index of each score.
pca_path = f"{all_data_pca_dir}/pca_scores.h5"

In [4]:
# Index every extraction result by experiment folder and session uuid:
#   uuid_map[folder][uuid] -> path of the results_00.h5 file
# Files that cannot be opened or read (OSError) are collected in bad_examples.
uuid_map = {}
bad_examples = []

for folder in tqdm(FOLDERS):
    for file in folder.glob('**/results_00.h5'):
        try:
            with h5py.File(file, 'r') as h5f:
                uuid = h5f['metadata/uuid'][()].decode()
                # the per-folder dict is only created once a uuid was read
                uuid_map.setdefault(folder, {})[uuid] = file
        except OSError:
            bad_examples.append(file)

  0%|          | 0/7 [00:00<?, ?it/s]

In [5]:
# -- which syllable instances are eligible, and how many are drawn --
min_dur, max_dur = 5, 60  # inclusive duration bounds, in frames
pad = 30  # frames shown before and after each syllable instance
n_examples = 40  # at most this many instances are sampled per syllable
seed = 0  # random_state used when sampling instances

# -- what is read from each results file --
frames_key = 'frames'  # or win_size_norm_frames_v4
cm_keys = ['centroid_x_px', 'centroid_y_px', 'angle']

# -- how the crops are drawn --
radius = 2  # half side length of the square marker painted at `center`
center = (80 // 2, 80 // 2)  # hard-coded for aging project
height_thresh = 10  # crop pixels below this value let the existing canvas show through

In [6]:
# Render one "crowd movie" per syllable for every experiment folder.
#
# For each experiment:
#   1. build a frame-level table (syllable label, session uuid, results file,
#      source frame index) from the syllable and PCA HDF5 files,
#   2. mark syllable onsets and compute the duration of every syllable instance,
#   3. sample up to `n_examples` instances per syllable whose duration lies in
#      [min_dur, max_dur],
#   4. read the frames of each instance (plus `pad` frames on both sides),
#      rotate them by the stored angle and paste them at the stored centroid
#      onto a 424 x 512 canvas shared by all instances of that syllable,
#   5. write each canvas stack to an mp4 file.
base_crowd_movie_folder = Path(
    f"/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}/all_data_pca/crowd_movies"
)
for FOLDER in tqdm(FOLDERS, desc="Processing experiments"):
    crowd_movie_folder = base_crowd_movie_folder / FOLDER.name
    # An experiment for which no results file could be indexed has no entry in
    # uuid_map; treat it as "no sessions" instead of raising KeyError.
    uuid_dict = uuid_map.get(FOLDER, {})

    # open syllable and pca h5 file and build one table per session
    session_dfs = []
    with h5py.File(syllable_path, "r") as syll_h5, h5py.File(pca_path, "r") as pca_h5:
        for uuid, file in uuid_dict.items():
            try:
                session_dfs.append(
                    pd.DataFrame(
                        dict(
                            syllables=syll_h5[uuid][()],
                            uuid=uuid,
                            file=str(file),
                            frame_idx=pca_h5["scores_idx"][uuid][()],
                        )
                    )
                )
            except KeyError:
                # session is missing from the syllable or PCA file: skip it
                continue

    if not session_dfs:
        # pd.concat would raise on an empty list
        print(f"No labeled sessions found for {FOLDER.name}; skipping")
        continue

    # concatenate dataframe, compute syllable durations
    df = pd.concat(session_dfs, ignore_index=True)
    # A frame is an onset when its label differs from the previous frame of the
    # same session; the first frame of a session always counts (diff is NaN).
    df["onsets"] = df.groupby("uuid", sort=False)["syllables"].transform(
        lambda v: v.diff() != 0
    )
    idx = np.where(df["onsets"])[0]
    # duration = distance to the next onset; the last instance runs to the end
    df.loc[df["onsets"], "duration"] = np.append(np.diff(idx), [len(df) - idx[-1]])
    df["duration"] = df["duration"].ffill().astype("uint16")
    # collected before rows without a frame index are dropped
    syllables = df["syllables"].unique()
    # chained so the cast is applied to the new frame returned by dropna
    df = df.dropna(subset=["frame_idx"]).astype({"frame_idx": "uint32"})
    # syllable -> rank when ordered by number of instances (0 = most used)
    usages_map = dict(
        map(reversed, enumerate(df.query("onsets")["syllables"].value_counts().index))
    )

    crowd_matrix_dict = {}
    total_examples = []

    for syllable in syllables:
        mask = (
            (df["syllables"] == syllable)
            & df["duration"].between(min_dur, max_dur)
            & df["onsets"]
            & (df["frame_idx"] > pad)  # leaves room for `pad` frames before the onset
        )
        examples = df[mask].sample(n=min(n_examples, mask.sum()), random_state=seed)
        examples = examples.sort_values(by=["file", "frame_idx"])
        if len(examples) == 0:
            continue
        total_examples.append(examples)
        # one canvas stack per syllable, long enough for its longest example
        crowd_matrix_dict[syllable] = np.zeros(
            (examples["duration"].max() + pad * 2, 424, 512), dtype="uint8"
        )

    if not total_examples:
        # no syllable had an eligible instance; pd.concat would raise
        print(f"No eligible syllable examples for {FOLDER.name}; skipping")
        continue
    total_examples = pd.concat(total_examples, ignore_index=True)

    for file, _df in tqdm(
        total_examples.groupby("file", sort=False), desc="Extracting syllables"
    ):
        with h5py.File(file, "r") as h5f:
            # top-left corner of the non-zero region of the extraction ROI;
            # added to the centroids before pasting
            # (presumably centroids are stored relative to the ROI -- confirm)
            extents = np.where(h5f["metadata/extraction/roi"])
            y_min = extents[0].min()
            x_min = extents[1].min()
            for _, row in _df.sort_values(by="frame_idx").iterrows():
                agg_feats = {}
                duration = int(row.duration)
                start = int(row.frame_idx) - pad
                end = start + duration + 2 * pad
                agg_feats["frames"] = np.array(
                    [clean(f) for f in h5f[frames_key][start:end]]
                )
                for key in cm_keys:
                    agg_feats[key] = h5f["scalars"][key][start:end]
                # NOTE(review): a flip correction (angle -= pi * flips) was
                # commented out in the previous version and is not applied;
                # confirm the stored angle is already flip-corrected.
                agg_feats["angle"] = np.rad2deg(agg_feats["angle"])
                rotated = np.array(
                    [
                        rotate(f, a, reshape=False)
                        for f, a in zip(agg_feats["frames"], agg_feats["angle"])
                    ],
                    dtype="uint8",
                )
                # Paint a marker at the crop centre on the frames of the
                # syllable itself. The range is given from the start of the
                # clip so it stays correct when the clip was cut short by the
                # end of the recording, and when pad == 0.
                rotated[
                    pad : pad + duration,
                    center[0] - radius : center[0] + radius,
                    center[1] - radius : center[1] + radius,
                ] = 255
                canvas = crowd_matrix_dict[row.syllables]
                for i, (x, y, frame) in enumerate(
                    zip(
                        agg_feats["centroid_x_px"],
                        agg_feats["centroid_y_px"],
                        rotated,
                    )
                ):
                    if np.isnan(x) or np.isnan(y):
                        continue
                    # skip frames whose crop would not fit inside the canvas
                    if not (
                        is_ok_position(y, y_min, 424)
                        and is_ok_position(x, x_min, 512)
                    ):
                        continue
                    ys, xs = make_slice(y, y_min), make_slice(x, x_min)
                    # Where the crop is below height_thresh the existing canvas
                    # content is added back. The sum is formed in uint16 and
                    # capped at 255 so it cannot wrap around in uint8.
                    show_through = (frame < height_thresh) * canvas[i, ys, xs]
                    canvas[i, ys, xs] = np.minimum(
                        frame.astype("uint16") + show_through, 255
                    ).astype("uint8")

    for syllable, mov in tqdm(crowd_matrix_dict.items(), desc="Writing movies"):
        write_movie_av(
            crowd_movie_folder
            / f"usage_order_{usages_map[syllable]:02d}-syllable_{syllable:02d}.mp4",
            np.clip(mov, 0, 95),
            cmap="cubehelix",
        )

Processing experiments:   0%|          | 0/7 [00:00<?, ?it/s]

Extracting syllables:   0%|          | 0/224 [00:00<?, ?it/s]

Writing movies:   0%|          | 0/72 [00:00<?, ?it/s]

Extracting syllables:   0%|          | 0/360 [00:00<?, ?it/s]

Writing movies:   0%|          | 0/75 [00:00<?, ?it/s]

Extracting syllables:   0%|          | 0/989 [00:00<?, ?it/s]

Writing movies:   0%|          | 0/72 [00:00<?, ?it/s]

Extracting syllables:   0%|          | 0/870 [00:00<?, ?it/s]

Writing movies:   0%|          | 0/72 [00:00<?, ?it/s]

Extracting syllables:   0%|          | 0/876 [00:00<?, ?it/s]

Writing movies:   0%|          | 0/72 [00:00<?, ?it/s]

Extracting syllables:   0%|          | 0/840 [00:00<?, ?it/s]

Writing movies:   0%|          | 0/73 [00:00<?, ?it/s]

Extracting syllables:   0%|          | 0/169 [00:00<?, ?it/s]

Writing movies:   0%|          | 0/72 [00:00<?, ?it/s]