In [3]:
import json
import numpy as np
import operator
import os
import pandas as pd

from functools import reduce
from pathlib import Path
from tqdm.auto import tqdm
from typing import Any, Dict, Mapping, Optional, Sequence

import keypoint_moseq as kpms

  from .autonotebook import tqdm as notebook_tqdm
2026-01-28 14:27:35.579288: W external/org_tensorflow/tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /projects/compsci/jgeorge/USERS/chouda/miniforge3/envs/keypoint_moseq_gpu/lib/python3.9/site-packages/cv2/../../lib64:/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
2026-01-28 14:27:35.579306: W external/org_tensorflow/tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)


In [15]:
# Project root comes from the environment; a missing variable raises KeyError here.
unsupervised_aging_dir = Path(os.environ["UNSUPERVISED_AGING"])

# Selects which (project_name, model_name) pair below is used for this run.
mode = "do"

# NOTE(review): the metasyllable groupings further down also define a
# "combined" mode, which has no entry in this lookup — selecting it would
# raise KeyError on the line below.
project_name, model_name = dict(
    b6=("2025-07-03_kpms-v2", "2025-07-07_model-2"),
    do=("2025-07-16_kpms-v3", "2025-07-16_model-4"),
)[mode]

kpms_dir = unsupervised_aging_dir.joinpath("data/kpms_projects")
dataset_dir = unsupervised_aging_dir.joinpath("data/datasets/2025-12-19_missing-files/")

# Directory of the selected project inside the projects folder.
project_dir = kpms_dir.joinpath(project_name)

In [16]:
# Load the saved results for the selected project/model. Later cells iterate
# `results` as a mapping (`.items()`, `.values()`, `.keys()`) and read the
# "syllable" and "latent_state" fields of every entry.
results = kpms.load_results(project_dir, model_name)
print(f"num entries: {len(results)}")

num entries: 15


In [17]:
def _get_latent_embedding_statistics() -> Dict[str, Sequence]:
    """Summarise the latent embedding of every entry in the module-level ``results``.

    For each entry, ``info["latent_state"]`` is reduced along axis 0 to its
    mean, its median and its population standard deviation (``ddof=0``).
    Assumes the array is laid out as (frames, dims) — TODO confirm.

    Returns:
        A dict whose keys are ``latent_embedding_{mean|median|std}_{k}`` (all
        means first, then medians, then stds) and whose values are lists with
        one number per entry, in the iteration order of ``results``. Empty
        when ``results`` is empty.
    """
    stat_names = ("mean", "median", "std")

    # One flat vector per entry: [means..., medians..., stds...].
    summaries = []
    for _key, entry in tqdm(results.items()):
        latent = entry["latent_state"]
        summaries.append(
            np.concatenate([
                latent.mean(axis=0),
                np.median(latent, axis=0),
                latent.std(axis=0, ddof=0),
            ])
        )

    # Transpose from "one vector per entry" to "one list per feature".
    columns = [list(column) for column in zip(*summaries)]
    n_features = len(columns)
    assert n_features % 3 == 0
    dims = n_features // 3

    named = {}
    for position, column in enumerate(columns):
        # The first third are means, the second medians, the last stds.
        stat = stat_names[position // dims]
        named[f"latent_embedding_{stat}_{position % dims}"] = column
    return named

# Per-entry latent-embedding summary columns, computed from the module-level `results`.
latent_embedding_statistics = _get_latent_embedding_statistics()

  0%|                                                                                                                                         | 0/15 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 35.98it/s]


In [18]:
def _get_syllable_frequency_statistics(th: float = 0.0) -> Dict[str, Sequence[float]]:
    """Compute bout statistics per syllable for every entry in the module-level ``results``.

    A bout is a maximal run of identical consecutive labels in an entry's
    ``"syllable"`` sequence.

    Args:
        th: When greater than 0, only syllables whose share of all labels
            (pooled over every entry) is at least ``th`` are reported.

    Returns:
        A dict with three lists per retained syllable ``s`` — one value per
        entry, in the iteration order of ``results``:
        ``avg_bout_length_{s}`` (labels per bout, 0.0 when the syllable has
        no bout), ``total_duration_{s}`` and ``num_bouts_{s}``. The total
        duration is the summed per-label duration when the entry carries a
        duration field of matching length, otherwise the label count as a
        float. Returns ``{}`` when no syllable is retained.
    """
    duration_keys = ("durations", "duration", "syllable_durations", "syllable_duration")

    all_labels = [entry["syllable"] for entry in results.values()]
    kept = sorted({label for labels in all_labels for label in labels})

    if th > 0.0:
        # Pooled occurrence counts decide which syllables pass the threshold.
        occurrences = dict.fromkeys(kept, 0)
        for labels in all_labels:
            for label in labels:
                occurrences[label] += 1
        n_frames = sum(occurrences.values())
        kept = [label for label in kept if n_frames and occurrences[label] / n_frames >= th]

    if not kept:
        return {}

    column_of = {label: slot for slot, label in enumerate(kept)}
    n_kept = len(kept)

    out = {}
    for label in kept:
        out.update({
            f"avg_bout_length_{label}": [],
            f"total_duration_{label}": [],
            f"num_bouts_{label}": [],
        })

    for _key, entry in tqdm(results.items()):
        labels = entry["syllable"]

        # First duration-like field whose length matches the label sequence, if any.
        durations = next(
            (
                entry[key]
                for key in duration_keys
                if key in entry and hasattr(entry[key], "__len__") and len(entry[key]) == len(labels)
            ),
            None,
        )

        frames_in = [0] * n_kept
        bouts_in = [0] * n_kept
        weighted = [0.0] * n_kept

        def close_run(label, length):
            # Book a finished run; runs of filtered-out labels (and the
            # initial ``None`` placeholder) are ignored.
            if label in column_of:
                slot = column_of[label]
                frames_in[slot] += length
                bouts_in[slot] += 1

        current, streak = None, 0
        for pos, label in enumerate(labels):
            if label == current:
                streak += 1
            else:
                close_run(current, streak)
                current, streak = label, 1
            if label in column_of and durations is not None:
                weighted[column_of[label]] += float(durations[pos])
        # The last run is still open when the sequence ends.
        close_run(current, streak)

        # Without explicit durations the label count stands in for duration.
        totals = weighted if durations is not None else [float(count) for count in frames_in]

        for slot, label in enumerate(kept):
            n_bouts = bouts_in[slot]
            mean_len = frames_in[slot] / n_bouts if n_bouts else 0.0
            out[f"avg_bout_length_{label}"].append(mean_len)
            out[f"total_duration_{label}"].append(totals[slot])
            out[f"num_bouts_{label}"].append(int(n_bouts))

    return out

# Per-entry bout statistics for every syllable (default th=0.0, so no syllable is filtered out).
syllable_frequency_statistics = _get_syllable_frequency_statistics()

  0%|                                                                                                                                         | 0/15 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 68.14it/s]


In [19]:
def _get_metasyllable_transition_matrix(
    grouped_syllables: Optional[Mapping[str, Sequence[int]]] = None,
    *,
    ignore_unknown: bool = False,
    include_frequencies: bool = True,
) -> Dict[str, Sequence[float]]:
    """Compute transition probabilities between groups of syllables ("metasyllables").

    For every entry of the module-level ``results``, consecutive labels of
    its ``"syllable"`` sequence are mapped to their groups and transitions
    between *different* groups are counted. Each row of the count matrix is
    then normalised to sum to 1 (rows without any outgoing transition stay
    all-zero).

    Args:
        grouped_syllables: Mapping from group name to the syllable indices it
            contains. ``None`` means "no explicit groups". The mapping passed
            in is never modified.
        ignore_unknown: When False, every syllable index in
            ``range(max_observed + 1)`` that belongs to no group is collected
            in a group named ``"unknown"`` (merged into a caller-supplied
            ``"unknown"`` group if there is one). When True, labels outside
            every group are skipped.
        include_frequencies: Also emit ``metasyllable_frequency_{group}``
            columns: the share of an entry's labels that fall in the group.
            The denominator is the full sequence length, so skipped labels
            still count towards it.

    Returns:
        A dict of lists with one value per entry, in the iteration order of
        ``results``: ``transition_matrix_{a}_{b}`` for every ordered pair of
        distinct groups, plus the frequency columns when requested. With no
        entries the lists are empty.

    Raises:
        ValueError: If a group references an index above the largest observed
            syllable, or if an index appears in more than one group.
    """
    # Work on a private copy so the caller's mapping is never touched.
    groups = dict(grouped_syllables) if grouped_syllables is not None else {}

    sequences = [info["syllable"] for info in results.values()]
    # `default=None` keeps empty input (no entries, or only empty sequences)
    # from blowing up inside max(); there is simply no vocabulary to check
    # the groups against in that case.
    largest = max((s for seq in sequences for s in seq), default=None)
    has_vocabulary = largest is not None
    all_indices = set(range(largest + 1)) if has_vocabulary else set()

    seen = set()
    for name, idxs in groups.items():
        if has_vocabulary:
            bad = set(idxs) - all_indices
            if bad:
                raise ValueError(f"Group '{name}' contains invalid indices {sorted(bad)}.")
        if seen.intersection(idxs):
            raise ValueError("Duplicate indices detected across groups.")
        seen.update(idxs)

    if not ignore_unknown:
        unknown = sorted(all_indices - seen)
        if unknown:
            # Merge rather than overwrite: replacing a caller-supplied
            # "unknown" group would silently drop its indices from all groups.
            groups["unknown"] = [*groups.get("unknown", ()), *unknown]

    names = list(groups)
    g = len(names)
    feats = {f"transition_matrix_{a}_{b}": [] for a in names for b in names if a != b}
    if include_frequencies:
        feats.update({f"metasyllable_frequency_{n}": [] for n in names})

    # Groups are disjoint (checked above), so each index maps to one group.
    idx_to_group = {idx: gi for gi, name in enumerate(names) for idx in groups[name]}

    for _, info in tqdm(results.items()):
        # Group id per label; None marks a label that belongs to no group.
        group_ids = [idx_to_group.get(s) for s in info["syllable"]]

        G = np.zeros((g, g), dtype=float)
        for a, b in zip(group_ids[:-1], group_ids[1:]):
            # Within-group steps are excluded (the diagonal stays zero).
            if a is not None and b is not None and a != b:
                G[a, b] += 1

        row_sums = G.sum(axis=1, keepdims=True)
        np.divide(G, row_sums, out=G, where=row_sums != 0)

        for i, ai in enumerate(names):
            for j, bj in enumerate(names):
                if ai != bj:
                    feats[f"transition_matrix_{ai}_{bj}"].append(G[i, j])

        if include_frequencies:
            # Float from the start so an empty sequence yields 0.0, not int 0.
            counts = np.zeros(g, dtype=float)
            for gid in group_ids:
                if gid is not None:
                    counts[gid] += 1
            if group_ids:
                counts /= len(group_ids)
            for i, name in enumerate(names):
                feats[f"metasyllable_frequency_{name}"].append(counts[i])
    return feats


# Syllable indices per metasyllable group, one table per `mode`; only the
# table for the active mode is kept. The group names become part of the
# feature column names (`transition_matrix_{a}_{b}`), so they must be spelled
# the same way in every mode. The "do" table previously used
# "kpms_dendogram_*", which made its columns differ from "b6"/"combined";
# fixing the spelling renames the "do" output columns accordingly.
# NOTE(review): "combined" has no matching entry in the project/model lookup
# near the top of the notebook — confirm whether that mode is still in use.
_metasyllable_groupings = {
    "b6": {
        "kpms_dendrogram_0": [0, 2, 10, 54, 35, 9, 30, 16, 26, 20, 6, 15],
        "kpms_dendrogram_1": [24, 42, 52, 50, 48, 57, 33, 38, 60, 12, 58, 22, 43],
        "kpms_dendrogram_2": [19, 59, 1, 3, 14, 18, 34, 5, 7, 46, 40, 4, 11, 45],
        "kpms_dendrogram_3": [13, 8, 17, 39, 51, 21, 36, 61, 31, 49, 28, 44, 55, 37, 25, 32, 27, 56],
        "kpms_dendrogram_4": [53, 62, 29, 41, 23, 47]
    },
    "do": {
        "kpms_dendrogram_0": [12, 20, 28, 14, 26],
        "kpms_dendrogram_1": [33, 23, 39, 13, 3, 11, 18],
        "kpms_dendrogram_2": [24, 9, 6, 25, 15, 21, 16, 35, 2, 10, 17],
        "kpms_dendrogram_3": [4, 34, 22, 30, 27, 29, 32, 19],
        "kpms_dendrogram_4": [5, 7, 8, 43, 55, 0, 1, 31],
    },
    "combined": {
        "kpms_dendrogram_0": [41, 11, 23, 39, 22, 37, 28, 32, 5, 34, 1, 31, 20, 13, 25],
        "kpms_dendrogram_1": [45, 46],
        "kpms_dendrogram_2": [44, 50, 4, 2, 18, 53, 24, 8, 35, 14, 15, 10, 17, 26, 30, 7, 43, 9, 42, 48, 6, 29],
        "kpms_dendrogram_3": [47, 27, 36],
        "kpms_dendrogram_4": [40, 21, 12, 33, 16, 0, 3, 19, 38],
    }
}[mode]

# c_1204
# _metasyllable_groupings = {
#     "kpms_dendrogram_0": [19, 30],
#     "kpms_dendrogram_1": [40, 34, 3, 5, 12, 31, 18, 22, 39, 2, 10],
#     "kpms_dendrogram_2": [38, 46, 29, 11, 42, 33, 7, 13, 0, 17, 1, 9, 24, 36, 26, 16, 32, 20, 8, 21],
#     "kpms_dendrogram_3": [27, 35],
#     "kpms_dendrogram_4": [23, 37, 25, 28, 15, 6, 4, 14]
# }

# Transition features between the dendrogram-derived groups only: labels that
# belong to no group are skipped and no frequency columns are produced.
kpms_dendrogram_metasyllable_transition_matrix = _get_metasyllable_transition_matrix(
    _metasyllable_groupings,
    ignore_unknown=True,
    include_frequencies=False,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 25.44it/s]


In [20]:
def _merge_features(
    features: Sequence[Dict[str, Sequence[Any]]]
) -> pd.DataFrame:
    """Combine several feature blocks into one table with a leading ``name`` column.

    Args:
        features: Feature blocks (any sequence — list or tuple), each mapping
            a column name to one value per entry of the module-level
            ``results``, in its iteration order.

    Returns:
        A DataFrame whose first column, ``name``, holds the keys of
        ``results`` with a trailing ``.csv`` removed, followed by the columns
        of every block in the order given. If two blocks share a column name
        the later block wins and a warning is emitted.
    """
    import warnings  # local import: only needed for the collision warning

    names = [path.removesuffix(".csv") for path in results.keys()]

    merged_features: Dict[str, Sequence[Any]] = {"name": names}
    for block in features:
        clashes = merged_features.keys() & block.keys()
        if clashes:
            # Overwriting is kept for compatibility, but no longer silent.
            warnings.warn(
                f"Feature columns defined more than once (last one wins): {sorted(clashes)}",
                stacklevel=2,
            )
        merged_features.update(block)
    return pd.DataFrame(merged_features)

# Assemble the final feature table; columns follow the order of the blocks below.
unsupervised_features_df = _merge_features(
    [
        latent_embedding_statistics,
        syllable_frequency_statistics,
        # old_syllable_frequency_statistics,
        kpms_dendrogram_metasyllable_transition_matrix,
    ]
)
# Show every column name as a plain list.
unsupervised_features_df.columns.tolist()

['name',
 'latent_embedding_mean_0',
 'latent_embedding_mean_1',
 'latent_embedding_mean_2',
 'latent_embedding_mean_3',
 'latent_embedding_mean_4',
 'latent_embedding_mean_5',
 'latent_embedding_mean_6',
 'latent_embedding_mean_7',
 'latent_embedding_mean_8',
 'latent_embedding_mean_9',
 'latent_embedding_mean_10',
 'latent_embedding_mean_11',
 'latent_embedding_median_0',
 'latent_embedding_median_1',
 'latent_embedding_median_2',
 'latent_embedding_median_3',
 'latent_embedding_median_4',
 'latent_embedding_median_5',
 'latent_embedding_median_6',
 'latent_embedding_median_7',
 'latent_embedding_median_8',
 'latent_embedding_median_9',
 'latent_embedding_median_10',
 'latent_embedding_median_11',
 'latent_embedding_std_0',
 'latent_embedding_std_1',
 'latent_embedding_std_2',
 'latent_embedding_std_3',
 'latent_embedding_std_4',
 'latent_embedding_std_5',
 'latent_embedding_std_6',
 'latent_embedding_std_7',
 'latent_embedding_std_8',
 'latent_embedding_std_9',
 'latent_embedding_st

In [None]:
# NOTE(review): the file name hard-codes a "2025-01-28" date prefix (the
# dataset directory configured above is dated 2025-12-19 — confirm the year)
# and a trailing "15", which matches the "num entries: 15" printed when the
# results were loaded — presumably the entry count; update both by hand if
# the inputs change.
path_name = unsupervised_aging_dir / "final_data_curation" / f"2025-01-28_{mode}_15.csv"
# `to_csv` does not create missing directories, so make sure the target folder exists.
path_name.parent.mkdir(parents=True, exist_ok=True)
unsupervised_features_df.to_csv(path_name, index=False)
print(f"Saved to {path_name}")

Saved to /projects/kumar-lab/miaod/projects/unsupervised-aging/final_data_curation/2025-01-28_do_15.csv
