In [38]:
import json
import operator
import os
from datetime import datetime
from functools import reduce
from pathlib import Path
from typing import Any, Dict, Mapping, Optional, Sequence

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import keypoint_moseq as kpms

In [39]:
# Root of the project tree, taken from the environment; indexing os.environ raises
# KeyError when UNSUPERVISED_AGING is not set.
unsupervised_aging_dir = Path(os.environ["UNSUPERVISED_AGING"])

# Identifiers of the keypoint-MoSeq project and model whose results are loaded below,
# plus the dataset directory they are paired with.
project_name  = "2025-09-20_kpms-v5_150_6"
model_name    = "2025-09-21_model-1"
kpms_dir      = unsupervised_aging_dir / "data/kpms_projects"
dataset_dir   = unsupervised_aging_dir / "data/datasets/combined_1126/"
poses_csv_dir = dataset_dir / "poses_csv"  # not referenced by the cells visible in this notebook

# CSV inputs: the supervised feature table and the metadata sheet merged in later cells.
supervised_features_path = unsupervised_aging_dir / "data/archive/B6DO_video.csv"
adj_metadata_path = unsupervised_aging_dir / "data/adj_metadata_sheets/combined_1126_adj_metadata.csv"

# Directory handed to kpms.load_results together with model_name.
project_dir = kpms_dir / project_name

In [40]:
# Load the fitted model's results. Later cells treat this as a mapping from a
# recording key (a string, possibly ending in ".csv") to a dict that holds at
# least the "syllable" and "latent_state" entries.
results = kpms.load_results(project_dir, model_name)

In [41]:
# Gather each recording's syllable sequence, then print the number of distinct
# labels seen anywhere and the largest label.
sequences = [entry["syllable"] for entry in results.values()]
unique_syllables = sorted(set().union(*sequences))
print(len(unique_syllables), max(unique_syllables))

95 94


In [42]:
def _get_latent_embedding_statistics() -> Dict[str, Sequence]:
    """Summarise each recording's latent trajectory with per-dimension statistics.

    For every entry of the global ``results`` the ``"latent_state"`` array is
    reduced along axis 0 to its mean, median and population standard deviation
    (``ddof=0``); the three reductions are concatenated in that order.

    Returns:
        Dict keyed ``latent_embedding_{mean|median|std}_{k}``, where ``k`` indexes
        the reduced array. Each value is a list with one entry per recording, in
        ``results`` iteration order.
    """
    per_recording = []
    for _, record in tqdm(results.items()):
        latent = record["latent_state"]
        per_recording.append(
            np.concatenate(
                (
                    latent.mean(axis=0),
                    np.median(latent, axis=0),
                    latent.std(axis=0, ddof=0),
                )
            )
        )

    # Transpose: one list per feature instead of one vector per recording.
    columns = [list(column) for column in zip(*per_recording)]
    n_features = len(columns)
    assert n_features % 3 == 0

    # The concatenation above lays the features out as three equally sized blocks.
    block = n_features // 3
    stat_names = ("mean", "median", "std")
    return {
        f"latent_embedding_{stat_names[pos // block]}_{pos % block}": column
        for pos, column in enumerate(columns)
    }

# Column name -> list of values, one value per recording in `results`.
latent_embedding_statistics = _get_latent_embedding_statistics()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1126/1126 [00:21<00:00, 52.18it/s]


In [43]:
def _get_syllable_frequency_statistics(th: float = 0.0) -> Dict[str, Sequence[float]]:
    """Per-recording bout statistics for every retained syllable.

    A bout is a maximal run of identical consecutive labels in a recording's
    ``"syllable"`` sequence (read from the global ``results``).

    Args:
        th: Minimum share of all frames, pooled over every recording, that a
            syllable must reach to be retained. ``0.0`` keeps every syllable
            that occurs at least once.

    Returns:
        An empty dict when no syllable is retained. Otherwise a dict with, for
        each retained syllable ``s``, three lists holding one entry per
        recording in ``results`` iteration order:

        * ``avg_bout_length_{s}``: frames labelled ``s`` divided by the number
          of bouts of ``s`` (``0.0`` when there are none);
        * ``total_duration_{s}``: sum of the per-frame values of the first
          entry named ``durations``, ``duration``, ``syllable_durations`` or
          ``syllable_duration`` whose length equals the sequence length; when
          no such entry exists, the number of frames labelled ``s``;
        * ``num_bouts_{s}``: number of bouts of ``s``.
    """
    duration_keys = ("durations", "duration", "syllable_durations", "syllable_duration")

    label_seqs = [record["syllable"] for record in results.values()]
    kept = sorted({label for labels in label_seqs for label in labels})
    if th > 0.0:
        frames_per_label = dict.fromkeys(kept, 0)
        for labels in label_seqs:
            for label in labels:
                frames_per_label[label] += 1
        n_frames = sum(frames_per_label.values())
        kept = [
            label
            for label in kept
            if n_frames and frames_per_label[label] / n_frames >= th
        ]

    if not kept:
        return {}

    slot_of = {label: slot for slot, label in enumerate(kept)}
    n_kept = len(kept)

    columns = {
        f"{prefix}_{label}": []
        for label in kept
        for prefix in ("avg_bout_length", "total_duration", "num_bouts")
    }

    for _, record in tqdm(results.items()):
        labels = record["syllable"]

        # First candidate entry that provides exactly one value per frame, if any.
        frame_durations = next(
            (
                record[key]
                for key in duration_keys
                if key in record
                and hasattr(record[key], "__len__")
                and len(record[key]) == len(labels)
            ),
            None,
        )

        frames_in_bouts = [0] * n_kept
        bout_counts = [0] * n_kept
        duration_sums = [0.0] * n_kept

        def _close_run(label, length):
            # Book a finished run; the initial None and filtered labels are skipped.
            if label in slot_of:
                slot = slot_of[label]
                frames_in_bouts[slot] += length
                bout_counts[slot] += 1

        open_label, open_len = None, 0
        for pos, label in enumerate(labels):
            if label == open_label:
                open_len += 1
            else:
                _close_run(open_label, open_len)
                open_label, open_len = label, 1
            if frame_durations is not None and label in slot_of:
                duration_sums[slot_of[label]] += float(frame_durations[pos])
        # The last run is still open when the sequence ends.
        _close_run(open_label, open_len)

        if frame_durations is None:
            duration_sums = [float(count) for count in frames_in_bouts]

        for slot, label in enumerate(kept):
            n_bouts = bout_counts[slot]
            columns[f"avg_bout_length_{label}"].append(
                frames_in_bouts[slot] / n_bouts if n_bouts else 0.0
            )
            columns[f"total_duration_{label}"].append(duration_sums[slot])
            columns[f"num_bouts_{label}"].append(int(n_bouts))

    return columns

# Default th=0.0: every syllable that occurs in any recording gets its three bout columns.
syllable_frequency_statistics = _get_syllable_frequency_statistics()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1126/1126 [00:17<00:00, 63.07it/s]


In [44]:
def _old_get_syllable_frequency_statistics(th: float = 0.0) -> Dict[str, Sequence[float]]:
    """Per-recording fraction of frames assigned to each syllable.

    Syllable sequences are read from the ``"syllable"`` entry of every value in
    the global ``results``.

    Args:
        th: Minimum share of all frames, pooled over every recording, that a
            syllable must reach to be reported. ``0.0`` keeps every syllable
            that occurs at least once.

    Returns:
        Dict keyed ``syllable_frequency_{s}``. Each value lists, in ``results``
        iteration order, the number of frames labelled ``s`` divided by the
        recording's total number of frames (filtered syllables still count in
        the denominator). A recording without frames contributes ``0.0``.
    """
    sequences = [info["syllable"] for info in results.values()]
    uniq = sorted({s for seq in sequences for s in seq})
    if th > 0.0:
        global_counts = {s: 0 for s in uniq}
        for seq in sequences:
            for s in seq:
                global_counts[s] += 1
        total = sum(global_counts.values())
        # Same guard as _get_syllable_frequency_statistics: never divide by a zero total.
        uniq = [s for s in uniq if total and global_counts[s] / total >= th]

    idx = {s: i for i, s in enumerate(uniq)}
    n = len(uniq)

    freqs_per_video = []
    for _, info in tqdm(results.items()):
        seq = info["syllable"]
        cnt = np.zeros(n, dtype=int)
        for s in seq:
            if s in idx:
                cnt[idx[s]] += 1
        total_tokens = len(seq)
        # Always emit floats so a column never mixes int zeros with fractions.
        freqs = cnt / total_tokens if total_tokens else cnt.astype(float)
        freqs_per_video.append(freqs)

    # Transpose: one list per syllable instead of one vector per recording.
    transposed = list(map(list, zip(*freqs_per_video)))
    return {f"syllable_frequency_{s}": transposed[i] for i, s in enumerate(uniq)}

# Legacy feature set (plain frame fractions); exported below under the "__old" X category.
old_syllable_frequency_statistics = _old_get_syllable_frequency_statistics()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1126/1126 [00:27<00:00, 40.74it/s]


In [45]:
def _get_metasyllable_transition_matrix(
    grouped_syllables: Optional[Mapping[str, Sequence[int]]] = None,
    *,
    ignore_unknown: bool = False,
    include_frequencies: bool = True,
) -> Dict[str, Sequence[float]]:
    """Transition probabilities (and optionally frequencies) between syllable groups.

    Syllable sequences are read from the ``"syllable"`` entry of every value in
    the global ``results``. The vocabulary is ``range(max_label + 1)`` over all
    recordings.

    Args:
        grouped_syllables: Mapping from group ("metasyllable") name to the
            syllable indices it contains. ``None`` means no predefined groups.
        ignore_unknown: When ``False``, every vocabulary index that belongs to
            no group is collected into an extra group named ``"unknown"``.
            When ``True``, such indices are skipped entirely.
        include_frequencies: Also emit ``metasyllable_frequency_{name}``.

    Returns:
        Dict of lists with one entry per recording, in ``results`` iteration order:

        * ``transition_matrix_{a}_{b}`` for every ordered pair of distinct
          groups: the count of consecutive-frame transitions from ``a`` to
          ``b`` divided by all transitions leaving ``a`` for a different group
          (``0.0`` when ``a`` has none). Within-group transitions are discarded.
        * ``metasyllable_frequency_{name}`` (optional): frames whose syllable
          belongs to the group divided by the recording's total frame count.

    Raises:
        ValueError: If a group contains an index outside the vocabulary, or an
            index appears in more than one group.
    """
    if grouped_syllables is None:
        grouped_syllables = {}

    sequences = [info["syllable"] for info in results.values()]
    # default=-1 gives an empty vocabulary instead of a crash when nothing was loaded.
    vocab_size = max((s for seq in sequences for s in seq), default=-1) + 1
    all_indices = set(range(vocab_size))

    seen = set()
    for name, idxs in grouped_syllables.items():
        bad = set(idxs) - all_indices
        if bad:
            raise ValueError(f"Group '{name}' contains invalid indices {sorted(bad)}.")
        if seen.intersection(idxs):
            raise ValueError("Duplicate indices detected across groups.")
        seen.update(idxs)

    if not ignore_unknown:
        unknown = sorted(all_indices - seen)
        if unknown:
            # Copy so the caller's mapping is not modified.
            grouped_syllables = dict(grouped_syllables)
            grouped_syllables["unknown"] = unknown

    names = list(grouped_syllables.keys())
    idx_sets = [set(grouped_syllables[n]) for n in names]
    g = len(names)
    feats = {f"transition_matrix_{a}_{b}": [] for a in names for b in names if a != b}
    if include_frequencies:
        feats.update({f"metasyllable_frequency_{n}": [] for n in names})

    idx_to_group = {}
    for gi, s in enumerate(idx_sets):
        for idx in s:
            idx_to_group[idx] = gi

    # tqdm is the progress bar used by the sibling feature functions; the name
    # `mo` previously used here is never imported in this notebook.
    for _, info in tqdm(results.items()):
        seq = info["syllable"]
        G = np.zeros((g, g), dtype=float)
        for a, b in zip(seq[:-1], seq[1:]):
            if a in idx_to_group and b in idx_to_group:
                G[idx_to_group[a], idx_to_group[b]] += 1

        # Drop self-transitions, then normalise each row that has any mass left.
        np.fill_diagonal(G, 0)
        row_sums = G.sum(axis=1, keepdims=True)
        np.divide(G, row_sums, out=G, where=row_sums != 0)

        for i, ai in enumerate(names):
            for j, bj in enumerate(names):
                if ai != bj:
                    feats[f"transition_matrix_{ai}_{bj}"].append(G[i, j])

        if include_frequencies:
            counts = np.zeros(g, dtype=int)
            for s in seq:
                if s in idx_to_group:
                    counts[idx_to_group[s]] += 1
            total_tokens = len(seq)
            # Always emit floats so a column never mixes int zeros with fractions.
            freqs = counts / total_tokens if total_tokens else counts.astype(float)
            for i, name in enumerate(names):
                feats[f"metasyllable_frequency_{name}"].append(freqs[i])
    return feats


# nature-aging_634
# _metasyllable_groupings = {
#     "kpms_dendrogram_0": [0, 2, 10, 54, 35, 9, 30, 16, 26, 20, 6, 15],
#     "kpms_dendrogram_1": [24, 42, 52, 50, 48, 57, 33, 38, 60, 12, 58, 22, 43],
#     "kpms_dendrogram_2": [19, 59, 1, 3, 14, 18, 34, 5, 7, 46, 40, 4, 11, 45],
#     "kpms_dendrogram_3": [13, 8, 17, 39, 51, 21, 36, 61, 31, 49, 28, 44, 55, 37, 25, 32, 27, 56],
#     "kpms_dendrogram_4": [53, 62, 29, 41, 23, 47]
# }

# geroscience_492
# _metasyllable_groupings = {
#     "kpms_dendogram_0": [12, 20, 28, 14, 26],
#     "kpms_dendogram_1": [33, 23, 39, 13, 3, 11, 18],
#     "kpms_dendogram_2": [24, 9, 6, 25, 15, 21, 16, 35, 2, 10, 17],
#     "kpms_dendogram_3": [4, 34, 22, 30, 27, 29, 32, 19],
#     "kpms_dendogram_4": [5, 7, 8, 43, 55, 0, 1, 31],
# }

# combined_1126
# _metasyllable_groupings = {
#     "kpms_dendrogram_0": [41, 11, 23, 39, 22, 37, 28, 32, 5, 34, 1, 31, 20, 13, 25],
#     "kpms_dendrogram_1": [45, 46],
#     "kpms_dendrogram_2": [44, 50, 4, 2, 18, 53, 24, 8, 35, 14, 15, 10, 17, 26, 30, 7, 43, 9, 42, 48, 6, 29],
#     "kpms_dendrogram_3": [47, 27, 36],
#     "kpms_dendrogram_4": [40, 21, 12, 33, 16, 0, 3, 19, 38],
# }

# kpms_dendrogram_metasyllable_transition_matrix = _get_metasyllable_transition_matrix(_metasyllable_groupings, ignore_unknown=True, include_frequencies=False)

In [46]:
def _merge_features(
    features: Sequence[Dict[str, Sequence[Any]]]
) -> pd.DataFrame:
    """Combine per-recording feature dicts into one DataFrame with a ``name`` column.

    Args:
        features: Feature dicts (column name -> one value per recording). Any
            sequence is accepted, not only a list. The dicts are merged left to
            right, so a key that appears more than once keeps the value from the
            last dict containing it (this includes a key called ``"name"``).

    Returns:
        DataFrame whose ``name`` column holds the keys of the global ``results``
        with a trailing ``".csv"`` removed, followed by the merged feature columns.
    """
    names = [key.removesuffix(".csv") for key in results.keys()]
    # Seed the fold with the name column; `|` builds new dicts, inputs stay untouched.
    merged_features = reduce(operator.or_, features, {"name": names})
    return pd.DataFrame(merged_features)

# One row per recording: the "name" column plus every column of the dicts below.
# Both the current and the legacy syllable features are kept so that either
# X category defined later can be selected from the same matrix.
unsupervised_features_df = _merge_features([
    latent_embedding_statistics, 
    syllable_frequency_statistics,
    old_syllable_frequency_statistics,
    # kpms_dendrogram_metasyllable_transition_matrix
])
unsupervised_features_df

Unnamed: 0,name,latent_embedding_mean_0,latent_embedding_mean_1,latent_embedding_mean_2,latent_embedding_mean_3,latent_embedding_mean_4,latent_embedding_mean_5,latent_embedding_mean_6,latent_embedding_mean_7,latent_embedding_mean_8,...,syllable_frequency_85,syllable_frequency_86,syllable_frequency_87,syllable_frequency_88,syllable_frequency_89,syllable_frequency_90,syllable_frequency_91,syllable_frequency_92,syllable_frequency_93,syllable_frequency_94
0,Batch2__LL1-B6__2023-07-06_MFS__DO2271_DO_F_25075,-0.575984,0.622953,0.054566,-0.109040,0.030229,-0.087828,-0.460101,0.269448,-0.325282,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000065,0.010458,0.0,0.0,0.0
1,Batch2__LL1-B6__2023-07-06_MFS__DO2272_DO_F_25076,-0.332595,0.718123,-0.176037,0.019785,-0.077807,0.145618,-0.355610,0.348274,-0.016595,...,0.000231,0.000000,0.000000,0.0,0.000000,0.000037,0.005594,0.0,0.0,0.0
2,Batch2__LL1-B6__2023-07-06_MFS__DO2273_DO_F_25077,-0.049419,0.474470,0.128843,-0.163383,-0.024935,-0.024942,-0.161158,0.126137,-0.072999,...,0.002977,0.000000,0.000000,0.0,0.000000,0.000083,0.000767,0.0,0.0,0.0
3,Batch2__LL1-B6__2023-07-06_MFS__DO2274_DO_F_25078,3.364841,-4.972226,-34.006750,-23.399518,-6.632558,-2.772867,1.163762,0.788437,-2.158905,...,0.012843,0.000000,0.000009,0.0,0.000000,0.000028,0.000000,0.0,0.0,0.0
4,Batch2__LL1-B6__2023-07-06_MFS__DO2331_DO_M_25047,-0.284884,0.271840,0.191407,-0.125589,0.116700,-0.188117,-0.164489,0.010362,-0.286379,...,0.001008,0.000000,0.000028,0.0,0.000000,0.000074,0.104920,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1121,NV15-B7B8__2021-01-25__DO-40-2107,0.421110,-0.888423,0.039449,-0.183078,0.033054,0.087944,-0.107629,0.072146,0.141790,...,0.000000,0.000055,0.000018,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
1122,NV15-B7B8__2021-01-25__DO-40-2117,-0.020992,-0.346748,-0.244720,0.038690,0.027558,0.004309,0.160947,0.077125,-0.027762,...,0.001738,0.000860,0.000018,0.0,0.000102,0.000055,0.000361,0.0,0.0,0.0
1123,NV15-B7B8__2021-01-25__DO-40-2126,0.469172,-0.300893,0.202119,-0.068729,0.006088,-0.048926,0.097555,-0.071305,0.139040,...,0.000971,0.000157,0.000000,0.0,0.000000,0.000000,0.000333,0.0,0.0,0.0
1124,NV15-B7B8__2021-02-03__DO-20-1188,0.503000,-0.695000,0.016938,-0.062795,-0.054958,0.015161,0.234934,0.035365,0.187887,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0


In [47]:
# Attach the per-recording metadata to the unsupervised features with an inner
# join on "name"; the result is expected to keep the same number of rows.
metadata_df = pd.read_csv(adj_metadata_path)
metadata_unsupervised_features_df = pd.merge(
    metadata_df,
    unsupervised_features_df,
    how="inner",
    on="name",
)
metadata_unsupervised_features_df

Unnamed: 0,name,mouse_id,sex,batch,tester,age,fi,weight,diet,strain,...,syllable_frequency_85,syllable_frequency_86,syllable_frequency_87,syllable_frequency_88,syllable_frequency_89,syllable_frequency_90,syllable_frequency_91,syllable_frequency_92,syllable_frequency_93,syllable_frequency_94
0,LL1-B2B__2019-12-24_SPD__LL1-1_AgedB6-0420,AgedB6-0420,Male,Batch1,Hannah,55.000000,7.287800,49.00,AL,B6,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,LL1-B2B__2020-01-02_SPD__LL1-1_AgedB6-0744,AgedB6-0744,Male,Batch1,Mackenzie,29.000000,4.320384,40.43,AL,B6,...,0.001914,0.000000,0.000000,0.000000,0.000028,0.000009,0.000000,0.0,0.0,0.0
2,LL1-B2B__2020-01-02_SPD__LL1-4_AgedB6-0746,AgedB6-0746,Female,Batch1,Mackenzie,28.000000,2.320384,27.53,AL,B6,...,0.000074,0.000000,0.000000,0.000000,0.000000,0.000000,0.000878,0.0,0.0,0.0
3,LL1-B2B__2020-06-16_SPD__AgedB6-0411,AgedB6-0411,Female,Batch1,Hannah,80.000000,5.787800,40.01,AL,B6,...,0.000000,0.000000,0.000213,0.000971,0.000361,0.000712,0.001248,0.0,0.0,0.0
4,LL1-B2B__2020-06-17_SPD__AgedB6-0420,AgedB6-0420,Male,Batch1,Hannah,80.000000,8.787800,55.85,AL,B6,...,0.000065,0.000000,0.000000,0.000000,0.000000,0.000000,0.001859,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1121,Batch2__LL4-B6__2023-07-10_MFS__DO2305_DO_F_25109,DO-2305,Female,New,Sean Deats,25.857143,3.348214,45.21,AL,DO,...,0.001045,0.000000,0.000000,0.000000,0.000000,0.000000,0.039603,0.0,0.0,0.0
1122,Batch2__LL5-B6__2023-07-06_MFS__DO2348_DO_M_25063,DO-2348,Male,New,Sean Deats,25.285714,2.570436,37.13,AL,DO,...,0.002053,0.000028,0.000000,0.000000,0.000000,0.000000,0.001581,0.0,0.0,0.0
1123,Batch2__LL5-B6__2023-07-10_MFS__DO2307_DO_F_25111,DO-2307,Female,New,Sean Deats,25.857143,2.181547,40.38,AL,DO,...,0.001248,0.000000,0.000000,0.000000,0.000000,0.000037,0.003283,0.0,0.0,0.0
1124,Batch2__LL1-B6__2023-07-06_MFS__DO2334_DO_M_25050,DO-2334,Male,New,Sean Deats,25.285714,2.959325,43.94,AL,DO,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.007111,0.0,0.0,0.0


In [48]:
supervised_features_df = pd.read_csv(supervised_features_path)

# Everything that is not an identifier/metadata column counts as a supervised feature.
supervised_columns = [
    col
    for col in supervised_features_df.columns
    if col not in (
        "NetworkFilename", "PoseFilename", "Batch", "Tester", "AgeGroup",
        "MouseID", "Strain", "Diet", "Weight", "Sex", "AgeW", "AgeAtVid",
        "CFI_norm", "FLL", "score",
    )
]

# Last "__"-separated segment of each known recording name -> full recording name.
_filename_to_name = {
    full_name.split("__")[-1]: full_name
    for full_name in metadata_unsupervised_features_df["name"]
}

# Rows whose PoseFilename starts with "/" are resolved through the lookup above
# (pd.NA when unknown); all other rows derive the name from NetworkFilename.
_names = [
    _filename_to_name.get(pose_file[1:], pd.NA)
    if pose_file.startswith("/")
    else network_file.removesuffix(".avi").replace("/", "__")
    for pose_file, network_file in zip(
        supervised_features_df["PoseFilename"],
        supervised_features_df["NetworkFilename"],
    )
]
supervised_features_df = supervised_features_df.assign(name=_names).drop_duplicates()

### remove the one duplicated recording's unused row
_mask = (
    (supervised_features_df["name"] == "LL3-B2B__2020-01-02_SPD__LL3-4_AgedB6-0842")
    & (supervised_features_df["AgeW"] == 20)
)

# Keep the remaining rows and only the name plus the feature columns.
supervised_features_df = supervised_features_df.loc[
    ~_mask, ["name"] + supervised_columns
].copy()

supervised_features_df

Unnamed: 0,name,median_angular_velocity,median_base_tail_lateral_displacement,median_limb_duty_factor,median_nose_lateral_displacement,median_speed_cm_per_sec,median_step_length1,median_step_width,median_stride_length,median_tip_tail_lateral_displacement,...,grooming_duration_secs,Rearing_supported_T5,Rearing_supported_T20,Rearing_supported_T55,Rearing_unsupported_T5,Rearing_unsupported_T20,Rearing_unsupported_T55,Grooming_T5,Grooming_T20,Grooming_T55
0,LL1-B2B__2019-09-04_SPD__LL1-1_AgedB6-0396,-2.019110,0.166338,0.571429,0.120898,15.546574,1.273532,2.977061,3.979775,0.507276,...,189.58934,0.092778,0.051472,0.046495,0.002333,0.016333,0.017051,0.006111,0.020250,0.052576
1,LL1-B2B__2019-09-04_SPD__LL1-2_AgedB6-0394,-3.093400,0.159374,0.625000,0.115014,15.398583,2.125433,3.079423,4.229126,0.446310,...,84.87922,0.072111,0.059944,0.057899,0.001444,0.007528,0.019929,0.004111,0.018278,0.019586
2,LL1-B2B__2019-09-04_SPD__LL1-2_AgedB6-0686,1.733777,0.195995,0.562500,0.098389,15.339029,2.343707,3.302769,4.085287,0.534684,...,121.90750,0.042778,0.057583,0.046131,0.004667,0.013056,0.018707,0.012444,0.015917,0.041778
3,LL1-B2B__2019-09-04_SPD__LL1-3_AgedB6-0404,-5.665217,0.192360,0.611111,0.118556,15.818149,2.454813,2.802681,5.051089,0.492250,...,113.25598,0.063889,0.063944,0.063909,0.001111,0.003361,0.007455,0.006889,0.007472,0.022121
4,LL1-B2B__2019-09-04_SPD__LL1-3_AgedB6-0687,-2.532764,0.177434,0.583333,0.110600,12.769344,2.404231,2.961535,3.352435,0.523673,...,34.00826,0.039778,0.030028,0.026727,0.001778,0.000528,0.003081,0.009778,0.009806,0.007515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134,Batch2__LL4-B6__2023-07-10_MFS__DO2305_DO_F_25109,-3.051629,0.120370,0.603896,0.078928,14.422121,3.547365,3.107371,6.821204,0.186639,...,65.20002,0.024556,0.020306,0.016343,0.000556,0.000417,0.000152,0.000000,0.014639,0.016939
1135,Batch2__LL5-B6__2023-07-06_MFS__DO2348_DO_M_25063,2.133048,0.162071,0.555556,0.092367,21.328195,2.723941,3.100602,6.138987,0.381408,...,125.46667,0.026333,0.032861,0.070576,0.000000,0.000000,0.001980,0.010778,0.027917,0.047323
1136,Batch2__LL5-B6__2023-07-10_MFS__DO2307_DO_F_25111,-11.242211,0.141575,0.583333,0.128946,16.562525,1.253099,2.968717,5.324403,0.485926,...,0.00000,0.091000,0.079083,0.064313,0.000000,0.004917,0.004071,0.023778,0.007806,0.108727
1137,Batch2__LL1-B6__2023-07-06_MFS__DO2334_DO_M_25050,-2.269023,0.150810,0.562500,0.094270,21.130466,3.073845,2.817691,5.330215,0.260465,...,116.33336,0.006333,0.006528,0.004667,0.006333,0.006222,0.004556,0.008111,0.020667,0.013303


In [49]:
# Inner join on "name": only recordings present in both tables survive.
features_df = pd.merge(
    metadata_unsupervised_features_df,
    supervised_features_df,
    how="inner",
    on="name",
)
features_df

Unnamed: 0,name,mouse_id,sex,batch,tester,age,fi,weight,diet,strain,...,grooming_duration_secs,Rearing_supported_T5,Rearing_supported_T20,Rearing_supported_T55,Rearing_unsupported_T5,Rearing_unsupported_T20,Rearing_unsupported_T55,Grooming_T5,Grooming_T20,Grooming_T55
0,LL1-B2B__2019-12-24_SPD__LL1-1_AgedB6-0420,AgedB6-0420,Male,Batch1,Hannah,55.000000,7.287800,49.00,AL,B6,...,68.20147,0.047222,0.059278,0.045596,0.002000,0.002111,0.004485,0.018111,0.014250,0.024364
1,LL1-B2B__2020-01-02_SPD__LL1-1_AgedB6-0744,AgedB6-0744,Male,Batch1,Mackenzie,29.000000,4.320384,40.43,AL,B6,...,170.43687,0.088111,0.076472,0.070101,0.011222,0.057806,0.114293,0.022778,0.024194,0.049485
2,LL1-B2B__2020-01-02_SPD__LL1-4_AgedB6-0746,AgedB6-0746,Female,Batch1,Mackenzie,28.000000,2.320384,27.53,AL,B6,...,148.81530,0.044444,0.048361,0.045515,0.008111,0.009500,0.015737,0.016556,0.017889,0.018061
3,LL1-B2B__2020-06-16_SPD__AgedB6-0411,AgedB6-0411,Female,Batch1,Hannah,80.000000,5.787800,40.01,AL,B6,...,69.73327,0.029667,0.018194,0.031596,0.001333,0.019694,0.024646,0.024778,0.016806,0.016273
4,LL1-B2B__2020-06-17_SPD__AgedB6-0420,AgedB6-0420,Male,Batch1,Hannah,80.000000,8.787800,55.85,AL,B6,...,72.26663,0.017667,0.024333,0.028747,0.005333,0.002417,0.005384,0.012333,0.005833,0.017707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113,Batch2__LL4-B6__2023-07-10_MFS__DO2305_DO_F_25109,DO-2305,Female,New,Sean Deats,25.857143,3.348214,45.21,AL,DO,...,65.20002,0.024556,0.020306,0.016343,0.000556,0.000417,0.000152,0.000000,0.014639,0.016939
1114,Batch2__LL5-B6__2023-07-06_MFS__DO2348_DO_M_25063,DO-2348,Male,New,Sean Deats,25.285714,2.570436,37.13,AL,DO,...,125.46667,0.026333,0.032861,0.070576,0.000000,0.000000,0.001980,0.010778,0.027917,0.047323
1115,Batch2__LL5-B6__2023-07-10_MFS__DO2307_DO_F_25111,DO-2307,Female,New,Sean Deats,25.857143,2.181547,40.38,AL,DO,...,0.00000,0.091000,0.079083,0.064313,0.000000,0.004917,0.004071,0.023778,0.007806,0.108727
1116,Batch2__LL1-B6__2023-07-06_MFS__DO2334_DO_M_25050,DO-2334,Male,New,Sean Deats,25.285714,2.959325,43.94,AL,DO,...,116.33336,0.006333,0.006528,0.004667,0.006333,0.006222,0.004556,0.008111,0.020667,0.013303


In [50]:
# Column groups ("X categories") that name which feature columns belong together.
_all_unsupervised_columns = [
    *latent_embedding_statistics,
    *syllable_frequency_statistics,
    # *kpms_dendrogram_metasyllable_transition_matrix,
]

_old_all_unsupervised_columns = [
    *latent_embedding_statistics,
    *old_syllable_frequency_statistics,
    # *kpms_dendrogram_metasyllable_transition_matrix,
]

Xcats = {}
Xcats[project_name] = _all_unsupervised_columns
Xcats[f"{project_name}__old"] = _old_all_unsupervised_columns
Xcats["supervised"] = supervised_columns
Xcats.keys()

dict_keys(['2025-09-20_kpms-v5_150_6', '2025-09-20_kpms-v5_150_6__old', 'supervised'])

In [51]:
# features_df.drop("fll", axis=1, inplace=True)
# Copy of the rows that contain at least one missing value.
rows_with_na = features_df[features_df.isna().any(axis=1)].copy()

# For each of those rows, list the names of the columns that are missing.
rows_with_na["na_cols"] = (
    rows_with_na
    .isna()
    .apply(lambda r: [c for c, is_na in r.items() if is_na], axis=1)
)

rows_with_na["na_cols"]

0      [fll]
1      [fll]
2      [fll]
3      [fll]
4      [fll]
       ...  
621    [fll]
622    [fll]
623    [fll]
624    [fll]
625    [fll]
Name: na_cols, Length: 626, dtype: object

In [52]:
# Persist the merged feature matrix (CSV) and the X-category column lists (JSON).
# Both file names carry today's date and the dataset / project / model identifiers.
_uid = f"{dataset_dir.name}__{project_name}__{model_name}"

current_datetime = datetime.now()
formatted_date = current_datetime.strftime("%Y-%m-%d")

feature_matrix_output_dir  = unsupervised_aging_dir / "data/feature_matrices"
feature_matrix_output_path = feature_matrix_output_dir / f"{formatted_date}_feature-matrix__{_uid}.csv"
xcats_output_path          = feature_matrix_output_dir / f"{formatted_date}_xcats__{_uid}.json"

# Create the output directory on first use so the writes below cannot fail on a missing folder.
feature_matrix_output_dir.mkdir(parents=True, exist_ok=True)

# to_csv keeps its default index=True, so the row index is written as the first column.
features_df.to_csv(feature_matrix_output_path)
with xcats_output_path.open("w") as f:
    json.dump(Xcats, f, indent=2)

print(f"wrote feature matrix to  `{feature_matrix_output_path}`")
print(f"wrote X category JSON to `{xcats_output_path}`")
print(f"X category keys: {Xcats.keys()}")

wrote feature matrix to  `/projects/kumar-lab/miaod/projects/unsupervised-aging/data/feature_matrices/2025-10-06_feature-matrix__combined_1126__2025-09-20_kpms-v5_150_6__2025-09-21_model-1.csv`
wrote X category JSON to `/projects/kumar-lab/miaod/projects/unsupervised-aging/data/feature_matrices/2025-10-06_xcats__combined_1126__2025-09-20_kpms-v5_150_6__2025-09-21_model-1.json`
X category keys: dict_keys(['2025-09-20_kpms-v5_150_6', '2025-09-20_kpms-v5_150_6__old', 'supervised'])
