In [None]:
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Input and output directories, relative to the notebook's working directory.
data_dir = Path("data/")
output_dir = Path("outputs/")

# Files read further down: two CSV tables and a plain-text list of video paths.
age_md_path = data_dir.joinpath("agedB6_mice_meta.csv")
fi_md_path = data_dir.joinpath("df_video_features.csv")
video_list_path = data_dir.joinpath("B6J_good_videos_batch_file.data")

In [None]:
# Read the table that carries the DOB / TestDate columns used for age computation,
# then echo its column index and (rows, columns) shape as a quick sanity check.
age_md = pd.read_csv(filepath_or_buffer=age_md_path)
print(age_md.columns, age_md.shape)

In [None]:
# Read the per-video feature table (later joined to age_md on MouseID),
# then echo its column index and (rows, columns) shape as a quick sanity check.
fi_md = pd.read_csv(filepath_or_buffer=fi_md_path)
print(fi_md.columns, fi_md.shape)

In [None]:
# Parse the date-of-birth and test-date strings; both use the two-digit-year
# month/day/year layout.
age_md["DOB_dt"] = pd.to_datetime(age_md["DOB"], format="%m/%d/%y")
age_md["TestDate_dt"] = pd.to_datetime(age_md["TestDate"], format="%m/%d/%y")

# Age at test time in whole days. Subtracting the two datetime columns is
# vectorized and, unlike a row-wise DataFrame.apply(axis=1), still yields a
# Series when the frame happens to be empty.
age_md["Age_days"] = (age_md["TestDate_dt"] - age_md["DOB_dt"]).dt.days

# Coarser ages by floor division: 7-day weeks and nominal 30-day months.
age_md["Age_weeks"] = age_md["Age_days"] // 7
age_md["Age_months"] = age_md["Age_days"] // 30

In [None]:
### keep only mice present in both tables

# Inner join on the shared MouseID key; overlapping non-key columns get the
# default _x (age_md) / _y (fi_md) suffixes.
merged_md = age_md.merge(fi_md, how="inner", on="MouseID")

# Keep rows where the age computed from the dates is within one week of the
# recorded TestAge (presumably also expressed in weeks — confirm).
mask = (merged_md["Age_weeks"] - merged_md["TestAge"]).abs() <= 1
md = merged_md.loc[mask].copy()
print(md.columns, md.shape)

In [None]:
# Drop the final file extension from each NetworkFilename to get the stem that
# the derived file names are built from. Series.map touches only the column it
# needs and, unlike DataFrame.apply(axis=1), keeps working on an empty frame.
md["path_head"] = md["NetworkFilename"].map(lambda name: Path(name).with_suffix(""))

# Overlay video path: stem + "_trimmed_overlay.mp4".
md["video_path"] = md["path_head"].map(lambda head: f"{head}_trimmed_overlay.mp4")

# Pose-estimation path: stem + "__trimmed_overlay_pose_est_v6.h5".
# NOTE(review): this suffix starts with a double underscore while the video
# suffix uses a single one — confirm that matches the files on disk.
md["pose_path"] = md["path_head"].map(lambda head: f"{head}__trimmed_overlay_pose_est_v6.h5")

In [None]:
# Eyeball the derived video paths before matching them against the video list.
print(md.loc[:, "video_path"])

In [None]:
### keep only entries whose video path appears in the video list file

# Every line of the list file is stripped of surrounding whitespace and then
# has its first 58 characters discarded before comparison with md["video_path"].
# NOTE(review): 58 is presumably the length of a fixed path prefix in that
# file — confirm against the file's contents.
with video_list_path.open("r") as list_file:
    videos = [line.strip()[58:] for line in list_file]
print(videos[:5])

listed = md["video_path"].isin(videos)
md = md.loc[listed].copy()
print(md.columns, md.shape)

In [None]:
# Columns carried into the condensed metadata file. "Sex_y" is the Sex column
# that came from the right-hand (fi_md) side of the merge.
sub_md_cols = ["pose_path", "MouseID", "Sex_y", "Age_weeks", "score"]

# Select those columns and give them short names; "age" is the age in weeks
# and "fi" is the score column.
sub_md = md[sub_md_cols].rename(columns={
    "MouseID": "mouse_id",
    "Sex_y": "sex",
    "Age_weeks": "age",
    "score": "fi",
})

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a fresh checkout may not have it).
output_dir.mkdir(parents=True, exist_ok=True)
sub_md.to_csv(output_dir / "condensed_metadata.csv", index=False)

In [None]:
# Number of rows per age in months, ordered by age.
age_counts = md["Age_months"].value_counts().sort_index()
age_max = max(md["Age_months"])

# Tick every 6 months from 0 up to the oldest age present.
age_ticks = np.arange(0, age_max + 1, 6)

# The same bars are drawn twice: first as raw counts, then as counts divided
# by their total so the bar heights sum to 1.
age_panels = (
    (age_counts.values, "Number of Mice"),
    (age_counts.values / age_counts.values.sum(), "Frequency"),
)
for bar_heights, y_label in age_panels:
    fig, ax = plt.subplots()
    ax.bar(age_counts.index, bar_heights)
    ax.set_xlabel("Age (months)")
    ax.set_xticks(age_ticks)
    ax.set_ylabel(y_label)
    plt.show()

In [None]:
# Number of rows per distinct score value, ordered by score.
fi_counts = md["score"].value_counts().sort_index()

fig, ax = plt.subplots()
ax.bar(fi_counts.index, fi_counts.values)
ax.set_xlabel("Biological Age (Frailty)")
ax.set_ylabel("Number of Mice")
plt.show()

In [None]:
n_score_bins = 5

# Quantile-bin the score column into integer codes. Bins whose edges coincide
# are dropped, so fewer than n_score_bins distinct codes may come back.
md['score_bin'] = pd.qcut(md['score'], q=n_score_bins, labels=False, duplicates='drop')

# One stratum label per row: the (score_bin, Sex_y) pair, kept as a tuple.
md['strata'] = [pair for pair in zip(md['score_bin'], md['Sex_y'])]

# A single seeded stratified shuffle-split; its "test" side is fixed at 250
# rows and is the sample that gets kept.
sss = StratifiedShuffleSplit(n_splits=1, test_size=250, random_state=42)

# n_splits=1, so the first (and only) split is taken directly; the train-side
# indices are not used.
_, test_idx = next(sss.split(md, md['strata']))
stratified_subset = md.iloc[test_idx].reset_index(drop=True)
stratified_subset.head(3)

In [None]:
# Rows per stratum in the sampled subset; the tuple labels are converted to
# strings so they can serve as categorical bar positions.
bucket_counts = stratified_subset["strata"].astype(str).value_counts().sort_index()

fig, ax = plt.subplots()
ax.bar(bucket_counts.index, bucket_counts.values)
ax.set_xlabel("Bucket")
ax.set_ylabel("Number of Mice")
plt.show()

In [None]:
# Row count of the sampled subset (the split above requests 250).
stratified_subset.shape[0]

In [None]:
# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists first.
output_dir.mkdir(parents=True, exist_ok=True)

# Write one newline-separated list per path column of the sampled subset.
# The joined text is a single string, so it is written with write();
# writelines() would walk it one character at a time. No trailing newline is
# added, so the files are byte-for-byte what the previous version produced.
path_exports = (
    ("video_path", "good_video_paths.txt"),
    ("pose_path", "good_pose_paths.txt"),
)
for column, filename in path_exports:
    with open(output_dir / filename, "w") as f:
        f.write("\n".join(stratified_subset[column]))