In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import statsmodels.api as sm
import umap

from pathlib import Path

In [None]:
# Base directories, relative to the notebook's working directory:
# data_dir is where videos_df.pkl is read from, output_dir is where
# condensed_metadata.csv is read from.
data_dir, output_dir = map(Path, ("data/", "outputs/"))

In [None]:
# Load the pickled videos table and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load videos_df.pkl if it comes from a trusted source.
videos_df = pd.read_pickle(data_dir.joinpath("videos_df.pkl"))
videos_df.head(n=5)

In [None]:
# Inspect the video-side "name" column (used as the merge key further down).
videos_df.loc[:, "name"]

In [None]:
# Load the condensed metadata table, print its (rows, columns) shape and
# preview the first rows.
metadata_df = pd.read_csv(output_dir.joinpath("condensed_metadata.csv"))
print(tuple(metadata_df.shape))
metadata_df.head(n=5)

In [None]:
# Show one raw pose_path value (row label 0) to see the format that
# pose_to_csv_name has to parse.
metadata_df.at[0, "pose_path"]

In [None]:
import re
import pandas as pd

def pose_to_csv_name(pose_path: str) -> str:
    """Build the CSV-style name that corresponds to a pose file path.

    The first '/'-separated component of ``pose_path`` is taken as the
    category and the last component as the file name. The file name loses
    its final extension (everything after the last '.') and every
    ``__trimmed_overlay`` marker; intermediate path components are ignored.

    Args:
        pose_path: '/'-separated path with at least two components.

    Returns:
        ``"<category>_<file name>.csv"``.

    Raises:
        ValueError: if ``pose_path`` has fewer than two '/'-separated
            components.
    """
    # Star-unpacking keeps only the two ends of the path and raises
    # ValueError when there is no '/' at all.
    category, *_middle, leaf = pose_path.split('/')
    # rsplit always yields at least one piece; the first is the name
    # without its last extension.
    stem, *_extension = leaf.rsplit('.', 1)
    stem = re.sub(r'__trimmed_overlay', '', stem)
    return category + '_' + stem + '.csv'

# Metadata side of the join key: map every pose_path to the CSV-style name
# built by pose_to_csv_name. assign() returns a new frame, so the object
# loaded earlier is not modified in place.
metadata_df = metadata_df.assign(
    name=metadata_df['pose_path'].apply(pose_to_csv_name)
)

# Video side of the join key: cast to str so both key columns hold strings.
videos_df = videos_df.assign(name=videos_df['name'].astype(str))

# Inner join keeps only names present in both tables. Other columns shared
# by the two frames get the _meta / _video suffixes, and the raw pose_path
# column is dropped from the result.
merged_df = pd.merge(
    metadata_df,
    videos_df,
    on='name',
    how='inner',
    suffixes=('_meta', '_video'),
).drop(columns=['pose_path'])

merged_df

In [None]:
# Dataset dimensions: one row per merged video, and the length of the first
# row's "freqs" entry as the number of syllables.
num_videos = merged_df.shape[0]
num_syllables = len(merged_df.loc[0, "freqs"])
print(num_videos, num_syllables)

In [None]:
# Linear fit of every syllable frequency against FI, drawn as one panel per
# syllable and ordered from the lowest to the highest fit RMSE.
x = merged_df["fi"].values
# Stack the per-video frequency vectors into a (num_videos, num_syllables)
# matrix once, instead of re-running Series.apply for each syllable in both
# loops. Assumes every "freqs" entry has the same length -- TODO confirm.
freq_matrix = np.vstack(merged_df["freqs"].values)

# RMSE of a degree-1 least-squares fit, one value per syllable index.
rmse_by_idx = []
for i in range(num_syllables):
    y = freq_matrix[:, i]
    rmse = np.sqrt(np.mean((y - np.polyval(np.polyfit(x, y, 1), x)) ** 2))
    rmse_by_idx.append(rmse)

sorted_idx = np.argsort(rmse_by_idx)

# Near-square grid with at least num_syllables panels.
n_cols = math.ceil(math.sqrt(num_syllables))
n_rows = math.ceil(num_syllables / n_cols)
# squeeze=False: for a 1x1 grid plt.subplots otherwise returns a bare Axes
# rather than an array, which breaks the flattening below when there is a
# single syllable.
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(4 * n_cols, 4 * n_rows),
    constrained_layout=True,
    squeeze=False,
)
axes = axes.ravel()

# The design matrix and the prediction grid depend only on x, so they are
# built once instead of once per panel.
X = sm.add_constant(x)
xp = np.linspace(x.min(), x.max(), 100)
Xp = sm.add_constant(xp)

for rank, idx in enumerate(sorted_idx):
    ax = axes[rank]
    y = freq_matrix[:, idx]
    model = sm.OLS(y, X).fit()
    pred = model.get_prediction(Xp)
    ci = pred.conf_int()  # interval from conf_int() with its default arguments
    y_pred = pred.predicted_mean
    rmse = rmse_by_idx[idx]
    ax.scatter(x, y)
    ax.plot(xp, y_pred)
    ax.fill_between(xp, ci[:, 0], ci[:, 1], alpha=0.2)
    ax.text(0.05, 0.95, f"RMSE = {rmse:.3f}", transform=ax.transAxes, ha="left", va="top")
    ax.set_title(f"Freq[{idx}]")
    ax.set_xlabel("FI")
    ax.set_ylabel("Value")

# Remove the unused panels at the end of the grid.
for unused_ax in axes[num_syllables:]:
    fig.delaxes(unused_ax)

plt.show()

In [None]:
# Linear fit of every syllable frequency against age, drawn as one panel per
# syllable and ordered from the lowest to the highest fit RMSE.
x = merged_df["age"].values
# Stack the per-video frequency vectors into a (num_videos, num_syllables)
# matrix once, instead of re-running Series.apply for each syllable in both
# loops. Assumes every "freqs" entry has the same length -- TODO confirm.
freq_matrix = np.vstack(merged_df["freqs"].values)

# RMSE of a degree-1 least-squares fit, one value per syllable index.
rmse_by_idx = []
for i in range(num_syllables):
    y = freq_matrix[:, i]
    rmse = np.sqrt(np.mean((y - np.polyval(np.polyfit(x, y, 1), x)) ** 2))
    rmse_by_idx.append(rmse)

sorted_idx = np.argsort(rmse_by_idx)

# Near-square grid with at least num_syllables panels.
n_cols = math.ceil(math.sqrt(num_syllables))
n_rows = math.ceil(num_syllables / n_cols)
# squeeze=False: for a 1x1 grid plt.subplots otherwise returns a bare Axes
# rather than an array, which breaks the flattening below when there is a
# single syllable.
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(4 * n_cols, 4 * n_rows),
    constrained_layout=True,
    squeeze=False,
)
axes = axes.ravel()

# The design matrix and the prediction grid depend only on x, so they are
# built once instead of once per panel.
X = sm.add_constant(x)
xp = np.linspace(x.min(), x.max(), 100)
Xp = sm.add_constant(xp)

for rank, idx in enumerate(sorted_idx):
    ax = axes[rank]
    y = freq_matrix[:, idx]
    model = sm.OLS(y, X).fit()
    pred = model.get_prediction(Xp)
    ci = pred.conf_int()  # interval from conf_int() with its default arguments
    y_pred = pred.predicted_mean
    rmse = rmse_by_idx[idx]
    ax.scatter(x, y)
    ax.plot(xp, y_pred)
    ax.fill_between(xp, ci[:, 0], ci[:, 1], alpha=0.2)
    ax.text(0.05, 0.95, f"RMSE = {rmse:.3f}", transform=ax.transAxes, ha="left", va="top")
    ax.set_title(f"Freq[{idx}]")
    ax.set_xlabel("Age")
    ax.set_ylabel("Value")

# Remove the unused panels at the end of the grid.
for unused_ax in axes[num_syllables:]:
    fig.delaxes(unused_ax)

plt.show()

In [None]:
# 2-D UMAP embedding of the per-video frequency vectors, coloured by a
# metadata column.
X = np.vstack(merged_df["freqs"].values)

# UMAP embedding (fixed random_state so the layout is reproducible)
embedding = umap.UMAP(random_state=42).fit_transform(X)

# The colouring column and its display label are defined together so the
# title and colorbar always describe the data that is actually plotted.
# To colour by FI instead, set: color_col, color_label = "fi", "FI"
color_col, color_label = "age", "Age"
color_values = merged_df[color_col].values

fig, ax = plt.subplots(figsize=(6, 6))
sc = ax.scatter(embedding[:, 0], embedding[:, 1], c=color_values)
ax.set_xlabel("UMAP1")
ax.set_ylabel("UMAP2")
ax.set_title(f"UMAP of Frequency Features (colored by {color_label})")
fig.colorbar(sc, ax=ax, label=color_label)
fig.tight_layout()
plt.show()