In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap

from pathlib import Path
from typing import Callable, Iterable, Sequence

In [None]:
# Input directory: the pickled feature block is read from here.
data_dir = Path("data/")
# Directory of derived files: the condensed metadata CSV is read from here.
output_dir = Path("outputs/")

In [None]:
import pickle
# The pickle unpacks into three objects: the feature array plus the labels of
# its rows and of its columns.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open files that come from a trusted source.
with (data_dir / "features_block.pkl").open("rb") as f:
    features_arr, row_labels, col_labels = pickle.load(f)

In [None]:
# Metadata table; its "pose_path" column is later used to build the merge key.
metadata_df = pd.read_csv(output_dir / "condensed_metadata.csv")

In [None]:
# Inspect the pose paths that the metadata rows will be keyed on.
metadata_df["pose_path"]

In [None]:
def feature_label_basename(feature_label: str) -> list[str]:
    """Reduce a feature row label to its list of basename components.

    The first 8 characters and the last ``len("_pose_est_v6.csv")`` characters
    are removed, and the remainder is split on ``"__"``.

    NOTE(review): both ends are stripped purely by length; the label is not
    checked for actually ending in "_pose_est_v6.csv", and what the
    8-character prefix contains is not visible here -- confirm against the
    real labels.

    Parameters
    ----------
    feature_label : str
        A row label from the feature block.

    Returns
    -------
    list[str]
        The "__"-separated components of the trimmed label.
    """
    return feature_label[8:-len("_pose_est_v6.csv")].split("__")
# Sanity check: show the key components extracted from the first row label.
feature_label_basename(row_labels[0])

In [None]:
def metadata_label_basename(metadata_label: str) -> list[str]:
    """Reduce a metadata pose path to its list of basename components.

    The last ``len("__trimmed_overlay_pose_est_v6.h5")`` characters are
    removed and the remainder is split on ``"/"``.

    NOTE(review): the suffix is stripped purely by length; the path is not
    checked for actually ending in "__trimmed_overlay_pose_est_v6.h5".

    Parameters
    ----------
    metadata_label : str
        A pose path taken from the metadata table.

    Returns
    -------
    list[str]
        The "/"-separated components of the trimmed path.
    """
    return metadata_label[:-len("__trimmed_overlay_pose_est_v6.h5")].split("/")
# Sanity check: show the key components extracted from the first pose path.
metadata_label_basename(metadata_df["pose_path"][0])

In [None]:
def merge_features_metadata(
    features_arr:      np.ndarray,
    row_labels:        Sequence[str],
    col_labels:        Sequence[str],
    metadata_df:       pd.DataFrame,
    feature_base_fn:   Callable[[str], Iterable[str]],
    metadata_base_fn:  Callable[[str], Iterable[str]],
    *,
    metadata_col: str = "pose_path"
) -> pd.DataFrame:
    """Join a labelled feature matrix with a metadata table.

    Each feature row and each metadata row is mapped to a tuple key
    (``tuple(feature_base_fn(label))`` / ``tuple(metadata_base_fn(value))``);
    rows whose keys are equal are joined.

    Parameters
    ----------
    features_arr : np.ndarray
        2-D array with one row per entry of ``row_labels`` and one column per
        entry of ``col_labels``.
    row_labels : Sequence[str]
        Label of every row of ``features_arr``.
    col_labels : Sequence[str]
        Name of every column of ``features_arr``.
    metadata_df : pd.DataFrame
        Metadata table; it is copied, not modified.
    feature_base_fn : callable
        Maps a row label to the components of its join key.
    metadata_base_fn : callable
        Maps a ``metadata_col`` value to the components of its join key.
    metadata_col : str, default "pose_path"
        Column of ``metadata_df`` from which the metadata key is derived.

    Returns
    -------
    pd.DataFrame
        Inner join of features and metadata. It contains "feature_label", the
        feature columns, the helper column "basename_key" and the metadata
        columns; metadata columns that clash with a feature column name get
        the suffix "_meta".

    Raises
    ------
    ValueError
        If ``features_arr`` is not 2-D or its shape disagrees with the labels.
    pandas.errors.MergeError
        If several metadata rows share a key (the merge is validated as
        many-to-one).

    Warns
    -----
    UserWarning
        If some feature rows have no matching metadata row; the inner join
        drops them.
    """
    import warnings  # local import keeps this cell self-contained

    # Fail with a clear message instead of an opaque tuple-unpacking error.
    if len(features_arr.shape) != 2:
        raise ValueError(
            f"features_arr must be 2-D, got shape {tuple(features_arr.shape)}"
        )

    n_samples, n_features = features_arr.shape
    if n_samples != len(row_labels):
        raise ValueError(
            f"features_arr has {n_samples} rows, but len(row_labels) == {len(row_labels)}"
        )
    if n_features != len(col_labels):
        raise ValueError(
            f"features_arr has {n_features} columns, but len(col_labels) == {len(col_labels)}"
        )

    feat_df = pd.DataFrame(features_arr, columns=col_labels)
    feat_df.insert(0, "feature_label", row_labels)
    # Tuples (unlike lists) are hashable, so they can serve as merge keys.
    feat_df["basename_key"] = [tuple(feature_base_fn(lbl))
                               for lbl in row_labels]

    # Work on a copy so the caller's metadata frame gains no extra column.
    meta_df = metadata_df.copy()
    meta_df["basename_key"] = meta_df[metadata_col].apply(
        lambda p: tuple(metadata_base_fn(p))
    )

    merged = feat_df.merge(
        meta_df,
        on="basename_key",
        how="inner",
        suffixes=("", "_meta"),
        validate="many_to_one"
    )

    # Many-to-one means each feature row matches at most one metadata row,
    # so any difference in length is feature rows lost to the inner join.
    n_dropped = len(feat_df) - len(merged)
    if n_dropped > 0:
        warnings.warn(
            f"{n_dropped} of {len(feat_df)} feature rows had no matching "
            f"metadata row and were dropped by the inner join",
            UserWarning,
            stacklevel=2,
        )
    return merged

In [None]:
# Join the feature block with the metadata table; rows are matched on the key
# tuples produced by the two basename helpers.
df = merge_features_metadata(
    features_arr,
    row_labels,
    col_labels,
    metadata_df,
    feature_label_basename,
    metadata_label_basename
)

# Preview the merged table.
df.head()

In [None]:
import numpy as np
from scipy.stats import entropy   # part of SciPy

# 1. Grab the syllable-frequency columns.
syll_cols = [c for c in df.columns if c.startswith('syllable_freqs_')]

# 2. Turn each row into a probability vector.
#    (Rows whose frequencies sum to 0 become NaN and give NaN entropy.)
row_totals = df[syll_cols].sum(axis=1)
probs      = df[syll_cols].div(row_totals, axis=0)

# 3. Shannon entropy per row, base 2 (bits), in a single vectorized call.
#    scipy.stats.entropy already counts zero-probability entries as
#    contributing 0, so no epsilon offset is needed; adding one only biases
#    the result.
df['syllable_entropy'] = entropy(probs.to_numpy(dtype=float), base=2, axis=1)

# Peek at the result.
df[['feature_label', 'syllable_entropy']].head()

In [None]:
# All feature columns of the merged table: the original feature-block columns
# plus the derived entropy column. Unpacking into a new list (instead of `+`)
# also works when col_labels is a tuple or a NumPy array rather than a list.
feature_labels = [*col_labels, "syllable_entropy"]

In [None]:
def plot_feature_correlations(
    df: pd.DataFrame,
    feature_cols: list[str],
    target_col: str,
    *,
    sort_by_abs: bool = True,
    cmap: str = "coolwarm",
    fmt: str = ".2f",
) -> None:
    """Show a one-row heatmap of each feature's Pearson r with a target column.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding both the feature columns and the target column.
    feature_cols : list[str]
        Columns to correlate with ``target_col``.
    target_col : str
        Column every feature is correlated against; also used as the row
        label and in the title.
    sort_by_abs : bool, default True
        Order the features by decreasing absolute correlation.
    cmap : str, default "coolwarm"
        Colormap handed to the heatmap.
    fmt : str, default ".2f"
        Number format of the cell annotations.

    Notes
    -----
    Calls ``sns.set(font_scale=1)``, which changes seaborn's global settings,
    and displays the figure with ``plt.show()``.
    """
    # Silence NumPy invalid/divide floating-point warnings during the computation.
    with np.errstate(invalid='ignore', divide='ignore'):
        pearson_r = df[feature_cols].corrwith(df[target_col], method='pearson')

    if sort_by_abs:
        ranked_features = pearson_r.abs().sort_values(ascending=False).index
        pearson_r = pearson_r.reindex(ranked_features)

    # A single-row frame whose row label is the target column.
    heat_row = pd.DataFrame(pearson_r).T
    heat_row.index = [target_col]

    base = 10
    fig_width = max(4, len(pearson_r) * 0.35)
    fig_height = 2.2

    plt.figure(figsize=(fig_width, fig_height))
    sns.set(font_scale=1)

    colorbar_options = {
        "label": "Pearson r",
        "shrink": 0.6,
        "pad": 0.02,
        "aspect": 15,
    }
    ax = sns.heatmap(
        heat_row,
        cmap=cmap,
        center=0,
        vmin=-1,
        vmax=1,
        linewidths=0.6,
        linecolor='black',
        annot=True,
        fmt=fmt,
        annot_kws={"size": base * 0.8},
        cbar_kws=colorbar_options,
    )

    # Feature names run vertically; the target label stays horizontal.
    ax.set_xticklabels(
        ax.get_xticklabels(), rotation=90, ha="center", fontsize=base * 0.8
    )
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=base * 0.9)

    # Restyle the colorbar that the heatmap attached to its mesh.
    colorbar = ax.collections[0].colorbar
    colorbar.ax.tick_params(labelsize=base * 0.8)
    colorbar.ax.set_ylabel(
        "Pearson r", fontsize=base * 0.9, rotation=-90, va='center', labelpad=12
    )

    ax.set_title(
        f"Correlation with {target_col}",
        loc="left",
        weight="bold",
        fontsize=base * 1.1,
    )

    plt.tight_layout()
    plt.show()


In [None]:
# Correlation of every feature with the "fi" column.
plot_feature_correlations(df, feature_labels, "fi")

In [None]:
# Correlation of every feature with the "age" column.
plot_feature_correlations(df, feature_labels, "age")

In [None]:
# Number of columns in the original feature block (syllable_entropy excluded).
len(col_labels)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

def plot_feature_scatter(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    fit_line: bool = True,
) -> None:
    """Scatter two columns against each other and report their Pearson r.

    Rows with a missing value in either column are dropped first. The
    correlation of the remaining rows is shown in the title, and the figure
    is displayed with ``plt.show()``.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing both columns.
    x_col : str
        Column plotted on the x axis.
    y_col : str
        Column plotted on the y axis.
    fit_line : bool, default True
        Overlay seaborn's regression fit as a thin black line.
    """
    pairs = df[[x_col, y_col]].dropna()
    pearson_r, _p_value = pearsonr(pairs[x_col], pairs[y_col])

    _, ax = plt.subplots(figsize=(5, 4))
    sns.scatterplot(data=pairs, x=x_col, y=y_col, s=35, ax=ax)

    if fit_line:
        # scatter=False: draw only the fit; the points are already plotted.
        line_style = {"color": "black", "linewidth": 1}
        sns.regplot(
            data=pairs, x=x_col, y=y_col,
            scatter=False, line_kws=line_style, ax=ax,
        )

    ax.set_title(f"{y_col} vs {x_col}  (r = {pearson_r:.2f})")
    plt.tight_layout()
    plt.show()


In [None]:
# Scatter of "age" against the "embedding_stats_8" feature, with the fitted line.
plot_feature_scatter(df, "embedding_stats_8", "age")

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform

def plot_correlation_grid(
        df: pd.DataFrame,
        feature_cols: list[str],
        *,
        cluster: bool = True,
    ) -> None:
    """
    Square grid whose off-diagonal cells are Pearson r and diagonal
    cells are sample variances, mimicking Fig 3a of the paper.

    Values are coerced to numeric, +/-inf is treated as missing, and columns
    with fewer than two distinct values are dropped before computing
    anything.

    Parameters
    ----------
    df : DataFrame
        Table containing the feature columns.
    feature_cols : list[str]
        Columns to include in the grid.
    cluster : bool, default True
        Re-orders rows/columns via hierarchical clustering if True.

    Raises
    ------
    ValueError
        If fewer than two usable feature columns remain after cleaning.

    Notes
    -----
    The color scale is fixed to [-1, 1], so diagonal variances outside that
    range are drawn in the saturated end colors.
    """
    # ------------------------------------------------------------------
    # 1.   Clean the data
    X = df[feature_cols].apply(pd.to_numeric, errors='coerce').replace(
        [np.inf, -np.inf], np.nan
    )
    # Drop columns with < 2 distinct finite values (variance = 0 or NaN)
    good = X.nunique(dropna=True) > 1
    X = X.loc[:, good]

    if X.shape[1] < 2:
        raise ValueError("Need at least two valid features to plot a grid.")

    # ------------------------------------------------------------------
    # 2.   Correlation + variance
    corr = X.corr(method='pearson')
    var  = X.var()

    # ------------------------------------------------------------------
    # 3.   Optional clustering (on the pure correlation matrix, before the
    #      diagonal is replaced by variances)
    order = None
    if cluster:
        # Build a distance matrix for clustering:
        #   d(i,j) = 1 - |r(i,j)|   (absolute corr -> similarity)
        # Undefined correlations count as r = 0; the clip guards against
        # tiny negative distances caused by floating-point round-off.
        dist_for_linkage = np.clip(
            1 - corr.abs().fillna(0).to_numpy(), 0.0, 1.0
        )
        # linkage expects a condensed distance vector, not a square matrix
        linkage = sch.linkage(squareform(dist_for_linkage, checks=False),
                              method='average', optimal_ordering=True)
        order   = sch.dendrogram(linkage, no_plot=True)['leaves']

    # ------------------------------------------------------------------
    # 4.   Put the variances on the diagonal.
    #      Work on an explicit writable copy: writing through `corr.values`
    #      fails when pandas hands out a read-only array (Copy-on-Write) and
    #      otherwise depends on `.values` being a view.
    grid_values = corr.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(grid_values, var.to_numpy())
    grid = pd.DataFrame(grid_values, index=corr.index, columns=corr.columns)
    if order is not None:
        grid = grid.iloc[order, order]

    # ------------------------------------------------------------------
    # 5.   Plot
    n = len(grid)
    fig_size = max(4, n * 0.35)
    plt.figure(figsize=(fig_size, fig_size))

    sns.heatmap(
        grid,
        cmap='coolwarm',
        center=0,
        vmin=-1, vmax=1,
        linewidths=0.4, linecolor='black',
        square=True,
        cbar_kws={'label': 'Pearson r (off-diag) / Variance (diag)'}
    )

    plt.xticks(rotation=90, ha='center', fontsize=8)
    plt.yticks(rotation=0,  fontsize=8)
    plt.title('Feature × Feature Correlation / Variance Grid',
              loc='left', weight='bold')

    plt.tight_layout()
    plt.show()


In [None]:
# Feature-by-feature correlation/variance grid over all feature columns.
plot_correlation_grid(df, feature_labels)

In [None]:
# Feature matrix for the embedding: one row per row of df, one column per
# entry of feature_labels (the frame is already 2-D, so no stacking is needed).
X = df[feature_labels].to_numpy(dtype=float)

# Fail early, with a readable message, if any feature value is NaN or
# infinite (for example an undefined syllable_entropy).
# NOTE(review): UMAP's input validation is expected to reject such values
# anyway -- confirm for the installed version.
if not np.isfinite(X).all():
    n_bad_rows = int((~np.isfinite(X)).any(axis=1).sum())
    raise ValueError(
        f"{n_bad_rows} rows contain NaN or infinite feature values; "
        "clean or impute them before running UMAP"
    )

# UMAP embedding (fixed random_state so the layout is reproducible)
embedding = umap.UMAP(random_state=42).fit_transform(X)

fi = df["fi"].values

plt.figure(figsize=(6, 6))
sc = plt.scatter(embedding[:, 0], embedding[:, 1], c=fi)
plt.xlabel("UMAP1")
plt.ylabel("UMAP2")
plt.title("UMAP of Frequency Features (colored by FI)")
plt.colorbar(sc, label="FI")
plt.tight_layout()