# Analysis on how scalars evolve across age

- both before and after size normalization

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from toolz import sliding_window
from aging.util import correct_for_camera_height
from scipy.spatial.distance import pdist, squareform
from aging.plotting import figure, format_plots, save_factory, PlotConfig, add_identity

In [2]:
# Apply the plotting defaults provided by aging.plotting.format_plots
# (presumably matplotlib rc settings — confirm in aging.plotting).
format_plots()

# Disabled save call; note it targets "fig4", unlike the active saves in this notebook.
#c = PlotConfig()
#fig.savefig(c.dana_save_path / "fig4"/ 'fvm_syll_diff__over_life_heatmap.pdf')

In [3]:
# Load the female ontogeny syllable dataframe for the selected data version.
# `version` is zero-padded to two digits when building the directory name.
version = 11
path = Path(f"/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}/ontogeny_females_syllable_df_v00.parquet")
df = pd.read_parquet(path, engine="pyarrow")

In [4]:
# Add camera-height-corrected width and length columns.
# correct_for_camera_height comes from aging.util; presumably it rescales pixel
# measurements using the per-frame depth — confirm against that helper.
df["corrected_width"] = correct_for_camera_height(df["width_px"], df["true_depth"])
df["corrected_length"] = correct_for_camera_height(df["length_px"], df["true_depth"])
area = df["area_px"]
# Area correction is currently disabled (only positive areas would be corrected).
#df["corrected_area"] = np.square(
#    correct_for_camera_height(np.sqrt(area[area > 0]), df.loc[area > 0, "true_depth"])
#)

In [5]:
# Apply the same camera-height correction to the 2D velocity (px per frame — TODO confirm units).
df['corrected_velocity_2d'] = correct_for_camera_height(df['velocity_2d_px'], df['true_depth'])

In [6]:
# Copy only the columns needed for the velocity / position analyses into a
# polars DataFrame; later cells group by "age" and "uuid".
pl_df = pl.DataFrame(
    df[
        [
            "age",
            "corrected_velocity_2d",
            "velocity_2d_px",
            "session_name",
            "uuid",
            "subject_name",
            "centroid_x_px",
            "centroid_y_px",
        ]
    ]
)
# Fix velocity calculation when there are many dropped frames.
# Step 1: multiply each velocity by a boolean mask that is False when the
#         previous sample is null, zeroing velocities that directly follow a gap.
# Step 2: turn every zero velocity (including the ones zeroed in step 1) into null.
# The result must overwrite "corrected_velocity_2d" itself: when/then(None) yields
# a literal-named column, so the alias has to be the plain column name. The earlier
# alias contained literal quote characters and silently created a separate column.
pl_df = pl_df.with_columns(
    (
        ~pl.col("corrected_velocity_2d").is_null().shift(1).fill_null(False)
        * pl.col("corrected_velocity_2d")
    )
).with_columns(
    pl.when(pl.col("corrected_velocity_2d") == 0)
    .then(None)
    .otherwise(pl.col("corrected_velocity_2d"))
    .alias("corrected_velocity_2d")
)
# Bin the velocities into `n_bins` equal-width bins spanning 0-5.
# Values at or below the first edge are labelled "low", values above the last
# edge "high"; every interior bin is labelled with the string of its midpoint.
n_bins = 25
speed_edges = np.linspace(0, 5, n_bins + 1)
speed_labels = [
    "low",
    *(str(np.mean(edge_pair)) for edge_pair in sliding_window(2, speed_edges)),
    "high",
]
pl_df = pl_df.with_columns(
    pl.col("corrected_velocity_2d").cut(speed_edges, labels=speed_labels).alias("bins")
)

# Add distance to center.
# 1. Re-center the centroid coordinates per session ("uuid") by subtracting the
#    midpoint of the observed range, (max + min) / 2. Subtracting only the
#    half-range (max - min) / 2 is correct solely when the minimum is 0.
#    Note this overwrites centroid_x_px / centroid_y_px with centered values.
# 2. Compute the Euclidean distance of the centered centroid from the origin.
# 3. Bin the distance into `n_bins` equal-width bins spanning 0-140, labelled
#    "low" / bin midpoint / "high" in the same way as the velocity bins.
pl_df = (
    pl_df.lazy()
    .with_columns(
        (
            pl.col("centroid_x_px")
            - (pl.col("centroid_x_px").max() + pl.col("centroid_x_px").min()) / 2
        ).over("uuid"),
        (
            pl.col("centroid_y_px")
            - (pl.col("centroid_y_px").max() + pl.col("centroid_y_px").min()) / 2
        ).over("uuid"),
    )
    .with_columns(
        (pl.col("centroid_x_px") ** 2 + pl.col("centroid_y_px") ** 2)
        .sqrt()
        .alias("distance_to_center")
    )
    .with_columns(
        pl.col("distance_to_center")
        .cut(
            np.linspace(0, 140, n_bins + 1),
            labels=["low"]
            + [
                str(np.mean(f))
                for f in sliding_window(2, np.linspace(0, 140, n_bins + 1))
            ]
            + ["high"],
        )
        .alias("distance_bins")
    )
    .collect()
)

In [7]:
# For each age, count how many frames fall into each velocity bin.
grouping = pl_df.groupby("age").agg(pl.col("bins").value_counts())

In [8]:
# Build {age: {speed_bin: count}}. Null / "low" / "high" bins are dropped, and
# the bin-midpoint label is scaled by 30 (presumably frames per second, turning
# px/frame into px/s — TODO confirm) and rounded to one decimal.
vel_hist = {
    age: {
        round(float(entry["bins"]) * 30, 1): entry["counts"]
        for entry in bin_counts
        if entry["bins"] not in (None, "low", "high")
    }
    for age, bin_counts in grouping.iter_rows()
}

In [9]:
# Age x speed-bin table of probabilities.
# Bins that never occur for an age are missing from the dict and would become
# NaN; they represent zero observations, so fill them with 0 (as the other
# histogram tables in this notebook do) before normalizing.
vel_hist_df = pd.DataFrame(vel_hist).T.fillna(0)
vel_hist_df.columns.name = "Speed (px/s)"
vel_hist_df.index.name = "Age (weeks)"
vel_hist_df = vel_hist_df.sort_index().sort_index(axis='columns')
# Normalize each row (age) so its bins sum to 1.
vel_hist_df = vel_hist_df / vel_hist_df.sum(axis=1).values[:, None]

In [10]:
# Heatmap of the per-age speed distribution (rows: speed bins, columns: ages).
fig = figure(1.55, 1.2)
ax = sns.heatmap(
    # Probabilities are clipped to [10**-2.5, 0.12] before plotting.
    vel_hist_df.clip(10 ** (-2.5), 0.12).T,
    cmap="cubehelix",
    cbar_kws={"label": "Probability"},
    norm="log",
    # NOTE(review): 0 cannot be represented on a log scale; seaborn appears to
    # ignore vmin when `norm` is given — confirm and consider dropping this.
    vmin=0,
)
# Flip the y axis so the lowest speed bin is at the bottom.
ax.invert_yaxis()
c = PlotConfig()
fig.savefig(c.dana_save_path / "fig3"/ 'scalars_female_velocity_heatmap.pdf')

In [11]:
# Expected speed per age: probability-weighted sum of the bin centers,
# smoothed with a rolling mean over 4 consecutive ages.
fig = figure(1.55, 1.2)
ax=(vel_hist_df * vel_hist_df.columns.to_numpy()[None]).sum(1).rolling(4).mean().plot()
ax.set_ylim([0,148])
c = PlotConfig()
fig.savefig(c.dana_save_path / "fig3"/ 'scalars_female_velocity_line.pdf')

In [12]:
# Mean speed per (age, session): keep only velocities strictly inside (0, 5),
# average them within each group, and scale by 30 (presumably px/frame -> px/s).
_in_speed_range = pl.col("corrected_velocity_2d").is_between(0, 5, closed="none")
mean_velocities = (
    pl_df.lazy()
    .filter(_in_speed_range)
    .groupby(["age", "uuid"])
    .agg(pl.col("corrected_velocity_2d").mean() * 30)
    .collect()
)

In [13]:
# Mean speed across age; the band is the standard error across sessions of the same age.
sns.lineplot(data=mean_velocities, x='age', y='corrected_velocity_2d', errorbar='se')

In [14]:
# 95th-percentile ("peak") speed per (age, session): keep only velocities
# strictly inside (0, 5), take the 0.95 quantile within each group, and scale
# by 30 (presumably px/frame -> px/s).
_in_speed_range = pl.col("corrected_velocity_2d").is_between(0, 5, closed="none")
high_velocities = (
    pl_df.lazy()
    .filter(_in_speed_range)
    .groupby(["age", "uuid"])
    .agg(pl.col("corrected_velocity_2d").quantile(0.95) * 30)
    .collect()
)

In [None]:
# Peak (95th-percentile) speed across age, drawn as a black line with a
# standard-error band that has no outline (lw=0).
fig = figure(1.55, 1.2)
ax = sns.lineplot(
    data=high_velocities,
    x="age",
    y="corrected_velocity_2d",
    errorbar="se",
    err_kws={"lw": 0},
    c="k",
    lw=1
)
ax.set_ylim([0,148])
ax.set(ylabel="Peak speed (px/s)", xlabel="Age (weeks)")
sns.despine()
c = PlotConfig()
# This notebook analyses the female dataset, so save next to the other female
# panels under a female-specific name. The previous target
# ("fig2"/'scalars_male_velocity_line.pdf') was a leftover male path, and
# 'scalars_female_velocity_line.pdf' is already used by the mean-speed plot.
fig.savefig(c.dana_save_path / "fig3"/ 'scalars_female_peak_velocity_line.pdf')

## Distance to center

In [None]:
# Count frames per distance-to-center bin for every age, then build
# {age: {bin_center: count}}. Null / "low" / "high" bins are dropped and the
# bin-midpoint label is converted to a float rounded to one decimal.
dist_grouping = pl_df.groupby("age").agg(pl.col("distance_bins").value_counts())
dist_hist = {
    age: {
        round(float(entry['distance_bins']), 1): entry['counts']
        for entry in bin_counts
        if entry['distance_bins'] not in (None, 'low', 'high')
    }
    for age, bin_counts in dist_grouping.iter_rows()
}

In [None]:
# Age x distance-bin table: missing bins count as 0, both axes are sorted, and
# each row (age) is normalized to sum to 1.
dist_hist_df = (
    pd.DataFrame(dist_hist)
    .T.fillna(0)
    .rename_axis(index="Age (weeks)", columns="Distance to center (px)")
    .sort_index()
    .sort_index(axis='columns')
)
dist_hist_df = dist_hist_df.div(dist_hist_df.sum(axis=1), axis=0)

In [None]:
# Heatmap of the per-age distance-to-center distribution
# (rows: distance bins, columns: ages).
fig = figure(1.55, 1.2)
ax = sns.heatmap(
    # Probabilities are clipped to [10**-2.5, 0.12] before plotting.
    dist_hist_df.clip(10 ** (-2.5), 0.12).T,
    cmap="cubehelix",
    cbar_kws={"label": "Probability"},
    norm="log",
    # NOTE(review): 0 cannot be represented on a log scale; seaborn appears to
    # ignore vmin when `norm` is given — confirm and consider dropping this.
    vmin=0,
)
# Flip the y axis so the smallest distance bin is at the bottom.
ax.invert_yaxis()
c = PlotConfig()
fig.savefig(c.dana_save_path / "fig3"/ 'scalars_female_dist_to_center_heatmap.pdf')

In [None]:
# Expected distance to center per age: probability-weighted sum of the bin
# centers, smoothed with a rolling mean over 4 consecutive ages.
fig = figure(1.55, 1.2)
ax=(dist_hist_df * dist_hist_df.columns.to_numpy()[None]).sum(1).rolling(4).mean().plot()
# NOTE(review): same y-limit as the speed plots, although distance bins stop at 140 — confirm intended.
ax.set_ylim([0,148])
c = PlotConfig()
sns.despine()
fig.savefig(c.dana_save_path / "fig3"/ 'scalars_female_dist_to_center_line.pdf')

In [None]:
# Mean distance to center per (age, session).
# Keep distances inside (0, 140], the same range covered by the non-"low"/"high"
# distance bins. The previous bounds (0, 5) and the `* 30` factor were carried
# over from the velocity cell: 5 is the velocity cutoff and 30 rescales a
# per-frame velocity, neither of which applies to a distance in pixels.
mean_center = (
    pl_df.lazy()
    .filter(pl.col("distance_to_center").is_between(0, 140, closed="right"))
    .groupby(["age", "uuid"])
    .agg(pl.mean("distance_to_center"))
).collect()

In [None]:
# Mean distance to center across age; the band is the standard error across sessions of the same age.
sns.lineplot(data=mean_center, x='age', y='distance_to_center', errorbar='se')

## Combine velocity and dist to center, classify/regress age

In [None]:
# Same distance histogram as before, but per session: count frames per distance
# bin for every (age, uuid) pair and build {(age, uuid): {bin_center: count}}.
# Null / "low" / "high" bins are dropped.
dist_grouping = pl_df.groupby(["age", "uuid"]).agg(pl.col("distance_bins").value_counts())
dist_hist = {
    (age, uuid): {
        round(float(entry['distance_bins']), 1): entry['counts']
        for entry in bin_counts
        if entry['distance_bins'] not in (None, 'low', 'high')
    }
    for age, uuid, bin_counts in dist_grouping.iter_rows()
}

In [None]:
# (age, uuid) x distance-bin table: missing bins count as 0, both axes are
# sorted, and each row (session) is normalized to sum to 1.
indiv_dist_hist_df = (
    pd.DataFrame(dist_hist)
    .T.fillna(0)
    .rename_axis(index=["Age (weeks)", "UUID"], columns="Distance to Center (px)")
    .sort_index()
    .sort_index(axis='columns')
)
indiv_dist_hist_df = indiv_dist_hist_df.div(indiv_dist_hist_df.sum(axis=1), axis=0)

In [None]:
# Per-session distance-to-center distributions (rows: sessions ordered by age,
# columns: distance bins). NaNs are replaced by 1e-6, values are clipped to
# [10**-2.5, 0.15], and the UUID index level is dropped so rows are labelled by age.
fig = figure(1.55, 1.2)
ax = sns.heatmap(
    indiv_dist_hist_df.fillna(1e-6).clip(10 ** (-2.5), 0.15).droplevel(1),
    cmap="cubehelix",
    cbar_kws={"label": "Probability"},
    norm="log",
)
# saver(fig, "individual-center-dist-histogram-across-age");

In [None]:
# Pairwise Jensen-Shannon distance between every pair of session histograms,
# expanded from condensed form into a square sessions x sessions matrix.
dist_dists = squareform(pdist(indiv_dist_hist_df.fillna(0), metric="jensenshannon"))

fig = figure(1.4, 1.1)
ax = sns.heatmap(
    dist_dists,
    cmap="cubehelix",
    vmin=0,
    vmax=0.35,
    # NOTE(review): SciPy's Jensen-Shannon metric uses the natural logarithm unless
    # a base is given, so the "bits" unit in this label may be inaccurate — confirm.
    cbar_kws=dict(label="Distance - JSD (bits)"),
    rasterized=True,
)
ax.set(xlabel="Sessions", ylabel="Sessions", aspect="equal")
# saver(fig, "individual-center-dist-histogram-distance-mtx", dpi=600);

In [None]:
# For each (age, session) pair, count how many frames fall into each velocity bin.
# Note: this rebinds `grouping`, which previously held the per-age counts.
grouping = pl_df.groupby(["age", "uuid"]).agg(pl.col("bins").value_counts())

In [None]:
# Build {(age, uuid): {speed_bin: count}}. Null / "low" / "high" bins are
# dropped, and the bin-midpoint label is scaled by 30 (presumably px/frame ->
# px/s — TODO confirm) and rounded to one decimal.
indiv_vel_hist = {
    (age, uuid): {
        round(float(entry['bins']) * 30, 1): entry['counts']
        for entry in bin_counts
        if entry['bins'] not in (None, 'low', 'high')
    }
    for age, uuid, bin_counts in grouping.iter_rows()
}

In [None]:
# (age, uuid) x speed-bin table: missing bins count as 0, both axes are sorted,
# and each row (session) is normalized to sum to 1.
indiv_vel_hist_df = (
    pd.DataFrame(indiv_vel_hist)
    .T.fillna(0)
    .rename_axis(index=["Age (weeks)", "UUID"], columns="Speed (px/s)")
    .sort_index()
    .sort_index(axis='columns')
)
indiv_vel_hist_df = indiv_vel_hist_df.div(indiv_vel_hist_df.sum(axis=1), axis=0)

In [None]:
# Per-session speed distributions (rows: sessions ordered by age, columns:
# speed bins). NaNs are replaced by 1e-6, values are clipped to
# [10**-2.5, 0.15], and the UUID index level is dropped so rows are labelled by age.
fig = figure(1.55, 1.2)
ax = sns.heatmap(
    indiv_vel_hist_df.fillna(1e-6).clip(10 ** (-2.5), 0.15).droplevel(1),
    cmap="cubehelix",
    cbar_kws={"label": "Probability"},
    norm="log",
)
# saver(fig, "individual-velocity-histogram-across-age");

In [None]:
# Feature matrix: distance-bin and speed-bin probabilities side by side, one row
# per (age, uuid); entries missing after index alignment are filled with 0.
combined_data = pd.concat([indiv_dist_hist_df, indiv_vel_hist_df], axis=1).fillna(0)

In [None]:
# StratifiedKFold classifier first
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GroupKFold, LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score

In [None]:
# Target vector: the first index level of combined_data, i.e. the age of each session.
ages = combined_data.index.get_level_values(0).values

In [None]:
# Standardize the features, then fit a logistic-regression classifier that treats each age as a class.
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [None]:
# Cross-validated score of the logistic-regression pipeline for predicting age
# as a class label: 4 stratified folds repeated 10 times (40 scores).
scores = cross_val_score(
    pipeline,
    combined_data,
    ages,
    # cv=StratifiedKFold(n_splits=5),
    cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=10),
)
scores

In [None]:
# Same age-classification task with a default random forest:
# 4 stratified folds repeated 3 times (12 scores). Overwrites `scores`.
scores = cross_val_score(
    RandomForestClassifier(),
    combined_data,
    combined_data.index.get_level_values(0),
    # cv=StratifiedKFold(n_splits=5),
    cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=3),
)
scores

In [None]:
# regression form
pipeline = make_pipeline(StandardScaler(), PLSRegression(n_components=5))
preds = cross_val_predict(
    pipeline,
    combined_data,
    combined_data.index.get_level_values(0),
    cv=LeaveOneGroupOut(),
    groups=LabelEncoder().fit_transform(combined_data.index.get_level_values(0)).squeeze(),
)
r2_score(ages, preds)

In [None]:
# Scatter of held-out predicted age against true age, with a dashed identity
# line and the R^2 value in the title.
r2 = r2_score(ages, preds)

fig = figure(1, 1)
ax = fig.gca()
ax.scatter(ages, preds, s=1, c='k')
add_identity(ax, color='gray', ls='--')
ax.set(title=f"R2 = {r2:.2f}", xlabel="True age (weeks)", ylabel="Predicted age (weeks)")
sns.despine()
# Build the save config in this cell, as the other save cells do, instead of
# relying on a `c` left over from a much earlier plotting cell.
c = PlotConfig()
fig.savefig(c.dana_save_path / "fig3"/ 'scalars_female_classifier.pdf')