# Compute features for the encoding model (average variant)

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from tqdm.auto import tqdm
from rl_analysis.batch import apply_parallel_joblib
from rl_analysis.io.df import dlight_exclude
from rl_analysis.photometry.encoding_average.features import get_lagged_features
from rl_analysis.info.util import dm_entropy
import pandas as pd
import natsort as ns
import numpy as np
import os
from functools import partial

In [3]:
# Target dtype for each result column. Downstream groupbys use observed=True,
# so any categorical level that is never observed gets dropped.
convs = dict(
    timestamp="float32",
    snippet="int64",
    syllable="category",
    next_syllable="uint8",
    bin="category",
    time_bin="float32",
    dlight_bin="category",
    mouse_id="category",
    uuid="category",
    counts="uint16",
)

In [4]:
# When True, the lagged features are recomputed even if the results file already exists.
force = True

# Model functions

In [5]:
import toml

# Read the analysis configuration (TOML) that the rest of the notebook draws from.
config_path = "../analysis_configuration.toml"
with open(config_path, "r") as config_handle:
    analysis_config = toml.load(config_handle)

In [6]:
# Pull out the configuration sections referenced below, in one pass.
(
    raw_dirs,
    proc_dirs,
    lagged_cfg,
    encoding_cfg,
    figure_cfg,
    dlight_cfg,
) = (
    analysis_config[section]
    for section in (
        "raw_data",
        "intermediate_results",
        "dlight_lagged_correlations",
        "dlight_encoding_features",
        "figures",
        "dlight_common",
    )
)

In [7]:
# Pick the offline or online snippet file depending on the configuration flag.
if lagged_cfg["use_offline"]:
    file_suffix = "offline"
else:
    file_suffix = "online"
snippet_filename = f"dlight_snippets_{file_suffix}.parquet"
load_file = os.path.join(raw_dirs["dlight"], snippet_filename)

In [8]:
# Derive every input and output path from the snippet file name.
file, ext = os.path.splitext(load_file)
features_save_file = f"{file}_features{ext}"

if lagged_cfg["use_renormalized"]:
    file, ext = os.path.splitext(features_save_file)
    features_save_file = f"{file}_renormalize{ext}"

dirname, filename = os.path.split(features_save_file)

# The usage table sits next to the features file. Substitute "snippet" in the
# file name only, so a directory that happens to contain "snippet" is not rewritten.
rle_save_file = os.path.join(dirname, filename.replace("snippet", "usage"))

# Results are written to the intermediate-results directory under the same base name.
file, ext = os.path.splitext(filename)
file = os.path.join(proc_dirs["dlight"], file)

if lagged_cfg["estimate_within_bin"]:
    file = f"{file}_withinbin"

results_file = f"{file}_encoding_model_average.parquet"
results_tm_file = f"{file}_encoding_model_average.npy"
results_features_file = f"{file}_encoding_model_average_features.parquet"

In [9]:
# One response column name per (window, neural feature) pair; the "(0.0, inf)"
# window is appended to the configured windows.
use_windows = lagged_cfg["use_windows"] + ["(0.0, inf)"]
use_features = [
    f"{feature}_{window}"
    for window in use_windows
    for feature in lagged_cfg["use_neural_features"]
]
scalar_keys = lagged_cfg["usage_and_scalars"]["scalars"]

# Parameters

# Load in pre-processed data

In [10]:
# Read the feature table and the usage (rle) table from their parquet files.
feature_df = pd.read_parquet(features_save_file)
rle_df = pd.read_parquet(rle_save_file)

In [11]:
# Only whole sessions can be dropped at this point because the calculations
# below depend on contiguity; specific trials are removed later.
in_excluded_session = feature_df["session_number"].isin([1, 2])
feature_df = feature_df.loc[~in_excluded_session].copy()

# Keep the usage rows that belong to the remaining uuids.
has_features = rle_df["uuid"].isin(feature_df["uuid"])
rle_df = rle_df.loc[has_features].copy()

In [12]:
# Keep only the rows whose window is one of the windows in use_windows.
feature_df = feature_df.loc[feature_df["window_tup"].isin(use_windows)].copy()

In [13]:
# Load the session bins (later passed to pd.cut) from the intermediate-results directory.
session_bins_path = os.path.join(
    proc_dirs["dlight"], "lagged_analysis_session_bins.toml"
)
with open(session_bins_path, "r") as bins_handle:
    use_session_bins = toml.load(bins_handle)["session_bins"]

In [14]:
# Syllable statistics: usages plus the syllable <-> sorted-index lookup tables.
stats_path = os.path.join(proc_dirs["dlight"], "syllable_stats_photometry_offline.toml")
syllable_stats = toml.load(stats_path)
usage = syllable_stats["usages"]

# TOML keys are strings, so both sides of each lookup are converted to int.
mapping = dict(
    (int(syllable), int(sorted_idx))
    for syllable, sorted_idx in syllable_stats["syllable_to_sorted_idx"].items()
)
reverse_mapping = dict(
    (int(sorted_idx), int(syllable))
    for sorted_idx, syllable in syllable_stats["sorted_idx_to_syllable"].items()
)

use_syllables = np.array(list(mapping))

## Stage data for downstream computation

In [15]:
# Label every row with the integer index of the session bin holding its timestamp;
# the first bin also includes its lower edge.
feature_df["time_bin"] = pd.cut(
    x=feature_df["timestamp"],
    bins=use_session_bins,
    include_lowest=True,
    labels=False,
)

In [16]:
# Append window_tup to the index; nothing to do when it is no longer a column
# (for example when this cell is run a second time).
if "window_tup" in feature_df.columns:
    feature_df = feature_df.set_index("window_tup", append=True)

In [17]:
# Unique values of the last index level (window_tup, appended to the index just above).
wins = feature_df.index.get_level_values(-1).unique()
# Shorthand for slicing on MultiIndex levels.
idx = pd.IndexSlice

In [18]:
# For each window, take the neural feature columns, tag the column names with
# the window, and drop the window level from the index.
neural_cols = lagged_cfg["use_neural_features"]
dfs = []
for _idx in tqdm(wins):
    window_vals = (
        feature_df.loc[idx[:, _idx], neural_cols]
        .add_suffix(f"_{_idx}")
        .droplevel(-1)
    )
    dfs.append(window_vals)

  0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Metadata columns: every column whose name does not match the regex "dff".
meta_cols = feature_df.columns.difference(feature_df.filter(regex="dff").columns)

In [20]:
# Take the metadata rows from a single window. The last window in `wins` is
# named explicitly rather than relying on a loop variable left over from an
# earlier cell; the value is the same.
meta_window = wins[-1]
meta_df = feature_df[meta_cols].loc[idx[:, meta_window], :]
# Drop the window level so the index lines up with the per-window frames.
meta_df.index = meta_df.index.droplevel(-1)
# Remove the columns whose name matches "win".
meta_df = meta_df.drop(meta_df.filter(regex="win").columns, axis=1)

In [21]:
# Per-window feature frames side by side, joined with the metadata.
feature_df = pd.concat(dfs, axis=1).join(meta_df)

# Duration of each usage row = next timestamp within the same uuid minus its own.
next_timestamp = rle_df.groupby("uuid")["timestamp"].shift(-1)
rle_df["duration"] = next_timestamp - rle_df["timestamp"]

# Back to a flat frame, keeping only rows that have a timestamp.
feature_df = feature_df.reset_index().dropna(subset=["timestamp"])

In [22]:
# Number the usage rows within each uuid (0, 1, 2, ...).
rle_df["syllable_num"] = rle_df.groupby(["uuid"])["syllable"].transform(
    lambda x: np.arange(len(x))
)
# Both frames are sorted by timestamp before the as-of merge below.
feature_df = feature_df.sort_values(["timestamp", "uuid"]).dropna(subset=["timestamp"])
rle_df = rle_df.sort_values(["timestamp", "uuid"])
# NOTE(review): presumably cast so the timestamp dtype matches feature_df's — confirm.
rle_df["timestamp"] = rle_df["timestamp"].astype("float32")

feature_df["uuid"] = feature_df["uuid"].astype("str")
# As-of join on timestamp within each uuid, bringing syllable_num into feature_df.
feature_df = pd.merge_asof(
    feature_df, rle_df[["timestamp", "uuid", "syllable_num"]], on="timestamp", by="uuid"
)
# Re-sort both frames by uuid, then by time within each uuid.
feature_df = feature_df.sort_values(["uuid", "timestamp"])
rle_df = rle_df.sort_values(["uuid", "timestamp"])

In [23]:
# Duration of the preceding row within the same uuid (the first row of each uuid gets NaN).
feature_df["prev_duration"] = feature_df.groupby("uuid")["duration"].shift(1)

In [24]:
# Recode syllable labels in both tables through `mapping` (built from syllable_to_sorted_idx).
feature_df["syllable"] = feature_df["syllable"].map(mapping)
rle_df["syllable"] = rle_df["syllable"].map(mapping)

In [25]:
# Syllables that occur in feature_df, ordered by value_counts; this replaces the
# use_syllables array built from the mapping keys earlier.
use_syllables = feature_df["syllable"].value_counts().index.tolist()
# Number of distinct syllables, passed to get_lagged_features as K.
K = len(use_syllables)

# Now we compute values triggered on syllable instances, storing features we want to split by downstream (dLight, scalars, etc.)

In [26]:
# Bins for the "average_variant" section of the encoding-features configuration.
usage_bins = encoding_cfg["average_variant"]["bins"]

# Scalar columns handed to get_lagged_features as additional scalar keys.
regress_scalars = [
    "velocity_2d_mm",
    "velocity_height",
    "velocity_angle",
    "acceleration_2d_mm",
]

idx = pd.IndexSlice

# Replace the current index with a fresh positional one.
feature_df = feature_df.reset_index(drop=True)

In [29]:
# Display the path the results parquet file is written to / read from.
results_file

'/home/markowitzmeister_gmail_com/jeff_win_share/reinforcement_data/_final_test/_data/dlight_intermediate_results/dlight_snippets_offline_features_withinbin_encoding_model_average.parquet'

In [30]:
# One group per uuid, leaving out rows from session numbers 1 and 2.
in_excluded_session = feature_df["session_number"].isin([1, 2])
group_obj = feature_df.loc[~in_excluded_session].groupby(["uuid"])

In [31]:
# Number of uuid groups that will be processed in parallel.
print(group_obj.ngroups)

666


In [32]:
# Bind the fixed arguments of get_lagged_features so it can be applied to each
# uuid group with the group as its only remaining argument.
lagged_feature_kwargs = {
    "chk_syllables": use_syllables,
    "dlight_features": use_features,
    "usage_bins": usage_bins,
    "truncate": syllable_stats["truncate"],
    "additional_scalar_keys": regress_scalars + ["duration"],
    "additional_syllable_keys": ["duration"],
    "K": K,
}
func = partial(get_lagged_features, **lagged_feature_kwargs)

In [33]:
# Compute the per-uuid lagged features, or load them from disk when a results
# file exists and `force` is off. The "tm" column is kept out of the parquet
# file and stored separately with np.save; it is re-attached in both branches.
if force or not os.path.exists(results_file):
    syllable_rates = apply_parallel_joblib(group_obj, func, n_jobs=-10, backend="loky")

    syllable_rates = syllable_rates.reset_index()
    syllable_rates["total_duration"] = (
        syllable_rates["count"] * syllable_rates["duration"]
    )

    # Apply the dtype table; entries for columns not present in the results are skipped.
    for k, v in tqdm(convs.items()):
        if k in syllable_rates.columns:
            syllable_rates[k] = syllable_rates[k].astype(v)

    # .copy() makes save_rates an independent frame, so adding "tm" back below
    # does not assign into a slice of syllable_rates (SettingWithCopyWarning).
    save_rates = syllable_rates[syllable_rates.columns.difference(["tm"])].copy()
    tm_list = syllable_rates["tm"].to_list()
    save_rates.to_parquet(results_file)
    np.save(results_tm_file, tm_list)
    save_rates["tm"] = tm_list
else:
    save_rates = pd.read_parquet(results_file)
    tm_mat = np.load(results_tm_file)
    # One entry per row of save_rates, split along the first axis of the saved array.
    tm_list = list(tm_mat)
    save_rates["tm"] = tm_list

  ret_list = Parallel(n_jobs=n_jobs, verbose=verbose, backend=backend, batch_size=batch_size)(
[Parallel(n_jobs=-10)]: Using backend LokyBackend with 119 concurrent workers.
[Parallel(n_jobs=-10)]: Done   4 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-10)]: Done  27 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-10)]: Done  50 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-10)]: Done  75 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-10)]: Done 100 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-10)]: Done 127 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-10)]: Done 154 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-10)]: Done 183 tasks      | elapsed:   60.0s
[Parallel(n_jobs=-10)]: Done 212 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-10)]: Done 243 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-10)]: Done 274 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-10)]: Done 307 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-10)]: Done 340 tasks      | elapsed:  1.6min
[Parall

  0%|          | 0/10 [00:00<?, ?it/s]

# Stage features

In [34]:
# The neural response columns are the windowed feature names built earlier.
neural_keys = use_features

In [35]:
# Work on a copy so the filtering below leaves save_rates untouched.
filt_rates = save_rates.copy()

In [36]:
# Keep rows with duration above 0.3, then apply dlight_exclude with the shared
# dlight configuration.
long_enough = filt_rates["duration"] > 0.3
filt_rates = filt_rates[long_enough].copy()
filt_rates = dlight_exclude(
    filt_rates,
    syllable_key="syllable",
    exclude_3s=False,
    **dlight_cfg,
)

In [37]:
# Re-bin the filtered rows by session time. The bins and the pd.cut options
# match the time_bin computed for feature_df earlier in the notebook:
# include_lowest=True puts a timestamp equal to the first bin edge in bin 0
# instead of turning it into NaN.
with open(
    os.path.join(proc_dirs["dlight"], "lagged_analysis_session_bins.toml"), "r"
) as f:
    use_session_bins = toml.load(f)["session_bins"]

filt_rates["time_bin"] = pd.cut(
    filt_rates["timestamp"], use_session_bins, labels=False, include_lowest=True
)

In [38]:
# Count syllable occurrences per mouse, restricted to uuids that survived the
# filtering above.
surviving_uuids = filt_rates["uuid"].unique().tolist()
usage = (
    rle_df.loc[rle_df["uuid"].isin(surviving_uuids)]
    .groupby(["mouse_id"])["syllable"]
    .value_counts(normalize=False)
    .rename("count")
)

# Keep only the (mouse_id, syllable) pairs that occur more than 100 times.
include_pairs = usage.loc[usage.gt(100)].index

rates_by_pair = filt_rates.set_index(["mouse_id", "syllable"])
filt_rates = rates_by_pair.loc[include_pairs].reset_index()

## Construct a feature matrix with one column per feature and lag (feature0_lag1 ... featureM_lag1, ..., feature0_lagN ... featureM_lagN)

In [39]:
from functools import reduce
from numba import jit

cache = False


@jit(nopython=True, cache=cache)
def zscore_nb(values, index):
    """Z-score `values`, ignoring NaNs when computing the mean and the std.

    `index` is accepted but not used in the computation.
    """
    center = np.nanmean(values)
    spread = np.nanstd(values)
    return (values - center) / spread

In [41]:
idx = pd.IndexSlice

# Per-lag regressors: velocity, count, "tm", and the *_global_bin column of each neural key.
binned_neural = [f"{key}_global_bin" for key in neural_keys]
features = ["velocity_2d_mm_global_bin", "count", "tm", *binned_neural]

# Columns carried along as metadata.
meta_keys = ["mouse_id", "session_number", "area"]

In [45]:
# Independent copy of the filtered rates; reset_index moves the current index into the columns.
use_syllable_rates = filt_rates.copy().reset_index()

In [46]:
# Sorted array of the distinct values in the "bin" column.
lags = np.array(sorted(use_syllable_rates["bin"].unique()))

In [47]:
# Every lag is included; narrow this selection to restrict the feature matrix.
include_lags = lags

In [48]:
# Index the rates by "bin"; nothing to do when it is no longer a column
# (for example when this cell is run a second time).
if "bin" in use_syllable_rates.columns:
    use_syllable_rates = use_syllable_rates.set_index("bin")

In [49]:
# One frame per lag in include_lags, all keyed by the same row index; each
# column name gets a "_<lag>" suffix. No lag is skipped.
row_keys = ["uuid", "date", "syllable", "time_bin", "trans_number"]
lsts = []
all_features = []
for _lag in include_lags:
    lag_df = (
        use_syllable_rates.loc[_lag]
        .reset_index(drop=True)
        .set_index(row_keys)[features]
        .sort_index()
        .add_suffix(f"_{_lag}")
    )
    all_features.extend(lag_df.columns.tolist())
    lsts.append(lag_df)

In [50]:
# Put the per-lag feature frames side by side.
regress_df = pd.concat(lsts, axis=1)

# The metadata and neural response columns are read from the rows of a single
# reference lag bin. Naming the bin (instead of a bare 10 inside .loc) documents
# the choice and gives a descriptive error if that bin is absent.
meta_lag = 10
try:
    meta_rows = use_syllable_rates.loc[meta_lag]
except KeyError as err:
    raise KeyError(
        f"lag bin {meta_lag!r} not found in use_syllable_rates; "
        f"available bins: {list(include_lags)}"
    ) from err

regress_df = pd.concat(
    (
        regress_df,
        meta_rows.set_index(["uuid", "date", "syllable", "time_bin", "trans_number"])
        .sort_index()[meta_keys + neural_keys],
    ),
    axis=1,
)
# Keep only rows with a value in every column.
regress_df = regress_df.dropna()

In [51]:
# Split the lagged columns into "tm" columns and everything else; only the
# latter are z-scored.
tm_features = [col for col in all_features if "tm" in col]
z_features = [col for col in all_features if "tm" not in col]

regress_df = regress_df.reset_index()
regress_df["syllable"] = regress_df["syllable"].astype("int")

# current results use this...
z_keys_between = ["mouse_id"]

zscore_data = True
zscore_response = True

# Z-score the regressors, then the responses, within each z_keys_between group.
for _enabled, _cols in ((zscore_data, z_features), (zscore_response, neural_keys)):
    if _enabled:
        regress_df[_cols] = regress_df.groupby(z_keys_between)[_cols].transform(
            zscore_nb, engine="numba"
        )

zscore_ents = False
zscore_ents_keys = ["mouse_id"]

In [52]:
# Neural response column names in natural sort order.
neural_sorted_cols = ns.natsorted(neural_keys)

In [53]:
# pool different feature types
ave_keys = ["area", "mouse_id", "syllable"]
ave_mapping = {
    "count": ave_keys,
    "velocity": ave_keys,
    ".*dff.*global_bin.*": ave_keys,
    ".*com_\d+$": ave_keys,
    ".*com_rank_\d+$": ave_keys,
}
index_keys = ["area", "mouse_id", "syllable"]
beh_aves = []
for k, v in tqdm(ave_mapping.items()):
    _cols = regress_df.filter(regex=k).columns
    if v is not None:
        _ave = regress_df.groupby(v)[_cols].mean()
    else:
        _ave = regress_df.set_index(index_keys, append=True)[_cols]
    beh_aves.append(_ave)

if ave_keys is not None:
    common_keys = list(
        reduce(
            lambda left, right: set(left).intersection(set(right)), ave_mapping.values()
        )
    )
    use_beh_features = reduce(
        lambda left, right: pd.merge(left, right, how="left", on=common_keys), beh_aves
    ).dropna()
else:
    use_beh_features = reduce(
        lambda left, right: pd.merge(
            left, right, left_index=True, right_index=True, how="left"
        ),
        beh_aves,
    ).dropna()

  0%|          | 0/5 [00:00<?, ?it/s]

In [54]:
# now pool dlight features and tms for entropy calculation
tm_group_keys = ["area", "mouse_id", "syllable"]
neural_ave_keys = ave_keys
use_tms = regress_df.groupby(tm_group_keys)[tm_features].apply(np.sum)

if neural_ave_keys is not None:
    use_neural_features = (
        regress_df.groupby(neural_ave_keys)[neural_sorted_cols].mean().dropna()
    )
elif ave_keys is not None:
    use_neural_features = regress_df.set_index(ave_keys)[neural_sorted_cols]
else:
    use_neural_features = regress_df.set_index(index_keys, append=True)[
        neural_sorted_cols
    ]

In [56]:
# Index of the groups in which none of the summed "tm" entries is a scalar.
has_scalar_tm = use_tms.applymap(np.isscalar).any(axis=1)
to_use = use_tms[~has_scalar_tm].index

In [57]:
# Restrict the tms to the usable groups.
use_tms = use_tms.loc[to_use]
# Re-index the behavioural features by the same keys as use_tms ...
use_beh_features = use_beh_features.reset_index().set_index(use_tms.index.names)
# ... and keep only the usable groups that are present there.
use_beh_features = use_beh_features.loc[to_use.intersection(use_beh_features.index)]

In [58]:
# Size of the top-left block of each "tm" matrix that the entropy function slices out.
truncate = syllable_stats["truncate"]

In [59]:
def ent_func(x):
    """Apply dm_entropy to the top-left `truncate` x `truncate` block of `x`."""
    block = x[:truncate, :truncate]
    return dm_entropy(block, alpha="perks", marginalize=False, axis=1)

In [60]:
# Apply ent_func to every entry of each "tm" column; the resulting columns are
# renamed by replacing "tm" with "entropy".
ents = [
    use_tms[tm_col].apply(ent_func).rename(tm_col.replace("tm", "entropy"))
    for tm_col in tqdm(use_tms.columns)
]
ent_df = pd.concat(ents, axis=1)

  0%|          | 0/10 [00:00<?, ?it/s]

In [61]:
# Move the group keys from the index into columns.
ent_df = ent_df.reset_index()

In [62]:
# Optionally z-score the entropy columns within each zscore_ents_keys group.
if zscore_ents:
    ent_keys = ent_df.filter(regex="entropy").columns.tolist()
    ent_df[ent_keys] = ent_df.groupby(zscore_ents_keys)[ent_keys].transform(
        zscore_nb, engine="numba"
    )

In [63]:
# Index the entropies by the same keys as the behavioural features so the two can be merged on index.
ent_df = ent_df.set_index(use_beh_features.index.names)

In [64]:
# Left-join the entropy columns onto the behavioural features by index.
use_beh_features = use_beh_features.merge(
    ent_df,
    left_index=True,
    right_index=True,
    how="left",
)

In [65]:
# Keys used to index the behavioural features before merging with the neural features.
if neural_ave_keys is not None:
    merge_keys = list(set(neural_ave_keys).intersection(common_keys))
elif ave_keys is not None:
    merge_keys = ave_keys
else:
    # NOTE(review): "level_0" presumably names the unnamed row level produced by reset_index — confirm.
    merge_keys = ["level_0"] + index_keys

In [66]:
# Re-index the behavioural features by merge_keys and sort on that index.
use_beh_features = use_beh_features.reset_index().set_index(merge_keys).sort_index()

In [67]:
# Put the index levels in the same order as the neural features' index.
neural_level_order = use_neural_features.index.names
use_beh_features = use_beh_features.reorder_levels(neural_level_order)

In [68]:
# Left-join the behavioural features onto the neural features by index.
use_merged_features = use_neural_features.merge(
    use_beh_features,
    left_index=True,
    right_index=True,
    how="left",
)

In [69]:
# Drop rows with a missing value in any column.
use_merged_features = use_merged_features.dropna()

In [70]:
# Write the merged feature table to the features results file.
use_merged_features.to_parquet(results_features_file)

In [71]:
# Per-area correlation between signal_reref_dff_z_max_(0.0, inf) and every
# velocity column, displayed without row or column truncation.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        use_merged_features.groupby("area")
        .corr()
        .xs("signal_reref_dff_z_max_(0.0, inf)", level=-1)
        .filter(regex="velocity")
    )

Unnamed: 0_level_0,velocity_2d_mm_global_bin_5,velocity_2d_mm_global_bin_10,velocity_2d_mm_global_bin_25,velocity_2d_mm_global_bin_50,velocity_2d_mm_global_bin_100,velocity_2d_mm_global_bin_200,velocity_2d_mm_global_bin_300,velocity_2d_mm_global_bin_400,velocity_2d_mm_global_bin_800,velocity_2d_mm_global_bin_1600
area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dls,-0.422203,-0.396877,-0.350213,-0.293097,-0.235119,-0.178612,-0.143672,-0.122178,-0.092409,-0.072143
dms,-0.368783,-0.288867,-0.241544,-0.181825,-0.119823,-0.134772,-0.117156,-0.110005,-0.121549,-0.118961
