# Compute session-level aggregate syllable counts

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from tqdm.auto import tqdm
from functools import partial
from rl_analysis.behavior.util import normalize_df, filter_feedback_dataframe
from rl_analysis.batch import apply_parallel_joblib
from rl_analysis.io.df import get_closed_loop_parquet_columns
import pandas as pd

import os
import numpy as np

# Load in raw data and normalize

In [3]:
import toml

# Parse the shared analysis configuration that sits one directory above this
# notebook. TOML documents are UTF-8 by specification, so the encoding is
# stated explicitly rather than inherited from the platform's locale default
# (which is not UTF-8 on every system).
with open("../analysis_configuration.toml", "r", encoding="utf-8") as f:
    analysis_config = toml.load(f)

In [4]:
# Pull the configuration sections out of the parsed TOML.
# Directory tables: raw inputs and intermediate results.
raw_dirs, proc_dirs = (
    analysis_config["raw_data"],
    analysis_config["intermediate_results"],
)
# Settings tables: closed-loop-specific options and options shared across analyses.
closed_loop_cfg, common_cfg = (
    analysis_config["closed_loop_behavior"],
    analysis_config["common"],
)

In [5]:
# Path of the closed-loop behavior table inside the configured raw-data directory.
fname = os.path.join(
    raw_dirs["closed_loop_behavior"],
    "closed_loop_behavior.parquet",
)
# Column subset to read, chosen by the project helper with the `pcs` and
# `likes` options switched off.
cols = get_closed_loop_parquet_columns(fname, likes=False, pcs=False)

In [6]:
# Experiment types to keep. The list is handed to the parquet reader as a
# filter on the "experiment_type" column.
experiment_types = [
    "reinforcement",
    "reinforcement_photometry",
    "excitation",
    "excitation_photometry",
    "excitation_pulsed",
    "excitation_pulsed_photometry",
]

# Read only the selected columns for the selected experiment types, then
# order the rows by index.
feedback_df = pd.read_parquet(
    fname,
    columns=cols,
    filters=[("experiment_type", "in", experiment_types)],
)
feedback_df = feedback_df.sort_index()

In [7]:
# Apply the project's feedback filter, parameterized by the "common" section
# of the analysis configuration.
feedback_df = filter_feedback_dataframe(feedback_df, **common_cfg)
# Renumber the surviving rows 0..n-1 in place (no copy of the frame is made).
feedback_df.index = pd.RangeIndex(len(feedback_df))

# Normalize the loaded data

## Compute the "normalized" dataframe (pretty memory intensive with target_only set to False, ~50-60 GB of RAM required)

In [8]:
# Numeric precision setting. It is not referenced by any other cell visible
# in this notebook -- NOTE(review): confirm whether it is still needed.
precision = 10

In [9]:
# Re-zero the clock within each `uniq_id` group so that "timestamp" measures
# time elapsed since that group's earliest sample.
# The group minimum is computed with the built-in "min" aggregation and
# subtracted in one vectorized step, which avoids invoking a Python lambda
# once per group on this very large frame; the resulting values are the same.
group_start = feedback_df.groupby("uniq_id")["timestamp"].transform("min")
feedback_df["timestamp"] = feedback_df["timestamp"] - group_start

In [10]:
# Lower and upper bounds of the analysis window on the re-zeroed timestamps.
# Presumably seconds, i.e. the first 30 minutes of each session -- TODO confirm units.
first_timestamp, last_timestamp = 0, 30 * 60

In [11]:
# Parameters for the normalization step below.

# Probability smoothing, in the range 0-1 (0.01 is probably the largest value
# you would want to use).
eps = 0
# Highest syllable label in the model. Not referenced by any other cell
# visible in this notebook.
max_syllable = 100
# Keep only the target syllable? False keeps every syllable.
target_only = False
# Run-length-encode the label sequence?
use_rle = True
# Which baseline to normalize against:
#   "a" (absolute) - the first baseline session
#   "m" (monday)   - Monday sessions
#   "w" (week)     - the earliest baseline session in the past week
#   "l" (local)    - the closest baseline session
baseline = "m"
# Label column to use: "predicted_syllable" or "predicted_syllable (offline)".
label_key = "predicted_syllable"
# Bin edges handed to normalize_df: a single bin spanning the analysis window.
time_bins = [first_timestamp, last_timestamp]

In [12]:
# Metadata columns to preserve; forwarded to normalize_df as `meta_keys`.
meta_keys = [
    "sex", "mouse_id", "session_number", "stim_duration", "syllable_group",
    "target_syllable", "opsin", "experiment_type", "area (pooled)", "power",
    "area", "genotype", "uuid", "date", "cohort",
]
# Not used by any other cell visible in this notebook.
group_dfs = []

# Keyword arguments bound onto normalize_df. The outer loop key is
# (cohort, target_syllable); the remaining values come from the parameter
# cell above.
normalize_kwargs = dict(
    label_key=label_key,
    outer_loop_key=["cohort", "target_syllable"],
    time_bins=time_bins,
    eps=eps,
    meta_keys=meta_keys,
    target_only=target_only,
    use_rle=use_rle,
    baseline=baseline,
    adjust_to_bin_size=True,
)
_func = partial(normalize_df, **normalize_kwargs)

# One group per mouse; each group is passed to `_func` by the project's
# joblib helper using 20 workers. This step is memory hungry when
# target_only is False (see the section heading above).
per_mouse = feedback_df.groupby("mouse_id", as_index=False, group_keys=False)
norm_df = apply_parallel_joblib(per_mouse, _func, n_jobs=20, verbose=10)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed:   34.6s
[Parallel(n_jobs=20)]: Done   7 out of  41 | elapsed:   53.0s remaining:  4.3min
[Parallel(n_jobs=20)]: Done  12 out of  41 | elapsed:   54.5s remaining:  2.2min
[Parallel(n_jobs=20)]: Done  17 out of  41 | elapsed:   56.6s remaining:  1.3min
[Parallel(n_jobs=20)]: Done  22 out of  41 | elapsed:   59.3s remaining:   51.2s
[Parallel(n_jobs=20)]: Done  27 out of  41 | elapsed:  1.0min remaining:   32.3s
[Parallel(n_jobs=20)]: Done  32 out of  41 | elapsed:  1.1min remaining:   18.1s
[Parallel(n_jobs=20)]: Done  37 out of  41 | elapsed:  1.1min remaining:    7.3s
[Parallel(n_jobs=20)]: Done  41 out of  41 | elapsed:  1.2min finished


In [13]:
# Register tqdm's progress-reporting methods (e.g. `progress_apply`) on pandas
# objects. No later cell visible in this notebook uses them.
tqdm.pandas()

In [14]:
# Discard whatever index the combined per-mouse results carry and renumber
# the rows 0..n-1.
norm_df = norm_df.reset_index(drop=True)

In [15]:
# Turn infinite values into NaN, then append log2-transformed copies of the
# two fold-change columns.
# NOTE(review): the recorded output of this cell shows NumPy warnings from the
# log2 calls. Because the inf scrub runs before the log, a fold change of 0
# yields -inf (and a negative one NaN) in the new columns -- confirm that
# downstream code expects this.
norm_df = norm_df.replace([np.inf, -np.inf], np.nan).assign(
    log2_fold_change_count=lambda df: np.log2(df["fold_change_count"]),
    log2_fold_change_usage=lambda df: np.log2(df["fold_change_usage"]),
)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [16]:
# Persist the aggregate table as parquet next to the closed-loop input data.
# NOTE(review): the destination is the raw-data directory (`raw_dirs`), while
# `proc_dirs` (intermediate results) is loaded but never used in the visible
# cells -- confirm this is the intended location.
aggregate_path = os.path.join(
    raw_dirs["closed_loop_behavior"],
    "learning_aggregate.parquet",
)
norm_df.to_parquet(aggregate_path)