# Compute syllable counts in bins across optoda sessions

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from functools import partial
from rl_analysis.behavior.util import normalize_df, filter_feedback_dataframe
from rl_analysis.batch import apply_parallel_joblib
from rl_analysis.io.df import get_closed_loop_parquet_columns
import pandas as pd
import os
import numpy as np

In [3]:
import toml

# Read the shared analysis configuration from the TOML file one directory up.
with open("../analysis_configuration.toml") as config_file:
    analysis_config = toml.load(config_file)

In [4]:
# Pull out the configuration sections used in this notebook, in one pass.
raw_dirs, proc_dirs, closed_loop_cfg, common_cfg = (
    analysis_config[_section]
    for _section in (
        "raw_data",
        "intermediate_results",
        "closed_loop_behavior",
        "common",
    )
)

# Load in raw data and normalize

In [5]:
# Locate the raw closed-loop behavior parquet and decide which columns to load.
_behavior_dir = raw_dirs["closed_loop_behavior"]
fname = os.path.join(_behavior_dir, "closed_loop_behavior.parquet")
# NOTE(review): pcs=False / likes=False presumably leave out the PC and
# likelihood columns — confirm in rl_analysis.io.df.
cols = get_closed_loop_parquet_columns(fname, likes=False, pcs=False)

In [6]:
# Load only rows whose experiment_type is one of the reinforcement variants,
# restricted to the selected columns, then order the rows by index.
_experiment_types = ["reinforcement", "reinforcement_photometry"]
feedback_df = pd.read_parquet(
    fname,
    columns=cols,
    filters=[("experiment_type", "in", _experiment_types)],
).sort_index()

In [7]:
# Apply the filtering options from the "common" config section, then renumber
# the rows 0..n-1 so the index is a plain positional index.
feedback_df = filter_feedback_dataframe(feedback_df, **common_cfg).reset_index(
    drop=True
)

## Compute the "normalized" dataframe (pretty memory intensive with target_only set to False, ~50-60 GB of RAM required)

In [8]:
# Re-zero timestamps within each uniq_id group (presumably one session each) so
# that every group starts at 0.
_group_start = feedback_df.groupby("uniq_id")["timestamp"].transform("min")
feedback_df["timestamp"] = feedback_df["timestamp"] - _group_start

In [9]:
# Rounding step used when snapping the session-window bounds to a multiple of
# this value (same units as `timestamp`).
precision = 10

In [10]:
# Data-derived window common to every uniq_id group: the latest per-group start
# and the earliest per-group end, each rounded UP to a multiple of `precision`.
# NOTE(review): rounding the end bound up can extend past the shortest group;
# these values are also reassigned by the hard-coded window in the next cell.
_timestamps_by_session = feedback_df.groupby("uniq_id")["timestamp"]
_latest_start = _timestamps_by_session.min().max()
_earliest_end = _timestamps_by_session.max().min()
first_timestamp = np.ceil(_latest_start / precision) * precision
last_timestamp = np.ceil(_earliest_end / precision) * precision

In [11]:
# Hard-coded session window. These assignments replace the data-derived
# first_timestamp / last_timestamp computed in the previous cell.
# NOTE(review): 1790 is presumably just under 30 minutes if timestamps are in
# seconds — confirm.
first_timestamp = 0
last_timestamp = 1790

In [12]:
# Parameters for the learning-timecourse normalization.
_timecourse_cfg = closed_loop_cfg["learning_timecourse"]

eps = 1  # pseudocount for numerator and denominator; only used for fold changes
session_window = (first_timestamp, last_timestamp)
bin_size = _timecourse_cfg["bin_size"]
bin_overlap = _timecourse_cfg["bin_overlap"]  # bin overlap in seconds
max_syllable = 100  # max syllable in the model
target_only = False  # only keep the target? (False keeps everything)
baseline_smoothing = None
# Baseline choice: (a)bsolute uses the first baseline session, (m)onday uses
# mondays, (w)eek uses the earliest baseline session in the past week, and
# (l)ocal uses the closest baseline.
baseline = _timecourse_cfg["baseline"]
label_key = "predicted_syllable"  # predicted_syllable or predicted_syllable (offline)

# Bin edges across the session window, spaced bin_size apart.
_window_start, _window_stop = session_window
time_bins = np.arange(_window_start, _window_stop + bin_size - 1, bin_size)

In [13]:
# Output path for the binned learning timecourse; the bin size is encoded in
# the file name.
# NOTE(review): this writes into the raw-data directory rather than proc_dirs
# (intermediate results) — confirm that is intended.
save_file = os.path.join(
    raw_dirs["closed_loop_behavior"], f"learning_timecourse_binsize-{bin_size}.parquet"
)
# Every label present in the data, in ascending order. Read from label_key
# (not a hard-coded column name) so the list always matches the column that is
# handed to normalize_df.
syllable_list = sorted(feedback_df[label_key].unique())

In [14]:
# outer loop by cohort, inner loop by target, first check all timecourses, then wed-mon
_func_rle = partial(
    normalize_df,
    label_key=label_key,
    outer_loop_key="syllable_group",
    time_bins=time_bins,
    baseline_smoothing=baseline_smoothing,
    eps=eps,
    syllable_list=syllable_list,
    meta_keys=closed_loop_cfg["learning_timecourse"]["meta_keys"],
    target_only=target_only,
    use_rle=True,
    baseline=baseline,  
)

# outer loop by cohort, inner loop by target, first check all timecourses, then wed-mon
_func_nonrle = partial(
    normalize_df,
    label_key=label_key,
    outer_loop_key="syllable_group",
    time_bins=time_bins,
    baseline_smoothing=baseline_smoothing,
    eps=eps,
    syllable_list=syllable_list,
    meta_keys=closed_loop_cfg["learning_timecourse"]["meta_keys"],
    target_only=target_only,
    use_rle=False,
    baseline=baseline,  
)

In [15]:
# Run both normalization variants over the (cohort, mouse_id) groups in
# parallel with identical joblib settings.
_group_keys = ["cohort", "mouse_id"]
_parallel_opts = dict(n_jobs=20, verbose=10)

group_dfs_rle = apply_parallel_joblib(
    feedback_df.groupby(_group_keys, as_index=False), _func_rle, **_parallel_opts
)
group_dfs_nonrle = apply_parallel_joblib(
    feedback_df.groupby(_group_keys, as_index=False), _func_nonrle, **_parallel_opts
)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed:   27.7s
[Parallel(n_jobs=20)]: Done   6 out of  40 | elapsed:   30.1s remaining:  2.8min
[Parallel(n_jobs=20)]: Done  11 out of  40 | elapsed:   32.7s remaining:  1.4min
[Parallel(n_jobs=20)]: Done  16 out of  40 | elapsed:   36.2s remaining:   54.2s
[Parallel(n_jobs=20)]: Done  21 out of  40 | elapsed:   38.6s remaining:   34.9s
[Parallel(n_jobs=20)]: Done  26 out of  40 | elapsed:   41.5s remaining:   22.4s
[Parallel(n_jobs=20)]: Done  31 out of  40 | elapsed:   44.3s remaining:   12.9s
[Parallel(n_jobs=20)]: Done  36 out of  40 | elapsed:   47.5s remaining:    5.3s
[Parallel(n_jobs=20)]: Done  40 out of  40 | elapsed:   52.9s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed:   22.7s
[Parallel(n_jobs=20)]: Done   6 out of  40 | elapsed:   25.2s remaining:  2.4min
[Pa

In [16]:
# Tag each result with the normalization variant that produced it.
for _frame, _used_rle in ((group_dfs_rle, True), (group_dfs_nonrle, False)):
    _frame["rle"] = _used_rle

In [17]:
# Stack the two variants into one frame with a fresh 0..n-1 index.
group_dfs = pd.concat([group_dfs_rle, group_dfs_nonrle], ignore_index=True)

In [18]:
# Alias only (no copy): norm_df and group_dfs refer to the same object.
norm_df = group_dfs

## Post-process and save the normalized dataframe (no exclusions are loaded in this notebook)

In [19]:
# Work on a copy so the following edits do not modify norm_df.
use_df = norm_df.copy()

In [20]:
# Label sessions by phase from their session number; rows with any other
# session number are left untouched.
_session_phases = {
    "pre": [-1, 0],
    "stim": [1, 2],
    "post": [3, 4],
}
for _phase, _session_numbers in _session_phases.items():
    use_df.loc[use_df["session_number"].isin(_session_numbers), "session_type"] = _phase

In [21]:
# Keep only full-width bins, i.e. rows whose bin spans exactly bin_size.
_bin_width = use_df["bin_end"] - use_df["bin_start"]
use_df = use_df.loc[_bin_width == bin_size].copy()

In [22]:
# Re-code syllable_group as a dense, zero-based rank within each
# (mouse_id, cohort) pair.
_dense_rank = use_df.groupby(["mouse_id", "cohort"])["syllable_group"].rank(
    method="dense"
)
use_df["syllable_group"] = _dense_rank - 1

In [23]:
# Treat +/- infinity as missing values.
_infinities = [np.inf, -np.inf]
use_df = use_df.replace(to_replace=_infinities, value=np.nan)

In [24]:
# log2 of the count fold change; numpy warnings for zero or invalid inputs are
# silenced, so those entries simply come out as -inf / NaN.
_fold_change = use_df["fold_change_count"]
with np.errstate(divide="ignore", invalid="ignore"):
    use_df["log2_fold_change_count"] = np.log2(_fold_change)

In [25]:
# One integer code per unique combination of the key columns, numbered in order
# of first appearance (sort=False). This uses the public groupby API instead of
# the private pd._libs.lib.fast_zip helper, which is not part of pandas'
# supported interface and may change between releases.
# dropna=False keeps rows with a missing key as their own group; observed=True
# only matters if a key column is categorical.
_unique_group_keys = [
    "mouse_id",
    "stim_duration",
    "syllable_group",
    "target_syllable",
    "cohort",
]
codes = (
    use_df.groupby(_unique_group_keys, sort=False, dropna=False, observed=True)
    .ngroup()
    .to_numpy()
)

In [26]:
# Store the per-combination integer codes as a single group identifier column.
use_df["syllable_group_unique"] = codes

In [27]:
# Downcast syllable labels to int8 (range -128..127).
# NOTE(review): assumes labels contain no NaNs and fit in int8 (max_syllable is
# 100) — confirm.
use_df["syllable"] = use_df["syllable"].astype("int8")

In [28]:
# Write the post-processed timecourse to save_file (overwrites an existing file).
use_df.to_parquet(save_file)

In [29]:
# Show where the result was written.
print(save_file)

/home/markowitzmeister_gmail_com/jeff_win_share/reinforcement_data/_final_test/_data/optoda_raw_data/learning_timecourse_binsize-30.parquet


In [30]:
# Sanity check: number of distinct mice per (area, opsin) combination.
_mice_per_group = use_df.groupby(["area", "opsin"])["mouse_id"].nunique()
_mice_per_group

area        opsin   
ctrl        ctrl        12
snc (axon)  chr2        20
            chrimson     8
Name: mouse_id, dtype: int64