### Imports

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.extend(['/home/nkinsky/Documents/GitHub/NeuroPy'])
sys.path.extend(['/home/nkinsky/Documents/GitHubPrivate/pythonprogs/DataPaths/'])

import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import scipy.signal as sg
import matplotlib.pyplot as plt
from neuropy import plotting
from plotters import Plotter
import stats_utils
import subjects
from copy import deepcopy

from neuropy.core.epoch import Epoch

### Bootstrap continuous trajectory replay proportion/number
- For sd_paper Fig. 4, where we compare the proportion of significant continuous-trajectory events, the comparison can be replaced by a bootstrap distribution.
- One way of doing that is to randomly sample events with replacement and calculate how many of the selected events are significant trajectory events. While this initially seems like a reasonable measure, it may backfire for sessions that have a low proportion of significant events.

In [2]:
sessions = subjects.pf_sess()

# Jump-distance threshold selecting which filtered replay set is used.
# Only the two values listed in the assert are handled by this cell.
jump_thresh = 40
assert jump_thresh in [20, 40]
# File-name suffix so results for a non-default threshold do not overwrite
# the default (40) results.
save_append = "" if jump_thresh == 40 else f"_jumpthresh{jump_thresh}"

# One small frame per (session, zt epoch); concatenated after the loop.
cont_df = []
for s, sess in enumerate(sessions):
    # Continuous-trajectory replay events for the chosen jump threshold
    if jump_thresh == 40:
        cont_replay_df = sess.replay_filtered.to_dataframe()
    else:
        cont_replay_df = getattr(sess, f"replay_filtered{jump_thresh}jd").to_dataframe()

    # Candidate PBEs: keep only events that pass all three filters
    all_pbe_df = sess.pbe_filters.to_dataframe()
    good_pbe = (all_pbe_df.is_rpl & all_pbe_df.is_5units & all_pbe_df.is_rest).values
    good_pbe_df = all_pbe_df.loc[good_pbe, ["start", "stop"]].reset_index(drop=True)

    # A candidate counts as "continuous" when its start time matches the
    # start time of a filtered replay event
    is_cont = np.isin(good_pbe_df.start, cont_replay_df.start)

    # Brainstate at each candidate's start time.
    # Pass explicit copies to Epoch: handing it a filtered slice of bs_df is
    # presumably what triggers the SettingWithCopyWarning flood in this
    # cell's output (NOTE(review): confirm against neuropy's Epoch).
    bs_df = sess.brainstates.to_dataframe()
    nrem_epochs = Epoch(bs_df[bs_df.label == "NREM"].copy())
    is_nrem, _, _ = nrem_epochs.contains(good_pbe_df.start)
    wake_epochs = Epoch(bs_df[bs_df.label.isin(["QW", "AW"])].copy())
    is_wake, _, _ = wake_epochs.contains(good_pbe_df.start)

    zt_epochs = sess.get_zt_epochs()

    for e in zt_epochs.itertuples():
        # Candidates lying entirely inside this zt epoch
        indx = (good_pbe_df.start >= e.start) & (good_pbe_df.stop <= e.stop)
        e_df = pd.DataFrame(
            dict(is_cont=is_cont[indx], zt=e.label, session=s, grp=sess.tag, brainstate="")
        )
        # Events in neither NREM nor wake keep the empty-string brainstate
        e_df.loc[is_nrem[indx], "brainstate"] = "NREM"
        e_df.loc[is_wake[indx], "brainstate"] = "WAKE"

        cont_df.append(e_df)

cont_df = pd.concat(cont_df, ignore_index=True)

#Sessions = 13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  epochs.loc[:, "label"] = epochs["label"].astype("str")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  epochs.loc[:, "label"] = epochs["label"].astype("str")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  epochs.loc[:, "label"] = epochs["label"].astype("str")
A value is trying to be set on a copy of

In [3]:
# Inspect the concatenated event-level table (one row per candidate PBE)
cont_df

Unnamed: 0,is_cont,zt,session,grp,brainstate
0,False,PRE,0,NSD,WAKE
1,False,PRE,0,NSD,WAKE
2,False,PRE,0,NSD,WAKE
3,False,PRE,0,NSD,WAKE
4,False,PRE,0,NSD,WAKE
...,...,...,...,...,...
136784,False,5-7.5,12,SD,NREM
136785,True,5-7.5,12,SD,NREM
136786,False,5-7.5,12,SD,NREM
136787,False,5-7.5,12,SD,NREM


### Save proportions by brainstate

In [None]:
# Restrict to events labelled NREM or WAKE (drops noise and REM), then take
# the fraction of continuous events per group / brainstate / session / zt.
in_nrem_or_wake = cont_df.brainstate.isin(["NREM", "WAKE"])
cont_df_bs = (
    cont_df[in_nrem_or_wake]
    .groupby(["grp", "brainstate", "session", "zt"])
    .mean()
    .reset_index()
    .rename(columns={"is_cont": "prop"})
)

subjects.GroupData().save(cont_df_bs, f"replay_continuous_events_brainstate{save_append}")
cont_df_bs

### Save numbers by brainstate

In [4]:
### KEY FLAG - True: split event counts by brainstate; False: pool across brainstates
by_brainstate = True

# File-name suffix that marks brainstate-split outputs
if by_brainstate:
    bs_append = "_bs"
else:
    bs_append = ""

In [6]:
### Continuous replay number
# is_cont is boolean, so summing it counts the continuous events per group.
if by_brainstate:
    group_keys = ["brainstate", "grp", "session", "zt"]
    events_df = cont_df
else:
    # Original analysis without brainstate
    group_keys = ["grp", "session", "zt"]
    events_df = cont_df.drop(columns=["brainstate"])

number_df = events_df.groupby(group_keys, sort=False).sum().reset_index()

subjects.GroupData().save(number_df, f"continuous_replay_number{save_append}{bs_append}")

continuous_replay_number_bs saved


In [5]:
### Total number of candidate events
# Counting rows of is_cont gives the number of candidate events per group,
# regardless of whether each event is continuous.
if by_brainstate:
    group_keys = ["brainstate", "grp", "session", "zt"]
    events_df = cont_df
else:
    # Original analysis without brainstate
    group_keys = ["grp", "session", "zt"]
    events_df = cont_df.drop(columns=["brainstate"])

number_df = events_df.groupby(group_keys, sort=False).count().reset_index()

subjects.GroupData().save(number_df, f"candidate_replay_number{save_append}{bs_append}")

candidate_replay_number_bs saved


In [None]:
# ---- Bootstrap the absolute number of continuous replay events ----
# Use this statistic: the groups have different numbers of sessions, so we
# want the mean number of events per session rather than a pooled total.
def func(df):
    """Sum is_cont per (grp, zt, session), then average over sessions per (grp, zt)."""
    per_session = (
        df.drop(columns=["brainstate"])
        .groupby(["grp", "zt", "session"], sort=False)
        .sum()
    )
    return per_session.groupby(["grp", "zt"], sort=False).mean().reset_index()


# Don't use the pooled alternative below, for the reason given above:
# func = lambda df: df.drop(columns=["brainstate"]).groupby(["grp", "zt"], sort=False).sum().reset_index()

boot_df = stats_utils.bootstrap_resample(
    cont_df, level="both", n_iter=10000, n_jobs=8, apply=func
)
subjects.GroupData().save(boot_df, f"continuous_replay_number_bootstrap{save_append}")

# ---- Bootstrap of the proportion (disabled) ----
## Proportion of events pooled across sampled sessions, not the mean of proportions
# func = lambda df: df.drop(columns=["brainstate"]).groupby(["grp", "zt"], sort=False).mean().reset_index()
# boot_df = stats_utils.bootstrap_resample(
#     cont_df, level="both", n_iter=10000, n_jobs=8, apply=func
# ).drop(columns=["session"])
# subjects.GroupData().save(boot_df, f"continuous_replay_proportion_bootstrap{save_append}")

# ---- Bootstrap of the candidate numbers (disabled) ----
# if by_brainstate:
#     func = lambda df: df.drop(columns=["brainstate"]).groupby(["grp", "zt"], sort=False).mean().reset_index()
# else:
#     func = lambda df: df.groupby(["grp", "zt"], sort=False).mean().reset_index()
# boot_df = stats_utils.bootstrap_resample(
#     number_df, level="both", n_iter=10000, n_jobs=8, apply=func
# ).drop(columns=["session"])
# subjects.GroupData().save(boot_df, f"candidate_replay_number_bootstrap{save_append}")

### Bootstrap by brain state

In [7]:
# ---- Bootstrap the absolute number of continuous replay events, per brainstate ----
# Use this statistic: the groups have different numbers of sessions, so we
# want the mean number of events per session rather than a pooled total.
# The function does not depend on the brainstate, so define it once.
def func(df):
    """Sum is_cont per (grp, zt, session), then average over sessions per (grp, zt)."""
    per_session = (
        df.drop(columns=["brainstate"])
        .groupby(["grp", "zt", "session"], sort=False)
        .sum()
    )
    return per_session.groupby(["grp", "zt"], sort=False).mean().reset_index()


## Don't use the pooled alternative below, for the reason given above:
## func = lambda df: df.drop(columns=["brainstate"]).groupby(["grp", "zt"], sort=False).sum().reset_index()

for state_name in ["WAKE", "NREM"]:
    state_cont_df = cont_df[cont_df.brainstate == state_name]

    boot_df = stats_utils.bootstrap_resample(
        state_cont_df, level="both", n_iter=10000, n_jobs=8, apply=func
    )
    # Include save_append, like the other saved outputs, so a run with a
    # non-default jump_thresh does not overwrite the default results.
    subjects.GroupData().save(
        boot_df, f"continuous_replay_{state_name}_number_bootstrap{save_append}"
    )

    # ---- Bootstrap of the proportion (disabled) ----
    # ## Proportion of events pooled across sampled sessions, not the mean of proportions
    # prop_func = lambda df: df.drop(columns=["brainstate"]).groupby(["grp", "zt"], sort=False).mean().reset_index()
    # boot_df = stats_utils.bootstrap_resample(
    #     state_cont_df, level="both", n_iter=10000, n_jobs=8, apply=prop_func
    # ).drop(columns=["session"])
    # subjects.GroupData().save(boot_df, f"continuous_replay_{state_name}_proportion_bootstrap")

    # ---- Bootstrap of the candidate numbers (disabled) ----
    # Needs number_df from the candidate-number cell run with by_brainstate=True.
    # state_number_df = number_df[number_df.brainstate == state_name]
    # cand_func = lambda df: df.drop(columns=["brainstate"]).groupby(["grp", "zt"], sort=False).mean().reset_index()
    # boot_df = stats_utils.bootstrap_resample(
    #     state_number_df, level="both", n_iter=10000, n_jobs=8, apply=cand_func
    # ).drop(columns=["session"])
    # subjects.GroupData().save(boot_df, f"candidate_replay_{state_name}_number_bootstrap")


Running bootstraps for NSD group


100%|████████████████████████████████████| 10000/10000 [00:55<00:00, 180.07it/s]


Running bootstraps for SD group


100%|████████████████████████████████████| 10000/10000 [01:21<00:00, 123.10it/s]


continuous_replay_WAKE_number_bootstrap saved
Running bootstraps for NSD group


100%|████████████████████████████████████| 10000/10000 [00:52<00:00, 189.10it/s]


Running bootstraps for SD group


100%|████████████████████████████████████| 10000/10000 [00:33<00:00, 297.78it/s]


continuous_replay_NREM_number_bootstrap saved


In [None]:
# Absolute number of continuous replay events: per-session values in grid
# cell [0, 0] and the saved bootstrap distribution in grid cell [0, 2].

fig = plotting.Fig(5, 5)

ax = fig.subplot(fig.gs[0, 0])
# Drop the string brainstate column before summing, as the number cell does,
# so only is_cont is aggregated. errors="ignore" keeps this cell usable when
# cont_df was built without a brainstate column.
df = (
    cont_df.drop(columns=["brainstate"], errors="ignore")
    .groupby(["grp", "session", "zt"], sort=False)
    .sum()
    .reset_index()
)
p1 = Plotter(data=df, x="zt", y="is_cont", hue="grp", hue_order=["NSD", "SD"], ax=ax)
p1.stripbarplot_sd().stat_anot_sd(
    stat_across="t-test_welch",
    alpha_across=0.05,
    stat_within="t-test_paired",
    alpha_within=0.05,
    fontsize=5,
)

ax = fig.subplot(fig.gs[0, 2])
# Load the bootstrap from disk rather than relying on a boot_df variable
# left over from another cell.
boot_number_df = subjects.GroupData().continuous_replay_number_bootstrap
p2 = Plotter(
    data=boot_number_df, x="zt", y="is_cont", hue="grp", hue_order=["NSD", "SD"], ax=ax
)
p2.violinplot_sd(palette=subjects.colors_sd()).stat_anot_sd(
    stat_across=stats_utils.get_bootstrap_prob,
    alpha_across=0.025,
    stat_within=stats_utils.get_bootstrap_prob_paired,
    alpha_within=0.025,
    fontsize=5,
)

# Disabled: per-session proportion panel
# ax = fig.subplot(fig.gs[3, 0])
# df = cont_df.groupby(["grp", "session", "zt"], sort=False).mean().reset_index()
# p2 = Plotter(data=df, x="zt", y="is_cont", hue="grp", hue_order=["NSD", "SD"], ax=ax)
# p2.stripbarplot_sd()

fig.savefig(subjects.figpath_sd / "trajectory_replay_number_and_bootstrap")

### Bootstrap the continuous trajectory PBE duration
- Calculate the bootstrapped mean duration of PBEs that have a continuous trajectory

In [None]:
# Recompute the file-name suffix when jump_thresh is available. Otherwise
# fall back to the default (no suffix) unless save_append already exists, so
# this cell also runs without the cell that sets jump_thresh.
if "jump_thresh" in locals():
    save_append = "" if jump_thresh == 40 else f"_jumpthresh{jump_thresh}"
elif "save_append" not in locals():
    save_append = ""

pbe_duration = getattr(subjects.GroupData(), f"continuous_replay_PBE_duration{save_append}")


# ---- Bootstrap the mean PBE duration ----
def func(df):
    """Average every remaining column within each (grp, zt) group."""
    return df.groupby(["grp", "zt"], sort=False).mean().reset_index()


boot_df = stats_utils.bootstrap_resample(
    pbe_duration, level="both", n_iter=10000, n_jobs=8, apply=func
).drop(columns="session")

subjects.GroupData().save(boot_df, f"continuous_replay_PBE_duration_bootstrap{save_append}")

In [11]:
# PBE duration bootstrap, by brainstate.
# Recompute the file-name suffix when jump_thresh is available. Otherwise
# fall back to the default (no suffix) unless save_append already exists, so
# this cell also runs without the cell that sets jump_thresh.
if "jump_thresh" in locals():
    save_append = "" if jump_thresh == 40 else f"_jumpthresh{jump_thresh}"
elif "save_append" not in locals():
    save_append = ""

pbe_duration = getattr(subjects.GroupData(), f"continuous_replay_PBE_duration{save_append}_bs")


# ---- Bootstrap the mean PBE duration ----
# The function does not depend on the brainstate, so define it once.
def func(df):
    """Drop brainstate, then average every remaining column per (grp, zt)."""
    return (
        df.drop(columns=["brainstate"])
        .groupby(["grp", "zt"], sort=False)
        .mean()
        .reset_index()
    )


for state_name in ["WAKE", "NREM"]:
    state_dur_df = pbe_duration[pbe_duration.brainstate == state_name]

    boot_df = stats_utils.bootstrap_resample(
        state_dur_df, level="both", n_iter=10000, n_jobs=8, apply=func
    ).drop(columns="session")

    subjects.GroupData().save(boot_df, f"continuous_replay_{state_name}_PBE_duration_bootstrap{save_append}")

Running bootstraps for NSD group


100%|████████████████████████████████████| 10000/10000 [00:36<00:00, 272.43it/s]


Running bootstraps for SD group


100%|████████████████████████████████████| 10000/10000 [00:43<00:00, 228.60it/s]


continuous_replay_WAKE_PBE_duration_bootstrap saved
Running bootstraps for NSD group


100%|████████████████████████████████████| 10000/10000 [00:31<00:00, 315.37it/s]


Running bootstraps for SD group


100%|████████████████████████████████████| 10000/10000 [00:23<00:00, 423.62it/s]


continuous_replay_NREM_PBE_duration_bootstrap saved


In [None]:
pbe_duration_1h = subjects.GroupData().continuous_replay_PBE_duration_1h_blocks


# ---- Bootstrap the mean PBE duration (1h blocks) ----
def func(df):
    """Average every remaining column within each (grp, zt) group."""
    return df.groupby(["grp", "zt"], sort=False).mean().reset_index()


boot_df = stats_utils.bootstrap_resample(
    pbe_duration_1h, level="both", n_iter=10000, n_jobs=8, apply=func
)
# The session column is not needed in the saved bootstrap table
boot_df = boot_df.drop(columns="session")

subjects.GroupData().save(boot_df, "continuous_replay_PBE_duration_1h_blocks_bootstrap")

In [None]:
# Violin plot of the bootstrapped PBE duration, NSD vs SD, per zt epoch.
# NOTE(review): boot_df is whatever the most recently run bootstrap cell
# produced. Confirm it holds the intended PBE-duration bootstrap before
# saving this figure.
fig = plotting.Fig(5, 5)
ax = fig.subplot(fig.gs[0])

p1 = Plotter(
    data=boot_df,
    x="zt",
    y="pbe_duration",
    hue="grp",
    hue_order=["NSD", "SD"],
    ax=ax,
)
violins = p1.violinplot_sd(palette=subjects.colors_sd())
violins.stat_anot_sd(
    stat_across=stats_utils.get_bootstrap_prob,
    alpha_across=0.025,
    stat_within=stats_utils.get_bootstrap_prob_paired,
    alpha_within=0.025,
)

fig.savefig(subjects.figpath_sd / "trajectory_replay_PBE_duration_bootstrap")

### (1h block) Bootstrap continuous trajectory replay proportion/number
- For sd_paper Fig. 4, where we compare the proportion of significant continuous-trajectory events, the comparison can be replaced by a bootstrap distribution.
- One way of doing that is to randomly sample events with replacement and calculate how many of the selected events are significant trajectory events. While this initially seems like a reasonable measure, it may backfire for sessions that have a low proportion of significant events.

In [None]:
sessions = subjects.pf_sess()

# One small frame per (session, zt block from get_zt_1h); concatenated after the loop
cont_df = []
for s, sess in enumerate(sessions):
    cont_replay_df = sess.replay_filtered.to_dataframe()

    # Candidate PBEs: keep only events that pass all three filters
    all_pbe_df = sess.pbe_filters.to_dataframe()
    good_pbe = (all_pbe_df.is_rpl & all_pbe_df.is_5units & all_pbe_df.is_rest).values
    good_pbe_df = all_pbe_df.loc[good_pbe, ["start", "stop"]].reset_index(drop=True)

    # A candidate counts as "continuous" when its start time matches the
    # start time of a filtered replay event
    is_cont = np.isin(good_pbe_df.start, cont_replay_df.start)

    zt_epochs = sess.get_zt_1h()

    for block in zt_epochs.itertuples():
        # Candidates lying entirely inside this block
        in_block = (good_pbe_df.start >= block.start) & (good_pbe_df.stop <= block.stop)
        cont_df.append(
            pd.DataFrame(
                {
                    "is_cont": is_cont[in_block],
                    "zt": block.label,
                    "session": s,
                    "grp": sess.tag,
                }
            )
        )

cont_df = pd.concat(cont_df, ignore_index=True)

In [None]:
# Number of continuous events per group / session / zt block
# (is_cont is boolean, so the sum counts the True rows)
number_df = (
    cont_df.groupby(["grp", "session", "zt"], sort=False)
    .sum()
    .reset_index()
)

subjects.GroupData().save(number_df, "continuous_replay_number_1h_blocks")

In [None]:
# NRK sanity check / unit-test code: one hand-rolled iteration of the
# two-level resampling (sessions, then events within each session) used for
# the bootstrapping below.

# Inputs/parameters
df = cont_df.copy()  # deep copy, so cont_df itself is never modified
level = "both"
apply = lambda df: df.groupby(["grp", "zt"], sort=False).mean().reset_index()

# Code copied from stats_utils.bootstrap_resample
sess_ids = df["session"].unique()
n_sess = len(sess_ids)
# print(sess_ids)
if level in {"session", "both"}:
    # First level: resample session ids with replacement
    rng = np.random.default_rng()
    sess_ids = rng.choice(sess_ids, size=n_sess, replace=True)
    # sess_ids = [12]*13  # uncomment to test what happens if you only grab one type of sessions

# Report how many of the (re)sampled sessions belong to the NSD group.
# n_NSD was previously never defined, so the print below raised NameError.
grp_by_session = df.groupby("session")["grp"].first()
n_NSD = int((grp_by_session.loc[sess_ids] == "NSD").sum())
print(f"#NSD sessions = {n_NSD}")

new_df = []
for i, idx in enumerate(sess_ids):
    idx_df = df[df.session == idx].copy()  # rows belonging to that session
    idx_df.loc[:, "session"] = i  # relabel so repeated draws stay independent

    if level in {"both", "samples"}:
        # Second level: resample events with replacement, separately within
        # each zt group so every group keeps its original number of events
        if "zt" in idx_df.columns:
            idx_df = (
                idx_df.groupby(["zt"], sort=False)
                .apply(pd.DataFrame.sample, frac=1, replace=True, ignore_index=True)
                .reset_index(drop=True)
            )
        else:
            idx_df = idx_df.sample(frac=1, replace=True, ignore_index=True)

    new_df.append(idx_df)
new_df = pd.concat(new_df, ignore_index=True)

if apply is not None:
    assert callable(apply), "apply can only be a function"
    new_df = apply(new_df)
new_df

In [None]:
# ---- Bootstrap the proportion of continuous events (1h blocks) ----
## Proportion of events pooled across sampled sessions, not the mean of proportions
def func(df):
    """Average every remaining column within each (grp, zt) group."""
    return df.groupby(["grp", "zt"], sort=False).mean().reset_index()


boot_df = stats_utils.bootstrap_resample(
    cont_df, level="both", n_iter=10000, n_jobs=8, apply=func
)
# The session column is not needed in the saved bootstrap table
boot_df = boot_df.drop(columns=["session"])
subjects.GroupData().save(boot_df, "continuous_replay_proportion_1h_blocks_bootstrap")

In [None]:
# ---- Bootstrap the absolute number of continuous events (1h blocks) ----
# NOTE(review): this resamples number_df, which holds one summed row per
# (grp, session, zt), whereas the earlier non-1h number bootstrap resamples
# the event-level cont_df. Confirm which input is intended.
def func(df):
    """Sum per (grp, zt, session), then average over sessions per (grp, zt)."""
    per_session = df.groupby(["grp", "zt", "session"], sort=False).sum()
    return per_session.groupby(["grp", "zt"], sort=False).mean().reset_index()


# Pooled alternative (disabled):
# func = lambda df: df.groupby(["grp", "zt"], sort=False).sum().reset_index()

boot_df = stats_utils.bootstrap_resample(
    number_df, level="both", n_iter=10000, n_jobs=8, apply=func
)
subjects.GroupData().save(boot_df, "continuous_replay_number_1h_blocks_bootstrap")


### Absolute number using only the last hour of the PRE block to match POST block lengths

In [None]:
sessions = subjects.pf_sess()

# One small frame per (session, zt block); concatenated after the loop.
# The blocks come from get_zt_1h(pre_length=1).
cont_df = []
for s, sess in enumerate(sessions):
    cont_replay_df = sess.replay_filtered.to_dataframe()

    # Candidate PBEs: keep only events that pass all three filters
    all_pbe_df = sess.pbe_filters.to_dataframe()
    good_pbe = (all_pbe_df.is_rpl & all_pbe_df.is_5units & all_pbe_df.is_rest).values
    good_pbe_df = all_pbe_df.loc[good_pbe, ["start", "stop"]].reset_index(drop=True)

    # A candidate counts as "continuous" when its start time matches the
    # start time of a filtered replay event
    is_cont = np.isin(good_pbe_df.start, cont_replay_df.start)

    zt_epochs = sess.get_zt_1h(pre_length=1)

    for block in zt_epochs.itertuples():
        # Candidates lying entirely inside this block
        in_block = (good_pbe_df.start >= block.start) & (good_pbe_df.stop <= block.stop)
        cont_df.append(
            pd.DataFrame(
                {
                    "is_cont": is_cont[in_block],
                    "zt": block.label,
                    "session": s,
                    "grp": sess.tag,
                }
            )
        )

cont_df = pd.concat(cont_df, ignore_index=True)

In [None]:
# Number of continuous events per group / session / zt block
# (is_cont is boolean, so the sum counts the True rows)
number_df = (
    cont_df.groupby(["grp", "session", "zt"], sort=False)
    .sum()
    .reset_index()
)

subjects.GroupData().save(number_df, "continuous_replay_number_1h_blocks_1hpre")

In [None]:
# ---- Bootstrap the absolute number of continuous events (1h blocks, 1h PRE) ----
# NOTE(review): this resamples number_df, which holds one summed row per
# (grp, session, zt), whereas the earlier non-1h number bootstrap resamples
# the event-level cont_df. Confirm which input is intended.
def func(df):
    """Sum per (grp, zt, session), then average over sessions per (grp, zt)."""
    per_session = df.groupby(["grp", "zt", "session"], sort=False).sum()
    return per_session.groupby(["grp", "zt"], sort=False).mean().reset_index()


boot_df = stats_utils.bootstrap_resample(
    number_df, level="both", n_iter=10000, n_jobs=8, apply=func
)
subjects.GroupData().save(boot_df, "continuous_replay_number_1h_blocks_1hpre_bootstrap")