# Gather all the data

- depth files
- tdt files

In [1]:
import os
import h5py
import stat
import json
import shutil
import datetime
import numpy as np
import pandas as pd
from pathlib import Path
from toolz import compose, curry
from tqdm.auto import tqdm

In [2]:
# Root folder holding one sub-folder per behavior session.
behavior_folder = Path('/n/groups/datta/win/longtogeny/dlight')
# Folder the raw photometry (tdt) recordings are read from.
photometry_folder = Path('/n/groups/datta/win/longtogeny/dlight/photometry')

In [3]:
# Every depth recording (any extension) found anywhere below the behavior folder.
depth_files = sorted(behavior_folder.rglob('depth.*'))

## Extract data

In [3]:
def not_extracted(file):
    """Return True when the depth file still needs to be extracted.

    A file counts as extracted only if ``proc/results_00.h5`` next to it
    exists and can be opened, and ``proc/results_00.yaml`` reports a truthy
    ``complete`` entry.

    Args:
        file: Path to a depth recording.

    Returns:
        False for partially transferred files (name ends with ``filepart``)
        and for completed extractions; True otherwise.
    """
    # Partial uploads must not be extracted yet.
    if file.name.endswith('filepart'):
        return False

    proc_dir = file.parent / "proc"
    results_h5 = proc_dir / "results_00.h5"
    if not results_h5.exists():
        return True

    # An HDF5 file that cannot be opened/listed is treated as a failed
    # extraction. Deliberately broad: any read failure means "redo it".
    try:
        with h5py.File(results_h5, "r") as h5f:
            list(h5f)
    except Exception as e:
        print(e)
        return True

    # PyYAML was used here without ever being imported; import it lazily so
    # it is only required once an extraction result actually exists.
    import yaml

    try:
        with open(proc_dir / "results_00.yaml", "r") as conf_f:
            # An empty yaml file loads as None.
            config = yaml.safe_load(conf_f) or {}
    except FileNotFoundError:
        # Results without their status file: treat as not extracted.
        return True
    # TODO: make sure extraction is newer than 5/30/2023
    return not config.get("complete", False)


def no_depth_doubles(file):
    """Return False only for an avi file that has a ``.dat`` sibling.

    Any file whose name does not end with ``avi`` is kept.
    """
    if not file.name.endswith("avi"):
        return True
    dat_copy = file.with_suffix(".dat")
    return not dat_copy.exists()


def multi_filter(*filters, seq):
    """Lazily keep the items of ``seq`` that pass every predicate.

    Predicates are nested so that the last one given is applied to ``seq``
    first and the first one given is applied last. With no predicates,
    ``seq`` itself is returned.
    """
    filtered = seq
    for predicate in reversed(filters):
        filtered = filter(predicate, filtered)
    return filtered

In [4]:
# Login name of the current user (KeyError if USER is unset); used below to
# build the per-user scratch path for the SLURM log files.
user = os.environ['USER']

In [10]:
# Template of the SLURM batch script that runs `moseq2-extract extract` on one
# depth file (1 core, 10G memory, 40 minutes on the `short` partition).
# Filled in with str.format: {user_pth} is the per-user scratch sub-path for
# the job log, {file_path} is the depth file to extract.
# NOTE: the name `script` is rebound further down for the alignment jobs.
script = '''#!/bin/env bash
#SBATCH -c 1
#SBATCH -n 1
#SBATCH --mem=10G
#SBATCH -p short
#SBATCH -t 00:40:00
#SBATCH --output=/n/scratch3/users/{user_pth}/tmp/ontogeny/dlight-depth-extraction-%j.out

source $HOME/.bashrc
conda activate moseq2-app
moseq2-extract extract "{file_path}" --config-file "/n/groups/datta/win/longtogeny/data/extractions/config.yaml"
'''

In [16]:
files = multi_filter(not_extracted, no_depth_doubles, seq=depth_files)

for f in tqdm(files):
    # skip avi files that have a dat copy - meaning just use dat copy for extractions
    print('extracting:', f)
    new_script = script.format(user_pth=f"{user[0]}/{user}", file_path=str(f.absolute()))
    with open("tmp.sh", "w") as f:
        f.write(new_script)

    !sbatch tmp.sh
!rm tmp.sh

0it [00:00, ?it/s]

extracting: /n/groups/datta/win/longtogeny/dlight/session_20230807113048-862691 (datta-realtime1)/depth.dat
Submitted batch job 18136162
extracting: /n/groups/datta/win/longtogeny/dlight/session_20230807121131-654885 (datta-realtime1)/depth.dat
Submitted batch job 18136163
extracting: /n/groups/datta/win/longtogeny/dlight/session_20230807125505-062630 (datta-realtime1)/depth.dat
Submitted batch job 18136164
extracting: /n/groups/datta/win/longtogeny/dlight/session_20230807133323-254885 (datta-realtime1)/depth.dat
Submitted batch job 18136165
extracting: /n/groups/datta/win/longtogeny/dlight/session_20230807143853-524963 (datta-realtime1)/depth.dat
Submitted batch job 18136166
extracting: /n/groups/datta/win/longtogeny/dlight/session_20230807152155-314778 (datta-realtime1)/depth.dat
Submitted batch job 18136167
extracting: /n/groups/datta/win/longtogeny/dlight/session_20230807160308-202771 (datta-realtime1)/depth.dat
Submitted batch job 18136168
extracting: /n/groups/datta/win/longtogen

## Copy photometry files to behavior folder

In [5]:
def parse_session(filepath):
    """Parse the start time encoded in a session folder name.

    Only the part of the name before the first space is used; it must look
    like ``session_YYYYmmddHHMMSS-<microseconds>``.
    """
    stamp = filepath.name.partition(" ")[0]
    return datetime.datetime.strptime(stamp, "session_%Y%m%d%H%M%S-%f")


def parse_photometry(filepath):
    """Parse the start time encoded in a photometry file name.

    The text before the first ``.dat`` must look like
    ``tdt_data_YYYYmmddHHMMSS``.
    """
    stamp = filepath.name.partition(".dat")[0]
    return datetime.datetime.strptime(stamp, "tdt_data_%Y%m%d%H%M%S")

In [6]:
def is_appropriate_size(file: Path):
    """Return True when ``file`` is strictly larger than 50 megabytes."""
    min_bytes = 50 * 1024 * 1024  # 50 megabytes expressed in bytes
    return file.stat().st_size > min_bytes

In [7]:
def check_animal(beh_folder: Path, tdt_file: Path):
    """Check that a photometry file and a behavior session name the same subject.

    The subject name is read from the json next to ``tdt_file``
    (``metadata.subject_name``) and from ``metadata.json`` inside
    ``beh_folder`` (``SubjectName``). Only the first and last dash-separated
    parts of each name are compared; a mismatch is printed.

    Returns:
        True when both parts agree, False otherwise.
    """
    def _read_json(path):
        with open(path, "r") as fh:
            return json.load(fh)

    tdt_data = _read_json(tdt_file.with_suffix(".json"))
    beh_data = _read_json(beh_folder / "metadata.json")

    tdt_name = tdt_data["metadata"]["subject_name"]
    beh_name = beh_data["SubjectName"]

    tdt_parts = tdt_name.split("-")
    beh_parts = beh_name.split("-")
    same_animal = (tdt_parts[0], tdt_parts[-1]) == (beh_parts[0], beh_parts[-1])
    if not same_animal:
        print("tdt", tdt_name, "beh", beh_name)
    return same_animal

In [11]:
# Photometry recordings in the photometry folder that pass the size filter.
tdt_files = sorted(
    tdt for tdt in photometry_folder.glob("tdt*.dat") if is_appropriate_size(tdt)
)

In [12]:
# Number of photometry recordings that passed the size filter.
len(tdt_files)

169

In [17]:
# Start time of every photometry recording, in tdt_files order.
tdt_times = np.array(list(map(parse_photometry, tdt_files)))
# Start time of every behavior session (parsed from the folder holding each
# depth file), in depth_files order.
beh_times = np.array([parse_session(depth_file.parent) for depth_file in depth_files])

In [19]:
unmatched_tdt = []
matched_tdt = []

for idx, session_time in enumerate(tqdm(beh_times)):
    # Session start minus the start of every photometry recording.
    delta = session_time - tdt_times
    # timedelta.days == 0 holds only for 0 <= delta < 1 day, so a session is
    # skipped unless some recording started within the 24 h before it.
    if all(d.days != 0 for d in delta):
        continue
    # Recording closest in time to the session (before or after it).
    nearest = np.argmin(np.abs(delta))
    tdt_file = tdt_files[nearest]
    json_file = tdt_file.with_suffix(".json")
    if not json_file.exists():
        print('error: no json file')
        continue
    session_dir = depth_files[idx].parent
    is_same_animal = check_animal(session_dir, tdt_file)
    new_tdt = session_dir / tdt_file.name
    if not is_same_animal:
        unmatched_tdt.append(tdt_file)
        continue
    # Copy the recording and its json into the session folder unless a copy
    # is already there.
    if not new_tdt.exists():
        shutil.copy(tdt_file, new_tdt)
        shutil.copy(json_file, new_tdt.with_suffix(".json"))
    matched_tdt.append(tdt_file)

  0%|          | 0/174 [00:00<?, ?it/s]

tdt 98w-3 beh 98w-2
tdt 52w-4 beh 12w-3
tdt 98w-3 beh 98w-4
tdt 98w-5 beh 52w-5
tdt 98w-5 beh 52w-5


## Align TDT recordings to behavior

In [8]:
def folder_filter(path: Path, out_file: Path):
    """Decide whether a session folder should be processed.

    Returns True only when ``out_file`` does not exist yet, the folder holds
    an ``ir.dat`` or ``ir.avi`` file, and it holds at least one ``tdt*.dat``
    file.
    """
    if out_file.exists():
        return False

    has_movie = (path / "ir.dat").exists() or (path / "ir.avi").exists()
    if not has_movie:
        return False

    return any(path.glob("tdt*.dat"))

In [20]:
# Keyword arguments forwarded (via the job script) as `led_sync_kws` to
# align_photometry_to_behavior_v2.
led_sync_kws = {
    "median_winsize": 6,
    "threshold_scale": 1.1,
    "mode_winsize": 9,
}

In [21]:
# Template of a Python script submitted through sbatch (2 cores, 25G memory,
# 45 minutes on the `short` partition) that aligns photometry to behavior for
# one session and writes two parquet files.
# Filled in with str.format. The positional fields are, in order: the job log
# path, the session folder, the photometry parquet path, the alignment parquet
# path. The named field {led_sync_kws} receives the repr of the kwargs dict.
# NOTE: this rebinds `script`, replacing the extraction template defined earlier.
script = '''#!/bin/env python
#SBATCH -c 2
#SBATCH -n 1
#SBATCH --mem=25G
#SBATCH -p short
#SBATCH -t 0:45:00
#SBATCH --mail-type=END
#SBATCH --mail-user=wgillis@g.harvard.edu
#SBATCH --output="{}"

import pandas as pd
from rl_analysis.rl_photometry.align import align_photometry_to_behavior_v2

df, alignment_pred = align_photometry_to_behavior_v2("{}", led_sync_kws={led_sync_kws})

df['start_time'] = pd.to_datetime(df['start_time'])
df.to_parquet("{}")

alignment_pred.to_parquet("{}")
'''

In [30]:
for session in map(lambda f: f.parent, depth_files):
    out_file = session / "photometry_df_v2.parquet"
    alignment_file = session / "alignment_df.parquet"
    if folder_filter(session, out_file):
        new_script = script.format(
            session / "photometry_alignment_v2_%j.out",
            session,
            out_file,
            alignment_file,
            led_sync_kws=led_sync_kws,
        )

        with open("tmp.py", "w") as f:
            f.write(new_script)

        st = os.stat("tmp.py")
        os.chmod("tmp.py", st.st_mode | stat.S_IEXEC)

        !sbatch tmp.py
!rm tmp.py

Submitted batch job 18225426
Submitted batch job 18225428
Submitted batch job 18225429
Submitted batch job 18225430
Submitted batch job 18225433
Submitted batch job 18225434
Submitted batch job 18225435
Submitted batch job 18225438
Submitted batch job 18225439
Submitted batch job 18225620
Submitted batch job 18225622
Submitted batch job 18225623
Submitted batch job 18225624
Submitted batch job 18225626
Submitted batch job 18225628
Submitted batch job 18225629
Submitted batch job 18225631
Submitted batch job 18225633
Submitted batch job 18225634
Submitted batch job 18225636
Submitted batch job 18225638
Submitted batch job 18225639
Submitted batch job 18225642
Submitted batch job 18225643
Submitted batch job 18225644
Submitted batch job 18225647
Submitted batch job 18225648
Submitted batch job 18225649
Submitted batch job 18225651
Submitted batch job 18225653
Submitted batch job 18225654
Submitted batch job 18225656
Submitted batch job 18225658
Submitted batch job 18225659
Submitted batc

## Merge photometry and behavior

In [9]:
import matplotlib.pyplot as plt
from scipy import signal
from toolz import partial
from aging.dlight.normalize import rolling_fluor_normalization, rereference

In [10]:
# Version of the ontogeny data release whose PCA scores and syllables are used.
version = 11
# PCA scores for that version; the syllable labels sit in the same folder.
pc_scores_file = Path(
    f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}/all_data_pca/pca_scores.h5'
)
syllables_file = pc_scores_file.parent / 'syllables.h5'

In [11]:
# Every extraction result file found anywhere below the behavior folder.
h5_files = sorted(behavior_folder.rglob('results_00.h5'))

In [12]:
# Two pre-configured variants of rolling_fluor_normalization (imported from
# aging.dlight.normalize):
# - norm_func is applied below to the rereferenced dLight trace to produce the
#   "dlight_reref_zscore" column.
# - dff_func (normalizer="dff") is applied below to the raw dLight and UV
#   traces to produce the "*_dff" columns.
# NOTE(review): the units of window_size are not visible here - confirm
# against the helper.
norm_func = partial(rolling_fluor_normalization, window_size=10, quantile=0.5)
dff_func = partial(rolling_fluor_normalization, window_size=5, quantile=0.1, normalizer="dff")


def filter_signal(v, order=2, cutoff=3, fs=30):
    """Low-pass filter ``v`` with a zero-phase Butterworth filter.

    The defaults reproduce the previously hard-coded filter (2nd order,
    3 Hz cutoff, 30 Hz sampling rate).

    Args:
        v: 1-D array-like signal.
        order: Order of the Butterworth filter.
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``v``.

    Returns:
        The filtered signal as returned by ``scipy.signal.sosfiltfilt``
        (forward-backward filtering, same length as ``v``).
    """
    sos = signal.butter(order, cutoff, output="sos", fs=fs)
    return signal.sosfiltfilt(sos, v)

In [13]:
# Build one photometry + behavior table per session and collect them.
concat_phot_df = []
# The PCA-score and syllable files are shared by every session, so they are
# opened once for the whole loop instead of once per session.
with h5py.File(pc_scores_file, 'r') as pc_h5f, h5py.File(syllables_file, 'r') as s_h5f:
    for file in tqdm(h5_files):
        # The aligned photometry table is looked up two folders above the
        # results file.
        phot_file = file.parents[1] / "photometry_df_v2.parquet"
        if not phot_file.exists():
            continue
        try:
            with h5py.File(file, 'r') as h5f:
                uuid = h5f['metadata/uuid'][()].decode()
                # Not used below; kept because a missing dataset raises
                # KeyError, which skips the session.
                timestamps = h5f['timestamps'][()]
            # `scores` is likewise unused below and kept for the same reason.
            scores = pc_h5f['scores'][uuid][:, :10]
            scores_idx = pc_h5f['scores_idx'][uuid][()]
            syllables = s_h5f[uuid][()]
        except KeyError as e:
            # Report the skip instead of dropping the session silently.
            print(f"skipping {file}: missing key {e}")
            continue

        phot_df = pd.read_parquet(phot_file)
        # Sanity check that the PCA frame indices agree with the photometry
        # table (NaNs are compared as 0).
        assert np.all(np.nan_to_num(scores_idx)[:len(phot_df)] == np.nan_to_num(phot_df['ir_indices'].to_numpy()))
        phot_df['syllables'] = syllables[:len(phot_df)]
        phot_df['uuid'] = uuid

        phot_df["dlight_dff"] = dff_func(phot_df["pmt00_ref00 (dLight)"])
        phot_df["uv_dff"] = dff_func(phot_df["pmt00_ref01 (UV)"])

        msk = ~phot_df["uv_dff"].isna()
        # Rereference only when more than 1000 valid UV samples exist.
        if msk.sum() > 1000:
            uv_dff = phot_df.loc[msk, "uv_dff"]
            # Keep the filtered UV trace on the same row labels as the masked
            # dLight trace and phot_df. A bare pd.Series(...) would get a
            # fresh 0..n-1 index, which no longer lines up with those rows
            # whenever msk drops any.
            # NOTE(review): assumes rereference works on aligned Series -
            # confirm against aging.dlight.normalize.
            uv_filter = pd.Series(filter_signal(uv_dff), index=uv_dff.index)
            ref_df = rereference(
                uv_filter, phot_df.loc[msk, "dlight_dff"], center=True
            )
            phot_df["dlight_reref"] = ref_df["rereference"]
            phot_df["uv_reference_fit"] = ref_df["reference_fit"]
            # run robust zscore on rereferenced dlight
            phot_df["dlight_reref_zscore"] = norm_func(phot_df["dlight_reref"])
            msk2 = ~phot_df["dlight_reref_zscore"].isna()
            phot_df.loc[msk2, "dlight_reref_zscore_filter"] = filter_signal(
                phot_df.loc[msk2, "dlight_reref_zscore"]
            )

        # Drop the second PMT channel and the precomputed robust-dff columns.
        phot_df = phot_df.drop(
            columns=[
                "pmt01_ref00",
                "pmt01_ref01",
                "pmt00_ref00_robust_dff (dLight)",
                "pmt00_ref01_robust_dff (UV)",
                "pmt01_ref00_robust_dff",
                "pmt01_ref01_robust_dff",
            ]
        )
        concat_phot_df.append(phot_df)

  0%|          | 0/174 [00:00<?, ?it/s]

In [14]:
# Stack the per-session tables into one DataFrame with a fresh 0..n-1 index.
# NOTE(review): this rebinds concat_phot_df from a list of DataFrames to a
# single DataFrame, so this cell is not meant to be re-run on its own.
concat_phot_df = pd.concat(concat_phot_df, ignore_index=True)

In [15]:
# Preview the first rows of the merged table.
concat_phot_df.head()

Unnamed: 0,pmt00_ref00 (dLight),pmt00_ref01 (UV),timestamp,raw_timestamp,fs,ir_indices,subject_name,session_name,start_time,syllables,uuid,dlight_dff,uv_dff,dlight_reref,uv_reference_fit,dlight_reref_zscore,dlight_reref_zscore_filter
0,0.032205,0.013812,0.0,2823607000.0,29.962547,0.0,98w-4,dlight-aging,2023-08-07 12:11:31.654885,92,e4b59581-a54e-4033-9612-86b903a59909,0.010748,0.003174,0.006851,0.003897,0.327802,0.327511
1,0.032189,0.013818,0.033375,2823607000.0,29.962547,1.0,98w-4,dlight-aging,2023-08-07 12:11:31.654885,92,e4b59581-a54e-4033-9612-86b903a59909,0.010231,0.003526,0.00723,0.003,0.356387,1.008055
2,0.03254,0.013813,0.066688,2823608000.0,29.962547,,98w-4,dlight-aging,2023-08-07 12:11:31.654885,92,e4b59581-a54e-4033-9612-86b903a59909,0.021203,0.003282,0.018696,0.002506,1.304884,1.668263
3,0.033076,0.013693,0.1,2823608000.0,29.962547,2.0,98w-4,dlight-aging,2023-08-07 12:11:31.654885,53,e4b59581-a54e-4033-9612-86b903a59909,0.037963,-0.0055,0.035129,0.002834,2.664293,2.239887
4,0.033303,0.013747,0.133313,2823608000.0,29.962547,,98w-4,dlight-aging,2023-08-07 12:11:31.654885,53,e4b59581-a54e-4033-9612-86b903a59909,0.045054,-0.001611,0.040849,0.004205,3.203278,2.626676


In [16]:
# Save the merged table two folders above the PCA scores file (the
# version_XX folder), brotli-compressed.
concat_phot_df.to_parquet(
    pc_scores_file.parent.parent / "dlight_photometry_df.parquet",
    compression="brotli",
)