# Take inferred syllables from ARHMM and assign mouse ID and experiment to each session

In [1]:
import h5py
import numpy as np
import pandas as pd
from pathlib import Path
from copy import deepcopy
from tqdm.auto import tqdm
from datetime import datetime
from toolz.curried import pluck
from aging.organization.paths import FOLDERS
from aging.behavior.scalars import compute_scalars
from toolz import concat, compose, valmap, first, groupby, keyfilter

In [2]:
# Dataset version; it selects the versioned data folder below.
version = 4
# Root folder for this version of the dataset; the parquet outputs are written here.
data_folder = Path(f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}')
# HDF5 file of syllable labels; its top-level keys are treated as session uuids below.
syllable_path = data_folder / 'all_data_pca/syllables.h5'

In [3]:
# Map each session uuid to the `results_00.h5` file it was read from.
uuid_map = {}
for file in tqdm(concat(f.glob('**/results_00.h5') for f in FOLDERS)):
    try:
        with h5py.File(file, 'r') as h5f:
            uuid = h5f['metadata/uuid'][()].decode()
            uuid_map[uuid] = file
    except OSError:
        # the file could not be opened/read by h5py; skip it
        continue

# Keep only the sessions that have an entry in the syllables file.
with h5py.File(syllable_path, 'r') as h5f:
    h5f_uuids = list(h5f)
    # build the lookup set once so each membership test is O(1) instead of
    # scanning the whole uuid list for every key
    syllable_uuids = set(h5f_uuids)
    uuid_map = keyfilter(lambda u: u in syllable_uuids, uuid_map)

0it [00:00, ?it/s]

In [4]:
# Peek at one session file to list the fields stored under `metadata/acquisition`.
with h5py.File(first(uuid_map.values()), 'r') as h5f:
    print(list(h5f['metadata/acquisition']))

['ColorDataType', 'ColorResolution', 'DepthDataType', 'DepthResolution', 'IsLittleEndian', 'NidaqChannels', 'NidaqSamplingRate', 'SessionName', 'StartTime', 'SubjectName']


In [5]:
def get_experiment(path: Path):
    """Infer the experiment label of a session from the location of its file.

    The rules are tried in order and the first match wins:

    1. The path contains both ``"min"`` and ``"longtogeny"`` (case-sensitive):
       ``"longtogeny_v2_<folder>"`` where ``<folder>`` is the lower-cased name
       of the folder three levels above the file (``path.parents[2]``).
    2. The path contains ``"longtogeny"`` (case-sensitive):
       ``"longtogeny_<sex>"`` where ``<sex>`` is ``"males"`` or ``"females"``,
       looked up in ``path.parents[3]`` first and ``path.parents[2]`` second.
    3. The path contains ``"ontogeny"`` (case-insensitive) but not
       ``"community"`` (case-sensitive): the lower-cased name of
       ``path.parents[3]``, or of ``path.parents[2]`` when the former is
       ``"raw_data"``.
    4. The path contains ``"wheel"`` (case-insensitive): ``"wheel"``.
    5. Otherwise: the name of ``path.parents[2]``, with its case unchanged.

    Args:
        path: location of a session file.

    Returns:
        The experiment label as a string.

    Raises:
        ValueError: for a longtogeny path where neither candidate folder is
            named ``males`` or ``females``.
        IndexError: if the path has too few parent folders for the rule that
            applies.
    """
    str_path = str(path)
    lower_path = str_path.lower()

    if "min" in str_path and "longtogeny" in str_path:
        return f"longtogeny_v2_{path.parents[2].name.lower()}"

    if "longtogeny" in str_path:
        # the sex folder sits either 4 or 3 levels above the file
        for depth in (3, 2):
            sex = path.parents[depth].name.lower()
            if sex in ("males", "females"):
                return f"longtogeny_{sex}"
        raise ValueError(
            "could not find a 'males' or 'females' folder in "
            f"parents[3] or parents[2] of {str_path}"
        )

    if "ontogeny" in lower_path and "community" not in str_path:
        exp = path.parents[3].name.lower()
        if exp == "raw_data":
            exp = path.parents[2].name.lower()
        return exp

    if "wheel" in lower_path:
        return "wheel"

    return path.parents[2].name


def insert_nans(timestamps, data, fps=30):
    """Pad dropped frames in `data` with NaN rows so it is evenly sampled.

    Consecutive timestamps are compared against the nominal frame period
    ``1 / fps``. Wherever ``floor(gap / period)`` is greater than 1, that many
    frames minus one are inserted before the later frame: NaN rows in the
    data, NaN entries in the index array and evenly spaced timestamps
    continuing from the frame before the gap.

    Args:
        timestamps: 1-D array of frame times, in the same time unit as
            ``1 / fps``.
        data: array with one entry (1-D) or one row (2-D) per timestamp.
        fps: nominal frame rate used to detect and fill gaps.

    Returns:
        A tuple ``(filled_data, data_idx, filled_timestamps)``:
        ``filled_data`` is a copy of `data` with NaN rows inserted (1-D input
        gives 1-D output), ``data_idx`` is a float array holding the original
        frame index at each position and NaN at inserted positions, and
        ``filled_timestamps`` is the timestamps with the gap times inserted.
        Integer or boolean `data` is converted to float64 when (and only when)
        frames have to be inserted, because those dtypes cannot hold NaN.

    Raises:
        ValueError: if `data` has more than two dimensions.
        IndexError: if `timestamps` is empty.
    """
    frame_period = 1.0 / fps
    # prepend a virtual frame one period before the first timestamp so that
    # the gap array has one entry per real frame
    df_timestamps = np.diff(np.insert(timestamps, 0, timestamps[0] - frame_period))
    missing_frames = np.floor(df_timestamps / frame_period)

    fill_idx = np.where(missing_frames > 1)[0]
    data_idx = np.arange(len(timestamps)).astype('float64')

    filled_data = deepcopy(data)
    filled_timestamps = deepcopy(timestamps)

    isvec = filled_data.ndim == 1
    if isvec:
        filled_data = filled_data[:, None]
    _, nfeatures = filled_data.shape

    # NaN written into an integer/bool array would be silently cast to a
    # meaningless value, so promote before inserting anything
    needs_float = (
        np.issubdtype(filled_data.dtype, np.integer)
        or filled_data.dtype == np.bool_
    )
    if fill_idx.size and needs_float:
        filled_data = filled_data.astype('float64')

    # walk the gaps back-to-front so earlier insert positions stay valid
    for idx in fill_idx[::-1]:
        ninserts = int(missing_frames[idx] - 1)
        data_idx = np.insert(data_idx, idx, [np.nan] * ninserts)
        insert_timestamps = timestamps[idx - 1] + \
            np.cumsum(np.ones(ninserts,) * 1.0 / fps)
        filled_data = np.insert(filled_data, idx,
                                np.full((ninserts, nfeatures), np.nan), axis=0)
        filled_timestamps = np.insert(
            filled_timestamps, idx, insert_timestamps)

    if isvec:
        # drop only the feature axis that was added above; np.squeeze would
        # also collapse a single-frame input to a 0-d array
        filled_data = filled_data[:, 0]

    return filled_data, data_idx, filled_timestamps

In [6]:
# Bucket the (uuid, path) pairs by the experiment inferred from each path,
# then keep only the uuids: {experiment: [uuid, ...]}.
_sessions_by_experiment = groupby(
    lambda item: get_experiment(item[1]),
    uuid_map.items(),
)
exp_groups = {
    exp_name: [session_uuid for session_uuid, _ in sessions]
    for exp_name, sessions in _sessions_by_experiment.items()
}

In [7]:
# Show the experiment names (the keys of `exp_groups`).
list(exp_groups)

['ontogeny_females',
 'ontogeny_males',
 'longtogeny_males',
 'longtogeny_v2_females',
 'longtogeny_v2_males']

In [8]:
# Show how many session uuids were assigned to each experiment.
valmap(len, exp_groups)

{'ontogeny_females': 224,
 'ontogeny_males': 216,
 'longtogeny_males': 1096,
 'longtogeny_v2_females': 537,
 'longtogeny_v2_males': 475}

In [10]:
# Dataset (inside each session file) holding the frames passed to compute_scalars.
recon_frames_key = "win_size_norm_frames_v4"
# NOTE: this rebinds `version`; from here on it is the version tag of the
# output parquet files (`data_folder` was already built from the earlier value).
version = 0
# (uuid, path) of every session that could not be turned into a dataframe
failed_sessions = []
with h5py.File(syllable_path, "r") as h5f:
    for experiment, uuids in exp_groups.items():
        out_path = data_folder / f"{experiment}_syllable_df_v{version:02d}.parquet"
        # per-session dataframes of this experiment
        df = []
        for i, uuid in enumerate(tqdm(uuids)):
            path = uuid_map[uuid]
            try:
                with h5py.File(path, "r") as f:
                    session_name = f["metadata/acquisition/SessionName"][()].decode()
                    subject_name = f["metadata/acquisition/SubjectName"][()].decode()
                    # every scalar whose name contains "mm", plus two angular ones
                    keep_scalars = [k for k in f["scalars"] if "mm" in k] + [
                        "angle",
                        "velocity_theta",
                    ]

                    # presumably milliseconds -> seconds (insert_nans compares
                    # the gaps against 1 / fps) -- TODO confirm units
                    ts = f["timestamps"][()] / 1000
                    scalars = {k: f["scalars"][k][()] for k in keep_scalars}
                    # pad dropped frames with NaN
                    filled_scalars = valmap(lambda v: insert_nans(ts, v)[0], scalars)
                    filled_ts = insert_nans(ts, ts)[2]

                    frames = f[recon_frames_key][()]
                    centroid = np.array(
                        [f["scalars/centroid_x_px"][()], f["scalars/centroid_y_px"][()]]
                    ).T
                    true_depth = f["metadata/extraction/true_depth"][()]
                    recon_scalars = compute_scalars(frames, centroid, true_depth)
                    recon_scalars = valmap(
                        lambda v: insert_nans(ts, v)[0], recon_scalars
                    )
            except KeyError as e:
                # A required dataset is missing. Skip the session entirely:
                # falling through would build its dataframe from the previous
                # session's arrays (or hit a NameError on the first iteration)
                # and could record the failure twice.
                print('failure', uuid, 'missing key:', e)
                failed_sessions.append((uuid, path))
                continue

            # only the ontogeny (not longtogeny) experiments take the age from
            # the folder name; everything else is left as NaN here
            if "ontogeny" in experiment and "longtogeny" not in experiment:
                age = path.parents[2].name.split("_")[0]
            else:
                age = np.nan
            # the session folder name ends in a YYYYmmddHHMMSS time stamp
            date = datetime.strptime(
                path.parents[1].name.split("_")[-1], "%Y%m%d%H%M%S"
            )
            try:
                _df = pd.DataFrame(
                    dict(
                        experiment=experiment,
                        file=str(path),
                        syllables=h5f[uuid][()],
                        date=date,
                        uuid=uuid,
                        age=age,
                        true_depth=true_depth,
                        session_name=session_name,
                        subject_name=subject_name,
                        timestamps=filled_ts - filled_ts[0],
                        **filled_scalars,
                        **recon_scalars,
                    )
                )
                _df = _df.astype(
                    dict(
                        syllables="int16[pyarrow]",
                        file="string[pyarrow]",
                        experiment="string[pyarrow]",
                        uuid="string[pyarrow]",
                        session_name="string[pyarrow]",
                        subject_name="string[pyarrow]",
                        timestamps="float32[pyarrow]",
                        true_depth="float32[pyarrow]",
                        **{
                            k: "float32[pyarrow]"
                            for k in list(filled_scalars) + list(recon_scalars)
                        },
                    )
                )
                df.append(_df)
                # periodic checkpoint so a crash does not lose everything
                if i % 35 == 0:
                    pd.concat(df, ignore_index=True).to_parquet(out_path)
            except ValueError as e:
                # e.g. the syllable array and the padded scalars differ in length
                print('failure', uuid, e)
                failed_sessions.append((uuid, path))
        if not df:
            # pd.concat raises on an empty list; nothing to save for this experiment
            print(experiment, "no sessions could be processed")
            continue
        df = pd.concat(df, ignore_index=True)
        df.to_parquet(out_path)
        print(experiment, "length", len(df))

  0%|          | 0/224 [00:00<?, ?it/s]

failure 1deda02d-3a49-49a1-b589-63ea27b3d6c8 All arrays must be of the same length
failure 2958cb9b-a49a-436b-8aa0-17392de92af0 All arrays must be of the same length
ontogeny_females length 11978538


  0%|          | 0/216 [00:00<?, ?it/s]

failure 3d2f89d8-0c8b-4bd1-8897-c1101db2dd96 All arrays must be of the same length
failure d0e75973-54c7-4572-af5a-dd51aab605ca All arrays must be of the same length
failure 2008df9d-e8cd-48be-8e48-5a8adaf331d4 All arrays must be of the same length
ontogeny_males length 11493813


  0%|          | 0/1096 [00:00<?, ?it/s]

failure 840a5b11-187a-471a-95f5-199c5bcd5014 All arrays must be of the same length
failure 04614d47-178a-43f7-aab7-1a00a37b5d05 All arrays must be of the same length
failure 9b2d68e3-c266-4af2-921c-ad4203e70cf0 All arrays must be of the same length
failure eb7bec68-81f4-43de-b3cc-21a440f1042e All arrays must be of the same length
failure fcd074ad-e38c-4864-a1af-fcf7a2c8dacc All arrays must be of the same length
failure eedb6a08-7323-4833-bb58-f9c39e8c4f29 All arrays must be of the same length
failure 44bd76a8-1e82-4210-91d6-1bdc6cbf6d87 All arrays must be of the same length
failure e2d324ed-c7c2-4a2a-907e-cb4ad44d610f All arrays must be of the same length
failure 6e072e8c-05ca-4b47-8983-238172d42ac1 All arrays must be of the same length
failure e1e937d1-eb7b-4de4-81c4-d0eb6b4123e5 All arrays must be of the same length
failure d0624d72-cc73-4b93-aec0-58008161f43d All arrays must be of the same length
failure c18d5fe6-5e0e-4374-8dae-961ae3334637 All arrays must be of the same length
fail

  0%|          | 0/537 [00:00<?, ?it/s]

failure 84ced407-cfd6-4656-968a-a33606f0c827 All arrays must be of the same length
failure adb35ddf-57b7-42ca-88f2-40ff571e3140 All arrays must be of the same length
failure da88a2b7-11ff-4548-8965-e936e829d0ba All arrays must be of the same length
failure 4234e8dc-835b-4350-aad0-03ad2d29f0c2 All arrays must be of the same length
failure 0f857ef2-ef50-4221-8394-43f5b10ce428 All arrays must be of the same length
failure d52c3569-b074-4cac-963a-5f1b16821080 All arrays must be of the same length
failure 3ffffe10-c1f9-4927-b729-da48dd4f7b34 All arrays must be of the same length
failure e9859d7f-f8c9-4d6a-a733-54a5948f39c2 All arrays must be of the same length
failure 1d199543-91ec-4101-97c4-df065c705267 All arrays must be of the same length
failure be875a1e-6a75-4067-bad5-6115c1cb6056 All arrays must be of the same length
failure ee09ee68-994a-43f3-bf41-7dc5f534d7ef All arrays must be of the same length
failure 46f3a503-ae0e-46eb-a0f6-bbae443eb4d6 All arrays must be of the same length
fail

  0%|          | 0/475 [00:00<?, ?it/s]

failure 9ab3cce7-5afb-490f-8be4-b187e1dea123 All arrays must be of the same length
failure 36e36a1f-7ad4-43e2-98fd-ec16f1299739 All arrays must be of the same length
failure 538d25f1-3681-45b5-a514-3a54e00429ec All arrays must be of the same length
longtogeny_v2_males length 16973244


In [11]:
# Summarize `df`, which the loop above leaves bound to the last experiment's dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16973244 entries, 0 to 16973243
Data columns (total 24 columns):
 #   Column          Dtype         
---  ------          -----         
 0   experiment      string        
 1   file            string        
 2   syllables       int16[pyarrow]
 3   date            datetime64[ns]
 4   uuid            string        
 5   age             float64       
 6   true_depth      float[pyarrow]
 7   session_name    string        
 8   subject_name    string        
 9   timestamps      float[pyarrow]
 10  area_mm         float[pyarrow]
 11  centroid_x_mm   float[pyarrow]
 12  centroid_y_mm   float[pyarrow]
 13  height_ave_mm   float[pyarrow]
 14  length_mm       float[pyarrow]
 15  velocity_2d_mm  float[pyarrow]
 16  velocity_3d_mm  float[pyarrow]
 17  width_mm        float[pyarrow]
 18  angle           float[pyarrow]
 19  velocity_theta  float[pyarrow]
 20  recon_width     float[pyarrow]
 21  recon_length    float[pyarrow]
 22  recon_height    

In [12]:
# Number of failure records collected across all experiments.
len(failed_sessions)

79