# Organize data for running PCA

Two main approaches:
1. select a subset of all ontogeny sessions, a few sessions from each age and sex
2. select a subset of all longtogeny sessions, a few sessions from each age and sex. Make sure to have the same number of sessions and similar ages per mouse.

In [27]:
import re
import json
import h5py
import pandas as pd
from pathlib import Path
from toolz import concat, curry, frequencies

In [2]:
# Version number used below to name the training-data folder (version_07).
arhmm_version = 7
# Key that is_size_normalized looks up in each results file.
# Note that the key is tagged with the previous version number (arhmm_version - 1).
size_norm_key = f'win_size_norm_frames_v{arhmm_version - 1}'

In [3]:
# Session quality-control sheet; the columns used below are 'File path' and 'Keep?'.
session_exclusion_list = '/n/groups/datta/win/longtogeny/data/metadata/Ontogeny experiment list - Session quality control.csv'
# The first row of the CSV holds the column names.
exclusion_df = pd.read_csv(session_exclusion_list, header=0)

In [4]:
# Lower-case the Keep? flags so that e.g. 'Y' and 'y' compare equal in the filter below.
exclusion_df['Keep?'] = exclusion_df['Keep?'].str.lower()

In [5]:
# Sanity check: list the distinct Keep? flags that remain after lower-casing.
exclusion_df['Keep?'].unique()

array(['y', 'n', '?'], dtype=object)

In [6]:
# Every row not explicitly flagged 'y' is treated as excluded.
exclusions = exclusion_df[exclusion_df['Keep?'] != "y"]

In [7]:
# Preview the first few excluded sessions and the issue recorded for each.
exclusions.head()

Unnamed: 0,File path,Checked?,Keep?,Issue
3,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,"fading, noise, further inspection needed"
4,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,noise
5,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,wall noise
6,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,replaced with noise
7,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,wall noise


In [8]:
def is_size_normalized(file):
    """Return True if the HDF5 file at `file` contains `size_norm_key`.

    If the file cannot be opened or read (OSError), its path is printed and
    False is returned, so unreadable files are dropped rather than aborting
    the scan.
    """
    try:
        with h5py.File(file, 'r') as handle:
            found = size_norm_key in handle
    except OSError:
        # Report the offending file and treat it as not size-normalized.
        print(file)
        return False
    return found


def not_excluded(file):
    """Return True if `file` is not listed in the 'File path' column of `exclusions`.

    The comparison is an exact string match on str(file).
    NOTE(review): this assumes the sheet stores the full path of the results
    file itself — confirm against the CSV.
    """
    excluded_paths = exclusions['File path'].values
    candidate = str(file)
    return candidate not in excluded_paths

## Organize ontogeny sessions

These are the data that will be run through PCA and the modeling steps

- create a dataframe with the file path, age, sex, and session name
- do a groupby and subsample the same number of sessions for each (age, sex) group
- symlink the sampled `results_00.h5` files (and their matching `.yaml` files) into a new folder

In [9]:
# Destination folder for the symlinked training files, versioned by arhmm_version (e.g. version_07).
agg_folder = Path(f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{arhmm_version:02d}/training_data')
# Create the folder and any missing parents; do nothing if it already exists.
agg_folder.mkdir(exist_ok=True, parents=True)

In [10]:
# Root folders that are searched recursively for results files (one per sex).
folders = [
    Path(raw_folder)
    for raw_folder in (
        '/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females',
        '/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males',
    )
]

In [11]:
@curry
def thread_and(funcs, x):
    """Return True only if every predicate in `funcs` is truthy for `x`.

    Predicates are evaluated in order and evaluation stops at the first one
    that fails. Being curried, `thread_and(funcs)` yields a one-argument
    predicate suitable for `filter`.
    """
    for predicate in funcs:
        if not predicate(x):
            return False
    return True

In [12]:
# Collect every results_00.h5 under the raw-data folders, keeping only files that
# contain the size-normalized key and are not on the exclusion list.
files = sorted(
    results_file
    for folder in folders
    for results_file in folder.glob("**/results_00.h5")
    if is_size_normalized(results_file) and not_excluded(results_file)
)

/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males/12months_28042021/session_20210428102836/proc_cleaned/results_00.h5


In [20]:
# Count how many results files were found under each session folder
# (the folder two levels above the results file).
parent_counts = frequencies([str(f.parents[1]) for f in files])

In [21]:
def not_double(path):
    """Return True if `path` should be kept when de-duplicating results files.

    A session folder with a single results file always keeps it. When
    `parent_counts` reports more than one results file for the session, only
    the copy whose immediate parent folder is named 'proc_cleaned' is kept.
    NOTE(review): a session with several results files and none under
    'proc_cleaned' is dropped entirely — confirm this is intended.
    """
    session_dir = str(path.parents[1])
    has_duplicates = parent_counts[session_dir] > 1
    return not has_duplicates or path.parent.name == 'proc_cleaned'

In [23]:
# Drop duplicate results files so each session contributes at most one file.
files = [results_file for results_file in files if not_double(results_file)]

In [35]:
# Build one record per results file from its position in the folder tree
# (<sex folder>/<age>/<session>/<proc folder>/results_00.h5) plus the
# SessionName stored in the session's metadata.json.
records = []
for f in files:
    # Open the metadata inside a context manager so the file handle is closed
    # right away instead of lingering until garbage collection.
    with (f.parents[1] / "metadata.json").open() as md_file:
        md = json.load(md_file)
    data = dict(
        age=f.parents[2].name,
        session=f.parents[1].name,
        # e.g. "Ontogeny_females" -> "females" -> "female"
        sex=f.parents[3].name.split("_")[-1][:-1],
        path=str(f),
        SessionName=md["SessionName"],
    )
    records.append(data)
# Construct the frame once from the collected records.
df = pd.DataFrame(records)

In [38]:
# Drop rows whose age label contains 'Ontogeny' or whose SessionName contains "CRL".
age_has_ontogeny = df['age'].str.contains('Ontogeny')
name_has_crl = df['SessionName'].str.contains("CRL")
df = df[~(age_has_ontogeny | name_has_crl)]

In [39]:
# Number of sessions available in each (age, sex) group before subsampling.
df.groupby(['age', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,session,path,SessionName
age,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12months,female,24,24,24
12months_28042021,male,26,26,26
16wks_092923,male,16,16,16
18months,female,24,24,24
18months_29042021,male,27,27,27
20wks_093023,male,16,16,16
22months_06052023,male,16,16,16
28wks_092923,male,16,16,16
32wks_100223,male,16,16,16
36wks_093023,male,16,16,16


In [40]:
# Sub-select the same number of sessions from each (age, sex) group.
# random_state is fixed so the same sessions are drawn on every run.
# NOTE(review): an earlier note here read "64 sessions = 3.5e6 frames" for
# 4 sessions per group; that frame estimate has not been updated for the
# current group size — TODO recompute.
sessions_per_group = 6
sample = df.groupby(['age', 'sex']).sample(n=sessions_per_group, random_state=0)

In [41]:
# Total number of sampled sessions across all (age, sex) groups.
len(sample)

168

In [46]:
# Symlink each sampled results file and its sibling .yaml into agg_folder,
# naming the links after the session folder (two levels above the results file).
for path in map(Path, sample['path']):
    session_name = path.parents[1].name
    link_targets = (
        ('.h5', path),
        ('.yaml', path.with_suffix('.yaml')),
    )
    # Create the two links independently: an .h5 link left over from an earlier
    # run must not prevent a missing .yaml link from being created.
    for suffix, target in link_targets:
        new_path = agg_folder / (session_name + suffix)
        try:
            new_path.symlink_to(target)
        except FileExistsError:
            print(target, 'exists in training set')