# Organize data for running PCA

Two main approaches:
1. Select a subset of all ontogeny sessions: a few sessions from each age and sex.
2. Select a subset of all longtogeny sessions: a few sessions from each age and sex, making sure each mouse contributes the same number of sessions at similar ages.

In [1]:
import json
import h5py
import pandas as pd
from pathlib import Path
from toolz import concat, curry, frequencies
from aging.organization.paths import FOLDERS

In [2]:
# Display the tuple of data folders imported from aging.organization.paths.
FOLDERS

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/min/wheel_062023'),
 PosixPath('/n/groups/datta/win/longtogeny/dlight'),
 PosixPath('/n/groups/datta/min/longtogeny_052023/Males'),
 PosixPath('/n/groups/datta/win/longtogeny/data/jackson-labs/datta_i'))

In [3]:
# Version number embedded (zero-padded) in the name of the output folder.
arhmm_version = 9
# HDF5 key whose presence marks a results file as size-normalized.
size_norm_key = 'win_size_norm_frames_v6'

In [4]:
# CSV of per-session quality-control decisions; its 'Keep?' and 'File path'
# columns are used later to drop rejected sessions.
session_exclusion_list = '/n/groups/datta/win/longtogeny/data/metadata/Ontogeny experiment list - Session quality control.csv'
exclusion_df = pd.read_csv(session_exclusion_list, header=0)

In [5]:
# Lowercase the flags so the later comparison against "y" ignores case,
# then show the distinct values present in the column.
exclusion_df['Keep?'] = exclusion_df['Keep?'].str.lower()
exclusion_df['Keep?'].unique()

array(['y', 'n', '?'], dtype=object)

In [6]:
# Every row whose flag is not exactly "y" is treated as an excluded session.
exclusions = exclusion_df.query('`Keep?` != "y"')

In [11]:
def is_size_normalized(file):
    """Report whether an HDF5 file contains the size-normalized frames key.

    Parameters
    ----------
    file : path-like
        Location of the HDF5 file to inspect.

    Returns
    -------
    bool
        True when ``size_norm_key`` is present at the top level of the file.
        False when it is absent, or when an OSError is raised while handling
        the file (in which case the path is printed).
    """
    try:
        with h5py.File(file, 'r') as handle:
            has_key = size_norm_key in handle
    except OSError:
        # Report the offending file and treat it as not normalized.
        print(file)
        return False
    return has_key


def not_excluded(file):
    """Return True when ``file`` is absent from the exclusion table.

    The path is compared, as a string, against the 'File path' column of the
    module-level ``exclusions`` dataframe.
    """
    excluded_paths = exclusions['File path'].values
    is_listed = str(file) in excluded_paths
    return not is_listed


@curry
def thread_and(funcs, x):
    """Apply each predicate in ``funcs`` to ``x`` and AND the results.

    Evaluation stops at the first predicate that returns a falsy value.
    An empty ``funcs`` yields True. Curried, so ``thread_and(funcs)`` gives a
    single-argument predicate suitable for ``filter``.
    """
    for predicate in funcs:
        if not predicate(x):
            return False
    return True


@curry
def not_double(parent_counts, path):
    """Decide whether ``path`` should be kept when a directory has several files.

    Parameters
    ----------
    parent_counts : mapping of str -> int
        Number of files found per grandparent directory (``path.parents[1]``).
    path : pathlib path
        Candidate file.

    Returns
    -------
    bool
        True for any path containing "jackson", and for any path whose
        grandparent directory is counted at most once. Otherwise True only
        when the file's immediate parent folder is named 'proc_cleaned'.
    """
    if "jackson" in str(path):
        return True
    # Looked up only for non-jackson paths, matching the short-circuit order.
    session_count = parent_counts[str(path.parents[1])]
    if session_count > 1:
        return path.parent.name == 'proc_cleaned'
    return True

## Organize ontogeny sessions

These are the data that will be run through PCA and the modeling steps

- create a dataframe with the file path, age, sex, and session name
- do a groupby and subsample the same number of sessions for each (age, sex) group
- symlink results.h5 files to a new folder

In [9]:
# Destination folder for the symlinked training files; the folder name
# embeds the zero-padded arhmm_version.
agg_folder = Path(f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{arhmm_version:02d}/training_data')
# Create the folder and any missing parents; no error if it already exists.
agg_folder.mkdir(exist_ok=True, parents=True)

In [8]:
# Show the folders that will be searched: every entry except the last one.
FOLDERS[:-1]

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/min/wheel_062023'),
 PosixPath('/n/groups/datta/win/longtogeny/dlight'),
 PosixPath('/n/groups/datta/min/longtogeny_052023/Males'))

In [12]:
# Collect every results_00.h5 under the selected folders, keeping only files
# that contain the size-normalized frames and are not on the exclusion list.
files = sorted(
    h5_path
    for folder in FOLDERS[:-1]
    for h5_path in folder.glob("**/*results_00.h5")
    if is_size_normalized(h5_path) and not_excluded(h5_path)
)

/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Males/12months_28042021/session_20210428102836/proc_cleaned/results_00.h5


In [13]:
# Count how many selected files share the same grandparent directory
# (parents[1]); not_double consults these counts (count > 1) when filtering.
parent_counts = frequencies(str(f.parents[1]) for f in files)

In [14]:
# Keep only the files that not_double accepts for the computed counts.
files = [path for path in files if not_double(parent_counts, path)]

In [15]:
# Symlink each selected results file and its sibling .yaml into agg_folder,
# naming both links after the file's grandparent directory.
for path in files:
    session_name = path.parents[1].name
    already_linked = False
    # Create the two links independently: previously a pre-existing .h5 link
    # raised FileExistsError before the .yaml link was attempted, so a
    # half-linked session was never completed on re-run.
    for target in (path, path.with_suffix('.yaml')):
        new_path = agg_folder / (session_name + target.suffix)
        try:
            new_path.symlink_to(target)
        except FileExistsError:
            already_linked = True
    if already_linked:
        # One message per session, same text as before.
        print(path, 'exists in training set')

/n/groups/datta/min/wheel_062023/session_20230726160946/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/session_20230726163905/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/session_20230726170544/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/session_20230726173404/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/session_20230726180157/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/wheel_min/session_20230715120836/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/wheel_min/session_20230715125704/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/wheel_min/session_20230715132708/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/wheel_min/session_20230715135804/proc/results_00.h5 exists in training set
/n/groups/datta/min/wheel_062023/wheel_min/session_20230715142922/proc/results