# Organize data for running PCA

Two main approaches:
1. select a subset of all ontogeny sessions, a few sessions from each age and sex
2. select a subset of all longtogeny sessions, a few sessions from each age and sex. Make sure to have the same number of sessions and similar ages per mouse.

In [1]:
import json
import h5py
import numpy as np
import pandas as pd
from pathlib import Path
from toolz import concat, curry, frequencies, keyfilter
from aging.organization.paths import get_experiment_grouped_files
from aging.organization.dataframes import get_experiment, get_age

In [2]:
# Experiments that are left out of the training pool entirely.
excluded_experiments = ("jax_longtogeny", "dana_ontogeny_dana_ontogeny")

# Mapping of experiment name -> files, minus the excluded experiments.
files = keyfilter(
    lambda name: name not in excluded_experiments,
    get_experiment_grouped_files(),
)

In [3]:
# Show the experiment names (dict keys) that remain after filtering.
list(files)

['ontogeny_females',
 'ontogeny_males',
 'dana_ontogeny_males',
 'dana_ontogeny_females',
 'longtogeny_males',
 'longtogeny_v2_males',
 'longtogeny_v2_females',
 'wheel',
 'dlight',
 'klothos']

In [4]:
# Model version number; it is formatted into the name of the versioned
# training-data folder created further down.
arhmm_version = 12

# NEW SIZE NORM MODEL
# Key looked up inside each results HDF5 file; a file is only used when it
# contains this key.
size_norm_key = 'win_size_norm_frames_v8'

In [5]:
# Session quality-control sheet: one row per file with a "Keep?" flag.
session_exclusion_list = '/n/groups/datta/win/longtogeny/data/metadata/Ontogeny experiment list - Session quality control.csv'
# The first row of the sheet holds the column names.
exclusion_df = pd.read_csv(filepath_or_buffer=session_exclusion_list, header=0)

In [6]:
# Lowercase the flags so that e.g. "Y" and "y" are treated as the same answer,
# then display the distinct values that occur.
keep_flags = exclusion_df['Keep?'].str.lower()
exclusion_df['Keep?'] = keep_flags
exclusion_df['Keep?'].unique()

array(['y', 'n', '?'], dtype=object)

In [7]:
# Every row that is not an explicit "y" (this includes "n" and "?") is excluded.
exclusions = exclusion_df[exclusion_df['Keep?'] != "y"]

In [8]:
def is_size_normalized(file):
    """Return True when the HDF5 file at `file` contains `size_norm_key`.

    A file that cannot be opened or read (OSError) is reported by printing its
    path and is treated as not size-normalized.
    """
    try:
        with h5py.File(file, 'r') as handle:
            has_key = size_norm_key in handle
    except OSError:
        # unreadable file: log the path and skip it
        print(file)
        return False
    return has_key


def not_excluded(file):
    """Return True when `file` is absent from the 'File path' column of `exclusions`."""
    excluded_paths = exclusions['File path'].values
    return not (str(file) in excluded_paths)


@curry
def thread_and(funcs, x):
    """Return True only if every predicate in `funcs` accepts `x`.

    Predicates are evaluated in order and evaluation stops at the first one
    that returns a falsy value. Curried, so `thread_and(funcs)` yields a
    single-argument predicate.
    """
    for predicate in funcs:
        if not predicate(x):
            return False
    return True


@curry
def not_double(parent_counts, path):
    """Decide whether `path` should be kept when its session has several files.

    `parent_counts` maps `str(path.parents[1])` to the number of files found
    under that directory. When more than one file shares it, only the file
    whose immediate parent folder is named 'proc_cleaned' is kept; otherwise
    the file is always kept.
    """
    session_dir = str(path.parents[1])
    has_duplicates = parent_counts[session_dir] > 1
    return (not has_duplicates) or path.parent.name == 'proc_cleaned'

## Organize ontogeny sessions

These are the data that will be run through PCA and the modeling steps

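- create a dataframe with the file path, age, experiment (several experiment names encode sex, e.g. `ontogeny_females`), session folder, and session name
- group by experiment and subsample the same number of sessions for each age (or age-quantile bin) within each experiment
- symlink the results `.h5` files (and their matching `.yaml` files) to a new folder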

In [9]:
# Versioned folder that will hold the symlinks to the selected sessions.
agg_folder = Path(f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{arhmm_version:02d}/training_data')
# Create it together with any missing parents; do nothing if it already exists.
agg_folder.mkdir(parents=True, exist_ok=True)

In [10]:
# Flatten the per-experiment file lists and keep, in sorted order, only the
# files that contain the size-normalized frames and are not on the exclusion list.
passes_all_checks = thread_and((is_size_normalized, not_excluded))
files = sorted(f for f in concat(files.values()) if passes_all_checks(f))

/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Males/12months_28042021/session_20210428102836/proc_cleaned/results_00.h5


In [11]:
# Count how many retained files live under each grandparent (session) directory.
parent_counts = frequencies(map(lambda f: str(f.parents[1]), files))

In [12]:
# For sessions with more than one results file, keep only the 'proc_cleaned' copy.
keep_single_copy = not_double(parent_counts)
files = [f for f in files if keep_single_copy(f)]

In [13]:
# Build one record per results file and assemble the table once at the end.
# Files whose session folder has no metadata.json are skipped silently; files
# for which a KeyError is raised (e.g. metadata without "SessionName") are
# reported and skipped.
records = []
for f in files:
    session_dir = f.parents[1]
    try:
        # Use a context manager so the metadata file handle is closed right
        # away instead of being left open for every one of the files.
        with (session_dir / "metadata.json").open() as md_file:
            md = json.load(md_file)
        data = dict(
            age=get_age(f),
            experiment=get_experiment(f),
            session=session_dir.name,
            path=str(f),
            SessionName=md["SessionName"],
        )
        records.append(data)
    except KeyError:
        print("KeyError: ", f)
    except FileNotFoundError:
        continue
# Explicit columns keep the schema stable even when no record was collected.
df = pd.DataFrame(
    records, columns=["age", "experiment", "session", "path", "SessionName"]
)

In [14]:
# Number of files that made it into the table.
len(df)

7218

In [15]:
# Distinct experiment labels present in the table.
df['experiment'].unique()

array(['dana_ontogeny_females', 'dana_ontogeny_males', 'klothos',
       'ontogeny_females', 'ontogeny_males', 'longtogeny_males',
       'longtogeny_v2_females', 'longtogeny_v2_males', 'wheel', 'dlight'],
      dtype=object)

In [16]:
# Give every "wheel" row a distinct placeholder age (0, 1, 2, ...).
# NOTE(review): presumably this is so wheel sessions get spread over the age
# bins during subsampling below — confirm.
mask = df['experiment'] == "wheel"
if mask.any():
    fake_ages = list(range(mask.sum()))
    df.loc[mask, "age"] = fake_ages

In [17]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,age,experiment,session,path,SessionName
0,52.0,dana_ontogeny_females,session_20221018094245,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_12months
1,52.0,dana_ontogeny_females,session_20221018094429,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_12months
2,52.0,dana_ontogeny_females,session_20221018094537,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_12months
3,52.0,dana_ontogeny_females,session_20221018094629,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_12months
4,52.0,dana_ontogeny_females,session_20221018110503,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_12months


In [18]:
# Drop every row whose SessionName contains "CRL".
is_crl_session = df['SessionName'].str.contains("CRL")
df = df[~is_crl_session]

In [19]:
# Number of rows available for each (age, experiment) combination.
df.groupby(['age', 'experiment']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,session,path,SessionName
age,experiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,wheel,1,1,1
1.0,wheel,1,1,1
2.0,wheel,1,1,1
3.0,dana_ontogeny_females,24,24,24
3.0,dana_ontogeny_males,15,15,15
...,...,...,...,...
995.0,wheel,1,1,1
996.0,wheel,1,1,1
997.0,wheel,1,1,1
998.0,wheel,1,1,1


In [20]:
# Subsample sessions per experiment.
sample_size = 6         # sessions drawn per age (or per age bin)
n_age_bins = 13         # quantile bins used when an experiment has many ages
max_discrete_ages = 15  # above this many unique ages, bin ages instead
sample_seed = 0         # fixed seed so a re-run selects the same sessions

sampled_frames = []
for experiment, _df in df.groupby('experiment', sort=False):
    unique_ages = _df['age'].nunique()
    if unique_ages > max_discrete_ages:
        # Many distinct ages: bin ages into quantiles and sample within each bin.
        age_cut = pd.qcut(_df['age'], n_age_bins, labels=False)
        _samples = _df.groupby(age_cut).sample(n=sample_size, random_state=sample_seed)
    elif np.all(_df['age'].isna()):
        # No age information: take evenly spaced rows. The step is clamped to
        # at least 1 (a step of 0 raises ValueError when the group has fewer
        # than `sample_size` rows) and the result is capped at `sample_size`.
        step = max(1, len(_df) // sample_size)
        _samples = _df.iloc[::step].iloc[:sample_size]
    else:
        # Few distinct ages: sample within each age directly.
        _samples = _df.groupby('age').sample(n=sample_size, random_state=sample_seed)
    print(experiment, len(_samples))
    sampled_frames.append(_samples)
samples = pd.concat(sampled_frames)

dana_ontogeny_females 60
dana_ontogeny_males 60
klothos 6
ontogeny_females 78
ontogeny_males 78
longtogeny_males 78
longtogeny_v2_females 78
longtogeny_v2_males 78
wheel 78
dlight 6


In [21]:
# Display the selected sessions.
samples

Unnamed: 0,age,experiment,session,path,SessionName
99,3.0,dana_ontogeny_females,session_20221007115446,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_3wks
89,3.0,dana_ontogeny_females,session_20221007094130,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_3w
91,3.0,dana_ontogeny_females,session_20221007100636,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_3wks
90,3.0,dana_ontogeny_females,session_20221007094148,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_3wks
107,3.0,dana_ontogeny_females,session_20221007131737,/n/groups/datta/Dana/Ontogeny/raw_data/Dana_on...,fo_3wks
...,...,...,...,...,...
7073,,dlight,session_20230809152555-446501 (datta-realtime1),/n/groups/datta/win/longtogeny/dlight/session_...,dlight-aging
7102,,dlight,session_20230811174036-538526 (datta-realtime1),/n/groups/datta/win/longtogeny/dlight/session_...,dlight-aging
7131,,dlight,session_20230819170421-738640 (datta-realtime1),/n/groups/datta/win/longtogeny/dlight/session_...,dlight-aging
7160,,dlight,session_20230824172208-658594 (datta-realtime1),/n/groups/datta/win/longtogeny/dlight/session_...,dlight-aging


In [22]:
# Total number of selected sessions across all experiments.
len(samples)

600

In [23]:
# NOTE(review): presumably an estimate of the total frame count, i.e.
# sessions x 30 frames/s x 60 s/min x 20 min per session — confirm.
len(samples) * 30 * 60 * 20

21600000

In [24]:
# Symlink each selected results file and its matching .yaml into agg_folder,
# naming both links after the session folder (path.parents[1]).
# The two links are created independently, so a .h5 link that already exists
# no longer prevents a missing .yaml link from being created.
# NOTE(review): links left over from earlier runs are not removed here, and
# two files from session folders with the same name map to the same link name.
for path in map(Path, samples['path']):
    session_name = path.parents[1].name
    link_targets = {
        agg_folder / (session_name + '.h5'): path,
        agg_folder / (session_name + '.yaml'): path.with_suffix('.yaml'),
    }
    already_present = False
    for new_path, target in link_targets.items():
        try:
            new_path.symlink_to(target)
        except FileExistsError:
            already_present = True
    if already_present:
        print(path, 'exists in training set')

/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/session_20230520125314/proc/results_00.h5 exists in training set
