# Organize data for running PCA

Two main approaches:
1. select a subset of all ontogeny sessions, a few sessions from each age and sex
2. select a subset of all longtogeny sessions, a few sessions from each age and sex. Make sure to have the same number of sessions and similar ages per mouse.

In [1]:
import json
import h5py
import pandas as pd
from pathlib import Path
from toolz import concat, curry, frequencies
from aging.organization.paths import FOLDERS
from aging.organization.dataframes import get_experiment, get_age

In [2]:
# display the configured data folders (a leading slice of these is globbed for result files further down)
FOLDERS

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/min/wheel_062023'),
 PosixPath('/n/groups/datta/win/longtogeny/dlight'),
 PosixPath('/n/groups/datta/win/longtogeny/data/jackson-labs/datta_i'),
 PosixPath('/n/groups/datta/min/longtogeny_052023/Males'))

In [3]:
# version number; formatted into the `version_XX` training-data folder path below
arhmm_version = 11
# HDF5 key whose presence marks a results file as size-normalized (checked by is_size_normalized)
size_norm_key = 'win_size_norm_frames_v6'

In [4]:
# CSV with per-session quality-control decisions; the first row holds the column names
session_exclusion_list = '/n/groups/datta/win/longtogeny/data/metadata/Ontogeny experiment list - Session quality control.csv'
exclusion_df = pd.read_csv(session_exclusion_list, header=0)

In [5]:
# lowercase the "Keep?" flags so that e.g. "Y" and "y" compare equal, then list the distinct values
exclusion_df['Keep?'] = exclusion_df['Keep?'].str.lower()
exclusion_df['Keep?'].unique()

array(['y', 'n', '?'], dtype=object)

In [6]:
# every session whose "Keep?" flag is anything other than "y" is excluded
exclusions = exclusion_df[exclusion_df['Keep?'] != "y"]

In [7]:
def is_size_normalized(file):
    """Report whether the HDF5 file at `file` contains the size-normalized frames.

    The file is opened read-only and checked for the module-level
    `size_norm_key`. If an OSError is raised while opening or reading it,
    the path is printed and the file is treated as not normalized.

    Returns:
        bool: True when `size_norm_key` is present in the file, else False.
    """
    try:
        with h5py.File(file, 'r') as handle:
            has_key = size_norm_key in handle
    except OSError:
        # unreadable file: report which one and treat it as not normalized
        print(file)
        return False
    else:
        return has_key


def not_excluded(file):
    """Return True when `file` is absent from the exclusion table.

    The path is compared as a string against the 'File path' column of the
    module-level `exclusions` dataframe.
    """
    excluded_paths = exclusions['File path'].values
    return not (str(file) in excluded_paths)


@curry
def thread_and(funcs, x):
    """Apply each predicate in `funcs` to `x` and AND the results.

    Evaluation stops at the first predicate that returns a falsy value.
    Curried, so `thread_and(funcs)` yields a single-argument predicate.
    """
    for predicate in funcs:
        if not predicate(x):
            return False
    return True


@curry
def not_double(parent_counts, path):
    """Resolve sessions that have more than one results file.

    `parent_counts` maps the string form of `path.parents[1]` to the number
    of result files found under it. When that count is at most one the file
    is kept. Otherwise only the copy whose immediate parent folder is named
    'proc_cleaned' is kept.
    """
    session_dir = str(path.parents[1])
    if parent_counts[session_dir] <= 1:
        return True
    return path.parent.name == 'proc_cleaned'

## Organize ontogeny sessions

These are the data that will be run through PCA and the modeling steps

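- create a dataframe with the file path, age, experiment, and session name of every usable session
- bin ages within each experiment, then group by (age bin, experiment) and subsample the same number of sessions from each group
- symlink the `results_00.h5` files (and their matching `.yaml` files) to a new folder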

In [8]:
# versioned destination folder for the symlinked training sessions; created (with parents) if missing
agg_folder = Path(f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{arhmm_version:02d}/training_data')
agg_folder.mkdir(exist_ok=True, parents=True)

In [23]:
# the first six folders are the ones searched for result files in the next cell
FOLDERS[:6]

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/min/wheel_062023'))

In [24]:
# Collect every results_00.h5 under the first six data folders, keeping only
# files that are size-normalized and not on the exclusion list.
# The size-normalization check runs first; the exclusion check only runs if it passes.
files = sorted(
    candidate
    for folder in FOLDERS[:6]
    for candidate in folder.glob("**/results_00.h5")
    if is_size_normalized(candidate) and not_excluded(candidate)
)

/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males/12months_28042021/session_20210428102836/proc_cleaned/results_00.h5


In [25]:
# count result files per grandparent directory (`f.parents[1]`); a count above one
# means the same session has more than one results file
parent_counts = frequencies(str(f.parents[1]) for f in files)

In [26]:
# for sessions with several result files, keep only the 'proc_cleaned' copy
files = [f for f in files if not_double(parent_counts, f)]

In [27]:
# Build one record per results file from its path and the session's metadata.json.
# Files whose metadata is missing are skipped silently; files that raise a
# KeyError while the record is built are reported and skipped.
records = []
for f in files:
    try:
        # open via a context manager so the metadata file handle is closed
        # immediately instead of leaking one handle per session
        with (f.parents[1] / "metadata.json").open() as md_file:
            md = json.load(md_file)
        data = dict(
            age=get_age(f),
            experiment=get_experiment(f),
            session=f.parents[1].name,
            # sex=f.parents[3].name.split("_")[-1][:-1],
            path=str(f),
            SessionName=md["SessionName"],
        )
        records.append(data)
    except KeyError:
        print("KeyError: ", f)
    except FileNotFoundError:
        continue
df = pd.DataFrame(records)

KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721072116/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721072215/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721072329/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721082639/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721082719/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721082802/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721091042/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721091057/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721091106/proc/results_00.h5
KeyError:  /n/group

In [28]:
# number of sessions that made it into the table
len(df)

5298

In [29]:
# distinct experiment labels present in the table
df['experiment'].unique()

array(['ontogeny_females', 'ontogeny_males', 'longtogeny_males',
       'longtogeny_v2_females', 'longtogeny_v2_males', 'wheel'],
      dtype=object)

In [32]:
# overwrite the age of every "wheel" session with a placeholder 0..n-1 (one distinct value per session)
# NOTE(review): presumably wheel sessions have no usable age and this only exists so the
# per-experiment age binning further down works — confirm
mask = df['experiment'] == "wheel"
fake_ages = list(range(mask.sum()))
df.loc[mask, "age"] = fake_ages

In [34]:
# preview the first rows of the session table
df.head()

Unnamed: 0,age,experiment,session,path,SessionName
0,127.0,ontogeny_females,session_20231030082607,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,otgy_127F_01
1,127.0,ontogeny_females,session_20231030102302,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,otgy_127F_02
2,127.0,ontogeny_females,session_20231030102324,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,otgy_127F_02
3,127.0,ontogeny_females,session_20231030102342,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,otgy_127F_02
4,127.0,ontogeny_females,session_20231030112543,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,otgy_127F_01


In [35]:
# drop sessions whose SessionName contains "CRL"; take an explicit copy so that
# adding columns to `df` later does not write into a slice of the original
# frame (avoids pandas' SettingWithCopyWarning)
df = df[~df['SessionName'].str.contains("CRL")].copy()

In [36]:
# number of sessions available for each (age, experiment) combination
df.groupby(['age', 'experiment']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,session,path,SessionName
age,experiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,wheel,1,1,1
1.0,wheel,1,1,1
2.0,wheel,1,1,1
3.0,longtogeny_males,16,16,16
3.0,ontogeny_females,23,23,23
...,...,...,...,...
964.0,wheel,1,1,1
965.0,wheel,1,1,1
966.0,wheel,1,1,1
967.0,wheel,1,1,1


In [37]:
# bin ages to make sampling easier: within each experiment, split the ages into
# 15 quantile bins and store the integer bin index in `age_cuts`
df['age_cuts'] = df.groupby("experiment")["age"].transform(lambda v: pd.qcut(v, 15, labels=False))

In [40]:
# sub-select 7 sessions from each (age bin, experiment) group, with a fixed seed for reproducibility
# 15 bins x 6 experiments x 7 sessions = 630 sessions
sample = df.groupby(['age_cuts', 'experiment']).sample(n=7, random_state=0)

In [41]:
# total number of sampled sessions
len(sample)

630

In [42]:
# rough estimate of the total number of frames in the sample
# NOTE(review): the factors 30, 60 and 20 look like frame rate, seconds per minute and
# session length in minutes — confirm which is which
len(sample) * 30 * 60 * 20

22680000

In [43]:
# Symlink each sampled results file and its sibling .yaml into the training
# folder, naming both links after the session directory (`path.parents[1]`).
# Each link is attempted independently, so a session whose .h5 link already
# exists still gets its .yaml link created if that one is missing.
for path in map(Path, sample['path']):
    session_name = path.parents[1].name
    already_linked = False
    for source in (path, path.with_suffix('.yaml')):
        link = agg_folder / (session_name + source.suffix)
        try:
            link.symlink_to(source)
        except FileExistsError:
            already_linked = True
    if already_linked:
        # report once per session, as before
        print(path, 'exists in training set')

/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210331/session_20210331163355/proc/results_00.h5 exists in training set
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210331/session_20210331163403/proc/results_00.h5 exists in training set
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210402/session_20210402170649/proc/results_00.h5 exists in training set
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210331/session_20210331153528/proc/results_00.h5 exists in training set
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210401/session_20210401144828/proc/results_00.h5 exists in training set
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210403/session_20210403143928/proc/results_00.h5 exists in training set
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210330/session_20210330153328/proc/results_00.h5 exists in training set
/n/groups/datta/min/longtog