# Organize data for running PCA

Two main approaches:
1. Select a subset of all ontogeny sessions: a few sessions from each age and sex.
2. Select a subset of all longtogeny sessions: a few sessions from each age and sex. Make sure each mouse contributes the same number of sessions at similar ages.

In [34]:
import json
import h5py
import pandas as pd
from pathlib import Path
from toolz import concat, curry, frequencies
from aging.organization.paths import FOLDERS
from aging.organization.dataframes import get_experiment, get_age

In [2]:
FOLDERS

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/min/wheel_062023'),
 PosixPath('/n/groups/datta/win/longtogeny/dlight'),
 PosixPath('/n/groups/datta/win/longtogeny/data/jackson-labs/datta_i'),
 PosixPath('/n/groups/datta/min/longtogeny_052023/Males'))

In [3]:
# version number embedded in the training-data folder name (formatted as version_XX)
arhmm_version = 10
# HDF5 key that a results file must contain to be treated as size-normalized
size_norm_key = 'win_size_norm_frames_v6'

In [4]:
# quality-control spreadsheet (CSV export); the first row holds the column names
session_exclusion_list = '/n/groups/datta/win/longtogeny/data/metadata/Ontogeny experiment list - Session quality control.csv'
exclusion_df = pd.read_csv(session_exclusion_list, header=0)

In [5]:
# lowercase the "Keep?" flags so mixed-case entries compare equal,
# then display the distinct values as a sanity check
exclusion_df['Keep?'] = exclusion_df['Keep?'].str.lower()
exclusion_df['Keep?'].unique()

array(['y', 'n', '?'], dtype=object)

In [6]:
# every row not explicitly marked "y" (e.g. "n" or "?") is treated as excluded
exclusions = exclusion_df.query('`Keep?` != "y"')

In [7]:
def is_size_normalized(file):
    """Report whether an HDF5 results file contains the size-normalized frames key.

    Opens `file` read-only with h5py and checks for the module-level
    `size_norm_key`. If the file cannot be opened or read (OSError), the
    path is printed and the file is treated as not size-normalized.
    """
    try:
        with h5py.File(file, 'r') as handle:
            found = size_norm_key in handle
    except OSError:
        # unreadable file: report which one and treat it as not normalized
        print(file)
        found = False
    return found


def not_excluded(file):
    """Return True when `file` is absent from the 'File path' column of `exclusions`."""
    excluded_paths = exclusions['File path'].values
    return str(file) not in excluded_paths


@curry
def thread_and(funcs, x):
    """Return True only if every predicate in `funcs` is truthy for `x`.

    Predicates are evaluated in order and evaluation stops at the first
    falsy result. Curried, so `thread_and(funcs)` yields a one-argument
    predicate suitable for `filter`.
    """
    for predicate in funcs:
        if not predicate(x):
            return False
    return True


@curry
def not_double(parent_counts, path):
    """Decide whether `path` should be kept when a session has several results files.

    Paths containing "jackson" are always kept. For any other path, if its
    grandparent folder appears more than once in `parent_counts`, the path
    is kept only when its immediate parent folder is named 'proc_cleaned'.
    Curried, so `not_double(parent_counts)` yields a one-argument predicate.
    """
    if "jackson" in str(path):
        return True
    occurrences = parent_counts[str(path.parents[1])]
    if not occurrences > 1:
        return True
    return path.parent.name == 'proc_cleaned'

## Organize ontogeny sessions

These are the data that will be run through PCA and the modeling steps

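- create a dataframe with the file path, age, experiment, session, and session name
- bin ages within each experiment, then subsample the same number of sessions from each (age bin, experiment) group
- symlink the sampled `results_00.h5` files (and their matching `.yaml` files) into a new folder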

In [8]:
# destination folder for the symlinked training data, versioned by arhmm_version;
# created along with any missing parents, and left alone if it already exists
agg_folder = Path(f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{arhmm_version:02d}/training_data')
agg_folder.mkdir(exist_ok=True, parents=True)

In [9]:
# keep the first five and the last two FOLDERS entries; with the nine-entry
# FOLDERS listed earlier this leaves out the wheel_062023 and dlight folders
folder_list = FOLDERS[:5] + FOLDERS[-2:]
folder_list

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/win/longtogeny/data/jackson-labs/datta_i'),
 PosixPath('/n/groups/datta/min/longtogeny_052023/Males'))

In [10]:
# collect every results file under the selected folders, keeping only those
# that carry the size-normalized frames key and are not on the exclusion list
files = sorted(
    results_file
    for folder in folder_list
    for results_file in folder.glob("**/*results_00.h5")
    if is_size_normalized(results_file) and not_excluded(results_file)
)

/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males/12months_28042021/session_20210428102836/proc_cleaned/results_00.h5


In [11]:
len(files)

4674

In [12]:
# count how many kept results files share the same grandparent folder
# (the folder two levels above each results file)
parent_counts = frequencies(str(f.parents[1]) for f in files)

In [13]:
# where a non-jackson grandparent folder holds more than one results file,
# keep only the one stored under a proc_cleaned folder
files = list(filter(not_double(parent_counts), files))

In [14]:
len(files)

4625

In [21]:
# Build one record per results file (age, experiment, session folder name,
# file path, and SessionName from metadata.json when present), then assemble
# the records into a dataframe. Files that raise KeyError or FileNotFoundError
# while the record is built are reported and skipped.
df = []
for f in files:
    try:
        md_file = f.parents[1] / "metadata.json"
        data = dict(
            age=get_age(f),
            experiment=get_experiment(f),
            session=f.parents[1].name,
            path=str(f),
            SessionName='',
        )
        if md_file.exists():
            # read_text() opens and closes the file itself, so no handle is
            # left dangling across the thousands of sessions visited here
            md = json.loads(md_file.read_text())
            data["SessionName"] = md["SessionName"]
        df.append(data)
    except KeyError:
        print("KeyError: ", f)
    except FileNotFoundError:
        print("File not found", f)
df = pd.DataFrame(df)

KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721072116/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721072215/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721072329/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721082639/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721082719/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721082802/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721091042/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721091057/proc/results_00.h5
KeyError:  /n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females/session_20230721091106/proc/results_00.h5
KeyError:  /n/group

In [22]:
len(df)

4609

In [23]:
# drop sessions whose SessionName contains "CRL"; take an explicit copy so
# that columns added to df afterwards are set on an independent frame rather
# than on a filtered selection (avoids pandas' SettingWithCopyWarning)
df = df[~df['SessionName'].str.contains("CRL")].copy()

In [24]:
df.groupby(['age', 'experiment']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,session,path,SessionName
age,experiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3.000000,longtogeny_males,16,16,16
3.000000,ontogeny_females,23,23,23
3.000000,ontogeny_males,15,15,15
3.142857,longtogeny_males,12,12,12
3.285714,longtogeny_males,12,12,12
...,...,...,...,...
112.000000,jax_longtogeny,11,11,11
114.571429,longtogeny_males,13,13,13
116.285714,longtogeny_males,13,13,13
118.714286,longtogeny_males,12,12,12


In [25]:
# bin ages to make sampling easier: within each experiment, split the ages
# into 15 quantile bins and store the integer bin index (labels=False)
df['age_cuts'] = df.groupby("experiment")["age"].transform(lambda v: pd.qcut(v, 15, labels=False))

In [30]:
# sub-select 6 sessions from each (age bin, experiment) group; the fixed
# random_state makes the draw reproducible
# NOTE(review): an earlier note here read "4 sessions from each group" and
# "64 sessions = 3.5e6 frames"; neither matches n=6 — confirm the intended size
sample = df.groupby(['age_cuts', 'experiment']).sample(n=6, random_state=0)

In [31]:
len(sample)

540

In [32]:
# rough total frame count for the sample — presumably
# 30 frames/s * 60 s/min * 20 min per session; TODO confirm
len(sample) * 30 * 60 * 20

19440000

In [36]:
# Symlink each sampled results file, and its sibling .yaml, into agg_folder.
# jax_longtogeny files keep their own file name; every other experiment is
# linked under the name of the folder two levels above the results file.
for session in sample.itertuples(index=False):
    source = Path(session.path)
    if session.experiment == "jax_longtogeny":
        link_name = source.name
    else:
        link_name = source.parents[1].name + ".h5"
    h5_link = agg_folder / link_name
    yaml_link = h5_link.with_suffix(".yaml")
    try:
        h5_link.symlink_to(source)
        yaml_link.symlink_to(source.with_suffix(".yaml"))
    except FileExistsError:
        # a link with this name is already present; report it and move on
        print(source, "exists in training set")