# Organize data for running PCA

Two main approaches:
1. select a subset of all ontogeny sessions, a few sessions from each age and sex
2. select a subset of all longtogeny sessions, a few sessions from each age and sex. Make sure to have the same number of sessions and similar ages per mouse.

In [1]:
import h5py
import datetime
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
from toolz import concat, curry

In [2]:
session_exclusion_list = '/n/groups/datta/win/longtogeny/data/metadata/Ontogeny experiment list - Session quality control.csv'
exclusion_df = pd.read_csv(session_exclusion_list, header=0)

In [3]:
exclusion_df['Keep?'] = exclusion_df['Keep?'].str.lower()

In [4]:
exclusion_df['Keep?'].unique()

array(['y', 'n', '?'], dtype=object)

In [5]:
exclusions = exclusion_df.query('`Keep?` != "y"')

In [6]:
exclusions.head()

Unnamed: 0,File path,Checked?,Keep?,Issue
3,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,"fading, noise, further inspection needed"
4,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,noise
5,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,wall noise
6,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,replaced with noise
7,/n/groups/datta/Dana/Ontogeny/raw_data/Ontogen...,y,n,wall noise


In [7]:
def is_size_normalized(file):
    with h5py.File(file, 'r') as h5f:
        return 'win_size_norm_frames' in h5f


def not_excluded(file):
    return file not in exclusions['File path'].values

## Organize ontogeny sessions

These are the data that will be run through PCA and the modeling steps

- create a dataframe with the file path, age, sex, and session name
- do a groupby and subsample the same number of sessions for each (age, sex) group
- symlink results.h5 files to a new folder

In [8]:
agg_folder = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_03/training_data')
agg_folder.mkdir(exist_ok=True, parents=True)

In [9]:
folders = [
    '/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females',
    '/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males',
]

folders = [Path(f) for f in folders]

In [10]:
@curry
def thread_and(funcs, x):
    return all(f(x) for f in funcs)

In [11]:
# filter files for the ones with size_normalized_frames
files = sorted(
    filter(
        thread_and((is_size_normalized, not_excluded)),
        concat(f.glob("**/results_00.h5") for f in folders),
    )
)

In [12]:
df = []
for f in files:
    data = dict(
        age=f.parents[2].name,
        session=f.parents[1].name,
        sex=f.parents[3].name.split('_')[-1][:-1],
        path=str(f)
    )
    df.append(data)
df = pd.DataFrame(df)

In [13]:
df.groupby(['age', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,session,path
age,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
12months,female,24,24
12months_28042021,male,26,26
18months,female,24,24
18months_29042021,male,27,27
22months_06052023,male,32,32
3months,female,24,24
3months_19042021,male,27,27
3wks,female,24,24
3wks_02112021,male,16,16
5wks,female,24,24


In [14]:
# sub-select 3 sessions from each group
# 51 sessions = 1.8e6 frames
sample = df.groupby(['age', 'sex']).sample(n=3, random_state=0)

In [15]:
for path in map(Path, sample['path']):
    try:
        new_path = agg_folder / (path.parents[1].name + '.h5')
        new_path.symlink_to(path)
        new_path = agg_folder / (path.parents[1].name + '.yaml')
        new_path.symlink_to(path.with_suffix('.yaml'))
    except FileExistsError:
        print(path, 'exists in training set')

## Organize longtogeny sessions

In [26]:
agg_folder = Path('/n/groups/datta/win/longtogeny/data/longtogeny/version_03/training_data')
agg_folder.mkdir(exist_ok=True, parents=True)

In [27]:
folders = [
    '/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Females',
    '/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males',
]
folders = [Path(f) for f in folders]

In [28]:
# filter files for the ones with size_normalized_frames
files = sorted(
    filter(
        thread_and((is_size_normalized, not_excluded)),
        concat(f.glob("**/results_00.h5") for f in folders),
    )
)

In [29]:
len(files)

1651

In [30]:
with h5py.File(files[0], 'r') as h5f:
    print(h5f['metadata/acquisition/SessionName'][()].decode())
    print(h5f['metadata/acquisition/SubjectName'][()].decode())

002
01_01_001


In [31]:
df = []
for f in tqdm(files):
    try:
        with h5py.File(f, 'r') as h5f:
            session = h5f['metadata/acquisition/SubjectName'][()].decode()
            start = h5f['metadata/acquisition/StartTime'][()].decode()
        sex = f.parents[3].name.lower()[:-1]
        sex = 'female' if 'Females' in str(f) else 'male'
        data = dict(
            mouse=session[:5],
            subject_name=session,
            session=f.parents[1].name,
            sex=sex,
            date=datetime.datetime.strptime(start.split('.')[0], '%Y-%m-%dT%H:%M:%S'),
            path=str(f)
        )
        df.append(data)
    except KeyError:
        print(f, 'incomplete')
df = pd.DataFrame(df)

  0%|          | 0/1651 [00:00<?, ?it/s]

/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210403/session_20210403155244/proc/results_00.h5 incomplete
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210406/session_20210406131627/proc/results_00.h5 incomplete
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210410/session_20210410184436/proc/results_00.h5 incomplete
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210410/session_20210410205337/proc/results_00.h5 incomplete
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210423/session_20210423193537/proc/results_00.h5 incomplete
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210514/session_20210514153543/proc/results_00.h5 incomplete
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210521/session_20210521154512/proc/results_00.h5 incomplete
/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males/20210526/session_20210526150505/proc/results_0

In [36]:
filt_df = df[~df['mouse'].str.lower().str.contains('c') & ~df['mouse'].str.lower().str.contains('long') & ~((df['mouse'] == "05_02") & (df['sex'] == "male"))]

In [37]:
filt_df.groupby(['mouse', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject_name,session,date,path
mouse,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01_01,female,28,28,28,28
01_01,male,66,66,66,66
01_02,female,29,29,29,29
01_02,male,64,64,64,64
01_03,female,28,28,28,28
01_03,male,68,68,68,68
01_04,female,28,28,28,28
01_04,male,64,64,64,64
02_01,female,32,32,32,32
02_01,male,66,66,66,66


In [38]:
sample = filt_df.groupby(['mouse', 'sex']).sample(n=2, random_state=0)

In [39]:
for path in map(Path, sample['path']):
    try:
        new_path = agg_folder / (path.parents[1].name + '.h5')
        new_path.symlink_to(path)
        new_path = agg_folder / (path.parents[1].name + '.yaml')
        new_path.symlink_to(path.with_suffix('.yaml'))
    except FileExistsError:
        print(path, 'exists in training set')