# Gather files for PCA

In [1]:
import h5py
from pathlib import Path
from aging.organization.paths import FOLDERS
from toolz import groupby, concat, frequencies

In [2]:
# ARHMM model version; selects the versioned data directory used below.
version = 7
# HDF5 key tested for in each results file; note it is tagged one version
# behind `version`.
size_key = f'win_size_norm_frames_v{version - 1}'
# Destination directory that will hold the per-session symlinks.
pca_path = Path(
    f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}/all_data_pca'
)

In [3]:
# Export notebook settings as environment variables so the %%bash cell further
# down (which reads $VERSION and ${FRAMES_KEY}) sees the same values.
# `$name` after `=` is expanded from the Python namespace by the %env magic.
# Do not add trailing comments to the %env lines: they would become part of the value.
%env FRAMES_KEY=$size_key
%env VERSION=$version

env: FRAMES_KEY=win_size_norm_frames_v6
env: VERSION=7


In [4]:
# Display the root folders (imported from aging.organization.paths) that are
# searched recursively for `results_00.h5` files below.
FOLDERS

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/min/wheel_062023'),
 PosixPath('/n/groups/datta/win/longtogeny/dlight'))

In [5]:
def is_size_normalized(file):
    """Report whether `file` is an HDF5 file that contains the `size_key` entry.

    Parameters
    ----------
    file : path-like
        Path handed to `h5py.File` and opened read-only.

    Returns
    -------
    bool
        True when the file opens and `size_key` is present in it; False when
        the key is absent or when opening/reading/closing raises `OSError`
        (which also covers `BlockingIOError`, a subclass of `OSError`).
    """
    try:
        with h5py.File(file, 'r') as handle:
            found = size_key in handle
    except OSError:
        return False
    return found

In [6]:
# Recursively collect every `results_00.h5` under the FOLDERS roots, keep only
# those that pass `is_size_normalized`, and sort the resulting paths.
files = sorted(
    path
    for folder in FOLDERS
    for path in folder.glob("**/results_00.h5")
    if is_size_normalized(path)
)
# Number of kept files per grandparent directory (`parents[1]`); a count above
# one means several results files were found under the same directory.
parent_counts = frequencies(str(path.parents[1]) for path in files)

In [7]:
def not_double(path):
    """Decide whether `path` should be kept when de-duplicating results files.

    A file whose grandparent directory (`path.parents[1]`) holds only one
    collected results file is always kept. When that directory holds more than
    one, only the file whose immediate parent folder is named 'proc_cleaned'
    is kept.

    Parameters
    ----------
    path : pathlib.Path
        A results file that was counted in `parent_counts`.

    Returns
    -------
    bool
        True to keep the file, False to drop it.
    """
    is_only_copy = not parent_counts[str(path.parents[1])] > 1
    return is_only_copy or path.parent.name == 'proc_cleaned'

In [8]:
def _source_folder(file):
    """Return the first FOLDERS entry whose path string occurs inside `file`'s path."""
    matches = [root for root in FOLDERS if str(root) in str(file)]
    return matches[0]


# Bucket the de-duplicated files by the root folder they were found under.
grouped_files = groupby(_source_folder, filter(not_double, files))

In [9]:
# Symlink every selected results file (and its sibling .yaml) into a flat,
# per-root-folder directory under `pca_path`.
#
# `duplicate_files` collects every file for which at least one of its two link
# names already existed in the output folder.
duplicate_files = []
for folder, _files in grouped_files.items():
    # Several longtogeny roots end in the same leaf name (e.g. "Males"), so the
    # parent directory name is prepended to keep their output folders distinct.
    if 'longtogeny' in str(folder):
        out_folder = pca_path / (folder.parent.name + '_' + folder.name)
    else:
        out_folder = pca_path / folder.name
    out_folder.mkdir(exist_ok=True, parents=True)

    for file in _files:
        # Links are named after the file's grandparent directory.
        link_stem = file.parents[1].name
        h5_link = out_folder / (link_stem + '.h5')
        yaml_link = out_folder / (link_stem + '.yaml')
        name_taken = False

        try:
            h5_link.symlink_to(file)
        except FileExistsError:
            name_taken = True
            if h5_link.resolve() != file.resolve():
                # The name belongs to a different file: leave both existing
                # links untouched so an .h5 is never paired with another
                # file's .yaml.
                duplicate_files.append(file)
                continue
            # Otherwise the .h5 link already points at this file (e.g. from an
            # earlier, possibly interrupted, run); fall through so a missing
            # .yaml link still gets created instead of being skipped forever.

        # Handled separately from the .h5 link so that one pre-existing link
        # does not prevent the other from being created.
        try:
            yaml_link.symlink_to(file.with_suffix('.yaml'))
        except FileExistsError:
            name_taken = True

        if name_taken:
            duplicate_files.append(file)

In [10]:
# Number of files for which a link name was already taken in the output folder
# (i.e. creating the symlink raised FileExistsError).
len(duplicate_files)

5213

## Apply PCA

In [11]:
%%bash

# Apply the PCs stored in _pca/pca.h5 to the symlinked sessions in all_data_pca
# using moseq2-pca on a slurm cluster. VERSION and FRAMES_KEY come from the
# environment; VERSION is zero-padded to two digits to match the version_NN
# directory naming.
printf -v VER "%02d" ${VERSION}
source ${HOME}/.bashrc
conda activate moseq2-app
moseq2-pca apply-pca \
    --pca-file /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/_pca/pca.h5 \
    -i /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/all_data_pca \
    -o /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/all_data_pca \
    --h5-path $FRAMES_KEY \
    --cluster-type slurm -q short -n 25 \
    -w 00:45:00 -m 13GB --timeout 8 \
    --dask-cache-path /n/scratch3/users/w/wg41/tmp \
    --batch-apply --overwrite-pca-apply 1

Hello from o2
Loading PCs from /n/groups/datta/win/longtogeny/data/ontogeny/version_07/_pca/pca.h5
Access dask dashboard at http://localhost:8787


Intializing workers: 100%|██████████| 25/25 [01:45<00:00,  4.21s/it]
100%|██████████| 12/12 [03:52<00:00, 19.35s/it]
