# Gather files for PCA

In [1]:
import h5py
from pathlib import Path
from tqdm.auto import tqdm
from aging.organization.paths import FOLDERS

In [2]:
version = 6
size_key = 'win_size_norm_frames_v5'
pca_path = Path(f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}/all_data_pca')

In [3]:
# set environment variables for bash scripts below
%env FRAMES_KEY=$size_key
%env VERSION=$version

env: FRAMES_KEY=win_size_norm_frames_v5
env: VERSION=6


In [4]:
FOLDERS

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023'),
 PosixPath('/n/groups/datta/min/wheel_062023'),
 PosixPath('/n/groups/datta/win/longtogeny/dlight'))

In [5]:
def is_size_normalized(file):
    try:
        with h5py.File(file, 'r') as h5f:
            return size_key in h5f
    except (BlockingIOError, OSError):
        return False

In [6]:
duplicate_files = []
for folder in tqdm(FOLDERS):
    if 'longtogeny' in str(folder):
        out_folder = pca_path / (folder.parent.name + '_' + folder.name)
    else:
        out_folder = pca_path / folder.name
    out_folder.mkdir(exist_ok=True, parents=True)

    for file in filter(is_size_normalized, folder.glob('**/results_00.h5')):
        try:
            new_path = out_folder / (file.parents[1].name + '.h5')
            new_path.symlink_to(file)
            new_path = out_folder / (file.parents[1].name + '.yaml')
            new_path.symlink_to(file.with_suffix('.yaml'))
        except FileExistsError:
            duplicate_files.append(file)

  0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
len(duplicate_files)

3152

## Apply PCA

In [8]:
%%bash

# run this one
printf -v VER "%02d" $VERSION
source $HOME/.bashrc
conda activate moseq2-app
moseq2-pca apply-pca \
    -i /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/all_data_pca \
    -o /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/all_data_pca \
    --h5-path ${FRAMES_KEY} -n 50 --cluster-type slurm \
    --pca-file /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/_pca/pca.h5 \
    --timeout 8 -w 00:45:00 -m 13GB -q short \
    --dask-cache-path /n/scratch3/users/w/wg41/tmp \
    --batch-apply --overwrite-pca-apply 1

Hello from o2
Loading PCs from /n/groups/datta/win/longtogeny/data/ontogeny/version_06/_pca/pca.h5
Access dask dashboard at http://localhost:8787


Intializing workers: 100%|██████████| 50/50 [00:37<00:00,  1.35it/s]




100%|██████████| 5/5 [01:31<00:00, 18.23s/it]
