# Gather files for PCA

In [1]:
import h5py
from pathlib import Path
from aging.organization.paths import FOLDERS
from toolz import groupby, concat, frequencies, curry

In [2]:
version = 8  # arhmm version
size_key = 'win_size_norm_frames_v6'
pca_path = Path(f'/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}/all_data_pca')

In [3]:
# set environment variables for bash scripts below
%env FRAMES_KEY=$size_key
%env VERSION=$version

env: FRAMES_KEY=win_size_norm_frames_v6
env: VERSION=8


In [4]:
FOLDERS

(PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/min/wheel_062023'),
 PosixPath('/n/groups/datta/win/longtogeny/dlight'),
 PosixPath('/n/groups/datta/min/longtogeny_052023/Males'),
 PosixPath('/n/groups/datta/win/longtogeny/data/jackson-labs/datta_i'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Klothos'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Epigclock'))

In [5]:
def is_size_normalized(file):
    try:
        with h5py.File(file, 'r') as h5f:
            return size_key in h5f
    except (BlockingIOError, OSError):
        return False

In [6]:
files = sorted(
    filter(is_size_normalized, concat(f.glob("**/*results_00.h5") for f in FOLDERS))
)
parent_counts = frequencies(str(f.parents[1]) for f in files)

In [7]:
@curry
def not_double(parent_counts, path):
    if "jackson" not in str(path) and parent_counts[str(path.parents[1])] > 1:
        return 'proc_cleaned' == path.parent.name
    return True

In [8]:
grouped_files = groupby(
    lambda f: [x for x in FOLDERS if str(x) in str(f)][0], filter(not_double(parent_counts), files)
)

In [9]:
list(grouped_files)

[PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Dana_ontogeny/Males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Klothos'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_females'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/Ontogeny_males'),
 PosixPath('/n/groups/datta/Dana/Ontogeny/raw_data/longtogeny_pre_unet/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_052023/Males'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Females'),
 PosixPath('/n/groups/datta/min/longtogeny_072023/Males'),
 PosixPath('/n/groups/datta/min/wheel_062023'),
 PosixPath('/n/groups/datta/win/longtogeny/data/jackson-labs/datta_i'),
 PosixPath('/n/groups/datta/win/longtogeny/dlight')]

In [10]:
duplicate_files = []
for folder, _files in grouped_files.items():
    if 'jackson-labs' in str(folder):
        out_folder = pca_path / "jax_longtogeny"
    elif 'klothos' in str(folder).lower():
        out_folder = pca_path / "klothos"
    elif 'longtogeny' in str(folder) or "Dana_ontogeny" in str(folder):
        out_folder = pca_path / (folder.parent.name.lower() + '_' + folder.name)
    else:
        out_folder = pca_path / folder.name
    out_folder.mkdir(exist_ok=True, parents=True)

    for file in _files:
        try:
            if 'jackson-labs' in str(folder):
                new_path = out_folder / file.name
            else:
                new_path = out_folder / (file.parents[1].name + '.h5')
            new_path.symlink_to(file)
            new_path = new_path.with_suffix('.yaml')
            new_path.symlink_to(file.with_suffix('.yaml'))
        except FileExistsError:
            duplicate_files.append(file)

In [11]:
len(duplicate_files)

5873

## Apply PCA

In [None]:
%%bash

# run this one
printf -v VER "%02d" $VERSION
source $HOME/.bashrc
conda activate moseq2-app
moseq2-pca apply-pca \
    -i /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/all_data_pca \
    -o /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/all_data_pca \
    --h5-path ${FRAMES_KEY} -n 33 --cluster-type slurm \
    --pca-file /n/groups/datta/win/longtogeny/data/ontogeny/version_${VER}/_pca/pca.h5 \
    --timeout 8 -w 01:40:00 -m 13GB -q short \
    --dask-cache-path /n/scratch3/users/w/wg41/tmp \
    --batch-apply --overwrite-pca-apply 1

Hello from o2
Loading PCs from /n/groups/datta/win/longtogeny/data/ontogeny/version_08/_pca/pca.h5
Access dask dashboard at http://localhost:8787


Intializing workers: 100%|██████████| 33/33 [02:01<00:00,  3.68s/it]
100%|██████████| 20/20 [09:50<00:00, 29.54s/it]


In [None]:
!rm slurm-*.out