# Aug 24, 2025: collect + organize BOLD volumes

conda env: gt

In [1]:
# os.system(f'mount_aba')
# os.system(f'unmount_aba')

In [2]:
import os
import sys
import numpy as np 
import pandas as pd 
from glob import glob
from nilearn.maskers import NiftiLabelsMasker
import nibabel as nib
import re
from pathlib import Path
import pickle
import seaborn as sns
from tqdm import tqdm
from joblib import Parallel, delayed

In [3]:
class ARGS:
    """Empty attribute container; run-wide settings are attached to an instance."""


# Single shared settings object used throughout the notebook.
args = ARGS()

# Fix the seed of NumPy's global random generator.
args.SEED = 100
np.random.seed(args.SEED)

In [4]:
# Root of the project tree; the other locations below are derived from it.
PROJ_path = '/home/govindas/mounts/kaba/aba'
# Folder holding the first-level outputs (one entry per subject is globbed from here).
DATA_path = PROJ_path + '/dataset/first_level/voxelwise_13TR_4plays_offset_reducedRuns_trialwise'
# Folder holding the ROI atlas images.
ATLAS_path = PROJ_path + '/ROI_mask'

In [5]:
# Atlas used for ROI extraction (previously tried: 'ABA_ROIs_final_gm_36').
atlas_name = 'NEWMAX_ROIs_final_gm_104_2mm'
atlas_file = ATLAS_path + '/' + atlas_name + '.nii.gz'

# Load the label image once and build a single masker that is reused for every subject.
atlas = nib.load(atlas_file)
masker = NiftiLabelsMasker(
    labels_img=atlas,
    standardize=False,  # extracted signals are not standardized by the masker
)

In [6]:
def parse_metadata_from_path(path: str) -> dict:
    """
    Parse lags (TRs), number of play conditions, and flags from a folder path.

    Matching is case-insensitive. A number may be separated from its token
    (`TR`, `play`/`plays`) by whitespace, underscores, hyphens, dots, slashes
    or backslashes. Flags are detected after stripping those separators, so
    "reduced_runs", "reduced-runs" and "reducedRuns" are all recognised.

    Parameters
    ----------
    path : str
        Folder path (or name) to inspect.

    Returns
    -------
    dict
        "lags"         : int or None -- number preceding the `TR` token
        "n_plays"      : int or None -- number preceding `play`/`plays`
        "offset"       : bool
        "reduced_runs" : bool
        "trialwise"    : bool
    """
    lower = path.lower()

    # optional separators allowed between a number and its token
    sep = r'\s*[_\-./\\]*\s*'

    # `tr` must not be the beginning of "trial"/"trialwise"; without the
    # lookahead a name such as "..._2_trialwise" would report lags == 2.
    m_tr = re.search(rf'(\d+){sep}tr(?!ial)', lower)
    m_pl = re.search(rf'(\d+){sep}plays?', lower)

    # simplify for flag detection: strip common separators
    simple = re.sub(r'[/\\_.\-]', '', lower)  # e.g., "...voxelwise13tr4playsoffsetreducedrunstrialwise"

    return {
        "lags": int(m_tr.group(1)) if m_tr else None,
        "n_plays": int(m_pl.group(1)) if m_pl else None,
        "offset": "offset" in simple,
        # "reducedrun" is a prefix of "reducedruns", so one test covers both
        "reduced_runs": "reducedrun" in simple,
        "trialwise": "trialwise" in simple,
    }

In [7]:
def read_col_labels(labels_file):
    """
    Read a labels text file and return its labels as a flat list.

    Every line is split on '|'; each piece is whitespace-stripped and empty
    pieces are discarded. Order follows the file (line by line, left to right).
    """
    raw_text = Path(labels_file).read_text().strip()
    pieces = (
        chunk.strip()
        for row in raw_text.splitlines()
        for chunk in row.split('|')
    )
    return [piece for piece in pieces if piece]

def find_unique_labels(labels):
    """
    Tabulate the label prefixes that have `_Coef` entries.

    The prefix of a label is everything before its first '#'. Only labels
    ending in `_Coef` are counted; all other labels are ignored, so the
    "Tstat", "Fstat" and "OTHER" columns are present but always zero.

    Returns a DataFrame with columns ["prefix", "Coef", "Tstat", "Fstat",
    "OTHER"], sorted by prefix.
    """
    kinds = ["Coef", "Tstat", "Fstat", "OTHER"]

    coef_rows = [
        (name.split("#", 1)[0], "Coef")
        for name in labels
        if name.endswith("_Coef")
    ]
    table = pd.DataFrame(coef_rows, columns=["prefix", "kind"])
    counts = table.pivot_table(index="prefix", columns="kind", aggfunc="size", fill_value=0)

    # pad the kinds that never occurred so the column layout is always the same
    for kind in kinds:
        if kind not in counts.columns:
            counts[kind] = 0

    ordered = counts.reset_index()[["prefix"] + kinds]
    return ordered.sort_values("prefix")

def find_trial_windows(labels, cond, lags):
    """
    Group the `_Coef` label indices of one condition into per-trial windows.

    Labels matching ``<cond>#<digits>..._Coef`` are collected in order of
    appearance and cut into consecutive chunks of exactly `lags` indices.

    Parameters
    ----------
    labels : sequence of str
        All labels, in the order used to index the data.
    cond : str
        Condition prefix (the part before '#'); matched literally.
    lags : int
        Number of entries per trial; must be a positive integer.

    Returns
    -------
    list of np.ndarray
        One 1-D integer array of length `lags` per trial.

    Raises
    ------
    ValueError
        If `lags` is not a positive integer, if no label matches `cond`, or
        if the number of matches is not a multiple of `lags` (which would
        otherwise produce windows of the wrong length).
    """
    if not isinstance(lags, (int, np.integer)) or lags <= 0:
        raise ValueError(f"lags must be a positive integer, got {lags!r}")

    pat = re.compile(rf"^{re.escape(cond)}#\d+.*_Coef$")
    idxs = np.array([i for i, lab in enumerate(labels) if pat.match(lab)], dtype=int)

    if idxs.size == 0:
        raise ValueError(f"no '_Coef' labels found for condition {cond!r}")
    if idxs.size % lags != 0:
        raise ValueError(
            f"condition {cond!r} has {idxs.size} '_Coef' labels, "
            f"which is not a multiple of lags={lags}"
        )

    # equal split into n_trials chunks -> every chunk has exactly `lags` entries
    return np.split(idxs, idxs.size // lags)

In [8]:
def read_reml_file(reml_file):
    """
    Load an image file and return it as a `nib.Nifti1Image`.

    When the stored data are 5-D with a singleton 4th axis, that axis is
    squeezed out. The affine is reused and the header is copied from the
    loaded image.
    """
    src = nib.load(reml_file)
    vol = src.dataobj

    has_singleton_4th_axis = vol.ndim == 5 and vol.shape[3] == 1
    if has_singleton_4th_axis:
        vol = np.squeeze(vol, axis=3)

    return nib.Nifti1Image(vol, src.affine, src.header.copy())

def apply_roi_masks(img, masker: NiftiLabelsMasker):
    """Run `masker.fit_transform` on `img` and return its result unchanged."""
    return masker.fit_transform(img)

In [9]:
def get_trialwise_timeseries_per_sub(args, sub_folder, OUT_path, masker):
    """
    Extract trialwise ROI time series for one subject folder and pickle them.

    For every condition whose label prefix contains 'PLAY', the rows of the
    ROI-masked data belonging to that condition are cut into per-trial
    windows, stacked into one array (# trials x # time points x # rois) and
    written to ``{OUT_path}/{sub}/sub-{sub}_cond-{cond}_desc-trial-ts.pkl``.

    Parameters
    ----------
    args : object
        Settings object; `args.TRIAL_LENGTH` is set to the lag count parsed
        from `sub_folder`.
    sub_folder : str
        Subject folder; must contain an ``ABA<digits>`` subject ID and hold
        ``{sub}_bucket_REML.nii.gz`` and ``{sub}_bucket_REML_labels.txt``.
    OUT_path : str
        Root output folder; a per-subject sub-folder is created inside it.
    masker : NiftiLabelsMasker
        Masker used to reduce the image to ROI signals.

    Raises
    ------
    ValueError
        If no subject ID or no lag count can be parsed from `sub_folder`, or
        if the labels file contains no 'PLAY' condition.
    """
    # identify subject -- fail with a clear message instead of an unbound `sub`
    match = re.search(r'ABA\d+', sub_folder)
    if match is None:
        raise ValueError(f'no subject ID (ABA<digits>) found in {sub_folder!r}')
    sub = match.group(0)

    # trial length comes from the folder name; check it before the expensive image load
    meta_info = parse_metadata_from_path(sub_folder)
    if meta_info['lags'] is None:
        raise ValueError(f'could not parse the number of TRs (lags) from {sub_folder!r}')
    args.TRIAL_LENGTH = meta_info['lags'] # 13

    # get main data
    reml_file = f'{sub_folder}/{sub}_bucket_REML.nii.gz'
    img = read_reml_file(reml_file)
    data = apply_roi_masks(img, masker)

    # condition labels
    labels_file = f'{sub_folder}/{sub}_bucket_REML_labels.txt'
    labels = read_col_labels(labels_file)
    df_labels = find_unique_labels(labels)
    play_labels = [label for label in df_labels['prefix'] if 'PLAY' in label] # PLAY_highT, PLAY_lowT, PLAY_highR, PLAY_lowR
    if not play_labels:
        raise ValueError(f"no 'PLAY' conditions found in {labels_file!r}")

    # extract trialwise time series for all conditions before writing anything,
    # so a failure in one condition leaves no partial output for this subject
    # NOTE(review): assumes row i of `data` corresponds to labels[i] -- confirm
    trial_ts = {}
    for cond in play_labels:
        windows = find_trial_windows(labels, cond=cond, lags=args.TRIAL_LENGTH)
        trial_ts[cond] = np.stack([data[window, :] for window in windows]) # #trials x # time points x # rois

    # save trialwise timeseries
    sub_out_path = f'{OUT_path}/{sub}'
    os.makedirs(sub_out_path, exist_ok=True)

    for cond, ts in trial_ts.items():
        file_name = f'sub-{sub}_cond-{cond}_desc-trial-ts'
        with open(f'{sub_out_path}/{file_name}.pkl', 'wb') as f:
            pickle.dump(ts, f)

In [10]:
# Root folder for the ROI time-series outputs of the selected atlas.
OUT_path = f'/home/govindas/lab-data/aba/{atlas_name}/roi-timeseries'
# Create it in-process: no shell involved, and a failure raises instead of
# being returned as an exit code that is easy to overlook.
os.makedirs(OUT_path, exist_ok=True)

0

In [11]:
# One entry per item directly under DATA_path, processed in sorted order.
sub_folders = sorted(glob(f'{DATA_path}/*'))

# Keep going when a subject fails, but remember why: silently skipping
# failures makes it impossible to tell how many subjects were actually written.
failed_subs = {}  # sub_folder -> repr of the exception raised for it
for sub_folder in tqdm(sub_folders):
    try:
        get_trialwise_timeseries_per_sub(args, sub_folder, OUT_path, masker)
    except Exception as err:  # KeyboardInterrupt / SystemExit still stop the run
        failed_subs[sub_folder] = repr(err)

print(f'processed {len(sub_folders) - len(failed_subs)}/{len(sub_folders)} folders')
for sub_folder, err in failed_subs.items():
    print(f'FAILED {sub_folder}: {err}')

100%|██████████| 101/101 [5:01:45<00:00, 179.26s/it] 
