In [14]:
# Import all required libraries
import pandas as pd
import numpy as np
import pingouin as pg
from pathlib import Path
from scipy import stats
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
import os.path as op
import re
import nibabel as nib

In [15]:
# Additional imports for neuroimaging and atlas processing
from nilearn import datasets, masking, input_data
from nilearn.connectome import ConnectivityMeasure
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

  from nilearn import datasets, masking, input_data


In [16]:
# Setup directories and plotting theme
RELI_DIR = Path("dset/derivatives/caps/interrater")
FIGURES_DIR = Path("dset/derivatives/figures")
# will use loop later to run over all subjects
OUT_DIR = Path("dset/derivatives/caps")


In [17]:
# Load Craddock 268 atlas
print("Loading Craddock 268 atlas...")
craddock_atlas = datasets.fetch_craddock_2012()

# Use the 268 ROI parcellation
atlas_filename = craddock_atlas['scorr_mean']  # 268 ROIs
atlas_labels = craddock_atlas['labels']

print(f"Atlas loaded: {atlas_filename}")
print(f"Number of ROIs: 268")

# Create masker for extracting time series from ROIs
masker = input_data.NiftiLabelsMasker(
    labels_img=atlas_filename,
    standardize=False,  # We'll do our own z-scoring
    memory='nilearn_cache',
    verbose=1
)

Loading Craddock 268 atlas...


AttributeError: module 'nilearn.datasets' has no attribute 'fetch_craddock_2012'

In [13]:
# Extract BOLD time series and create z-scored participant matrices

# Define all runs for each participant and episode
participant_data = {
    "sub-Blossom": {
        "episode_2": [1, 2, 3, 4, 5, 6, 7]  # Available runs for episode 2
    }
    # Add other participants as needed
}

# Initialize storage for participant-level matrices
all_participant_matrices = {}

for sub_id, episodes in participant_data.items():
    print(f"\n{'='*60}")
    print(f"PROCESSING {sub_id}")
    print(f"{'='*60}")
    
    participant_timeseries = []
    
    for episode_key, run_numbers in episodes.items():
        ep_num = int(episode_key.split('_')[1])
        
        print(f"\nProcessing Episode {ep_num} with {len(run_numbers)} runs...")
        
        for run_num in run_numbers:
            print(f"  Processing run {run_num}...")
            
            TASK_DIR = Path(f"dset/{sub_id}/ses-{ep_num:02d}/func") 
            
            # Construct the filename - note that run number is NOT zero-padded
            task_filename = f"{sub_id}_ses-{ep_num:02d}_task-strangerthings_run-{run_num}_part-mag_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz"
            task_filepath = TASK_DIR / task_filename
            
            if not task_filepath.exists():
                print(f"    WARNING: File not found: {task_filepath}")
                continue
            
            try:
                # Extract time series from 268 ROIs
                print(f"    Extracting time series from {task_filepath.name}...")
                time_series = masker.fit_transform(task_filepath)
                
                print(f"    Time series shape: {time_series.shape} (TRs x ROIs)")
                
                # Z-score normalization within each ROI across time points
                # This normalizes by standard error (std/sqrt(n)) across fMRI volumes
                print("    Applying z-score normalization...")
                
                # Calculate z-scores: (x - mean) / std for each ROI
                roi_means = np.mean(time_series, axis=0)
                roi_stds = np.std(time_series, axis=0, ddof=1)  # Using sample std
                
                # Avoid division by zero for constant signals
                roi_stds[roi_stds == 0] = 1.0
                
                # Z-score normalization
                z_scored_ts = (time_series - roi_means) / roi_stds
                
                print(f"    Z-scored time series shape: {z_scored_ts.shape}")
                print(f"    Z-score stats - Mean: {np.mean(z_scored_ts):.4f}, Std: {np.std(z_scored_ts):.4f}")
                
                # Store z-scored time series for this run
                participant_timeseries.append(z_scored_ts)
                
            except Exception as e:
                print(f"    ERROR processing {task_filepath}: {e}")
                continue
    
    if participant_timeseries:
        # Concatenate all runs for this participant
        print(f"\nConcatenating {len(participant_timeseries)} runs for {sub_id}...")
        participant_matrix = np.vstack(participant_timeseries)
        
        print(f"Final participant matrix shape: {participant_matrix.shape}")
        print(f"  - Total TRs across all runs: {participant_matrix.shape[0]}")
        print(f"  - Number of ROIs (Craddock 268): {participant_matrix.shape[1]}")
        
        # Store the participant-level matrix
        all_participant_matrices[sub_id] = participant_matrix
        
        # Save the participant matrix
        output_dir = OUT_DIR / "timeseries_matrices"
        output_dir.mkdir(parents=True, exist_ok=True)
        
        output_file = output_dir / f"{sub_id}_zscore_timeseries_matrix.npy"
        np.save(output_file, participant_matrix)
        print(f"Saved participant matrix to: {output_file}")
        
        # Also save as CSV for easier inspection
        output_csv = output_dir / f"{sub_id}_zscore_timeseries_matrix.csv"
        df_matrix = pd.DataFrame(participant_matrix, 
                                columns=[f"ROI_{i+1:03d}" for i in range(participant_matrix.shape[1])])
        df_matrix.to_csv(output_csv, index=False)
        print(f"Saved participant matrix (CSV) to: {output_csv}")
        
    else:
        print(f"WARNING: No valid runs found for {sub_id}")

print(f"\n{'='*60}")
print("PROCESSING COMPLETE")
print(f"{'='*60}")
print(f"Processed {len(all_participant_matrices)} participants:")
for sub_id, matrix in all_participant_matrices.items():
    print(f"  {sub_id}: {matrix.shape[0]} TRs Ã— {matrix.shape[1]} ROIs")





PROCESSING sub-Blossom

Processing Episode 2 with 7 runs...
  Processing run 1...
    Extracting time series from sub-Blossom_ses-02_task-strangerthings_run-1_part-mag_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz...
    ERROR processing dset/sub-Blossom/ses-02/func/sub-Blossom_ses-02_task-strangerthings_run-1_part-mag_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz: name 'masker' is not defined
  Processing run 2...
    Extracting time series from sub-Blossom_ses-02_task-strangerthings_run-2_part-mag_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz...
    ERROR processing dset/sub-Blossom/ses-02/func/sub-Blossom_ses-02_task-strangerthings_run-2_part-mag_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz: name 'masker' is not defined
  Processing run 3...
    Extracting time series from sub-Blossom_ses-02_task-strangerthings_run-3_part-mag_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz...
    ERROR processing dset/sub-Blossom/ses-02/func/sub