<a href="https://colab.research.google.com/github/grabuffo/BrainStim_ANN_fMRI_HCP/blob/main/notebooks/Process_TMS_fMRI_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/grabuffo/BrainStim_ANN_fMRI_HCP/blob/main/notebooks/Process_TMS_fMRI_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Process TMS-fMRI data (task-rest) for population ANN

This notebook loads the preprocessed **TMS-fMRI** parcel time series stored in:

- `data/TMS_fMRI/dataset_tian50_schaefer400_allruns.pkl`

and creates, **per subject**, the arrays:

- `*_signals.npy`  (T × 450)
- `*_inputs.npy`   ((T−S) × (S·450))
- `*_targets.npy`  ((T−S) × 450)

where **S** is the number of past steps used as input (multi-to-one).

Important design choices:
- **No concatenation across subjects before building samples** (avoids artificial transitions).
- Concatenation happens **within subject** across all available **task-rest** runs.
- Filtering is applied **per run** before concatenation to avoid boundary artifacts.


In [2]:
# --- Setup ---
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, sys, json, pickle
import numpy as np

# Project + paths
PROJECT_DIR = "/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN"
DATA_DIR    = os.path.join(PROJECT_DIR, "data")

TMS_PKL_PATH = os.path.join(DATA_DIR, "TMS_fMRI", "dataset_tian50_schaefer400_allruns.pkl")
OUT_DIR      = os.path.join(DATA_DIR, "preprocessed_subjects_tms_fmri")

os.makedirs(OUT_DIR, exist_ok=True)

# Make sure preprocessing_tms_fmri.py is importable
# (place it in PROJECT_DIR or in the same folder as this notebook)
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

# 2 - Clone repo (only if missing)
if not os.path.exists("/content/BrainStim_ANN_fMRI_HCP"):
    !git clone https://github.com/grabuffo/BrainStim_ANN_fMRI_HCP.git
else:
    print("Repo already exists ✅")

# 3 - Set paths
repo_dir = "/content/BrainStim_ANN_fMRI_HCP"

import sys
sys.path.append(repo_dir)

# 4 - Imports
from src.preprocessing_tms_fmri import *
from src.NPI import *

#from preprocessing_tms_fmri import concat_runs, make_inputs_targets


print("TMS pickle:", TMS_PKL_PATH)
print("Output dir:", OUT_DIR)


Mounted at /content/drive
Cloning into 'BrainStim_ANN_fMRI_HCP'...
remote: Enumerating objects: 450, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 450 (delta 45), reused 10 (delta 10), pack-reused 350 (from 2)[K
Receiving objects: 100% (450/450), 62.69 MiB | 14.19 MiB/s, done.
Resolving deltas: 100% (153/153), done.
TMS pickle: /content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/TMS_fMRI/dataset_tian50_schaefer400_allruns.pkl
Output dir: /content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/preprocessed_subjects_tms_fmri


In [3]:
# --- Parameters (match your HCP-style pipeline) ---
# Window length (multi-to-one): input uses S past steps to predict next step
USING_STEPS = 3          # S

# Remove first N time points per run (to remove transient)
REMOVE_POINTS = 30

# Filtering band (same as HCP helper)
LOW_HZ  = 0.008
HIGH_HZ = 0.08
ORDER   = 2

# Standardization after filtering
ZSCORE = True

# TR handling:
# - If TR_OVERRIDE is None: use per-run metadata['tr_s'] (recommended)
# - If you want to force a TR (e.g., 2.4), set TR_OVERRIDE = 2.4
TR_OVERRIDE = None

DTYPE = np.float32

print("USING_STEPS:", USING_STEPS)
print("REMOVE_POINTS:", REMOVE_POINTS)
print("Band (Hz):", (LOW_HZ, HIGH_HZ), "order", ORDER)
print("TR_OVERRIDE:", TR_OVERRIDE)


USING_STEPS: 3
REMOVE_POINTS: 30
Band (Hz): (0.008, 0.08) order 2
TR_OVERRIDE: None


In [4]:
# --- Load dataset dictionary ---
with open(TMS_PKL_PATH, "rb") as f:
    dataset = pickle.load(f)

print("Loaded subjects:", len(dataset))
# Quick peek
some_sub = next(iter(dataset.keys()))
print("Example subject:", some_sub)
print("Keys:", list(dataset[some_sub].keys()))


  dataset = pickle.load(f)


Loaded subjects: 46
Example subject: sub-NTHC1001
Keys: ['task-rest', 'task-stim']


In [5]:
# --- Build per-subject signals/inputs/targets for TASK-REST only ---
saved_subjects = []
summary = {}

def get_rest_runs(sub_dict):
    # sub_dict['task-rest'] is a dict keyed by int run_idx
    if "task-rest" not in sub_dict:
        return []
    runs_dict = sub_dict["task-rest"]
    # sort by run_idx for determinism
    return [runs_dict[k] for k in sorted(runs_dict.keys())]

for sub_id, sub_dict in dataset.items():
    rest_runs = get_rest_runs(sub_dict)
    if not rest_runs:
        continue

    # Collect time series per run
    run_ts = []
    run_trs = []
    run_sessions = []

    for run_idx, run in sorted(sub_dict["task-rest"].items()):
        ts = run["time series"]
        md = run.get("metadata", {})
        tr = md.get("tr_s", None)
        ses = md.get("session", None)

        if ts is None or len(ts) == 0:
            continue

        if TR_OVERRIDE is not None:
            tr = float(TR_OVERRIDE)
        elif tr is None:
            raise RuntimeError(f"Missing tr_s in metadata for {sub_id} run {run_idx}")

        run_ts.append(ts)
        run_trs.append(float(tr))
        run_sessions.append(ses)

    if not run_ts:
        continue

    # Ensure TR consistency across a subject's rest runs
    # (If you ever have mixed TRs within 'task-rest', this will flag it.)
    if len(set(np.round(run_trs, 6))) != 1:
        raise RuntimeError(f"Inconsistent TRs in {sub_id} task-rest runs: {run_trs}")
    tr_subject = run_trs[0]

    # Preprocess + concatenate runs (filters each run separately by default)
    signals = concat_runs(
        run_ts,
        tr=tr_subject,
        n_drop=REMOVE_POINTS,
        low=LOW_HZ,
        high=HIGH_HZ,
        order=ORDER,
        zscore=ZSCORE,
        filter_each_run=True,
    ).astype(DTYPE, copy=False)

    if signals.shape[0] <= USING_STEPS:
        # too short after trimming
        continue

    inputs, targets = make_inputs_targets(signals, steps=USING_STEPS)
    inputs = inputs.astype(DTYPE, copy=False)
    targets = targets.astype(DTYPE, copy=False)

    # Save (mirror HCP naming but with sub-... prefix)
    sub_tag = sub_id  # keep "sub-NTHC...."
    np.save(os.path.join(OUT_DIR, f"{sub_tag}_signals.npy"), signals)
    np.save(os.path.join(OUT_DIR, f"{sub_tag}_inputs.npy"), inputs)
    np.save(os.path.join(OUT_DIR, f"{sub_tag}_targets.npy"), targets)

    saved_subjects.append(sub_tag)
    summary[sub_tag] = {
        "n_runs": len(run_ts),
        "sessions": run_sessions,
        "tr_s": tr_subject,
        "signals_shape": list(signals.shape),
        "inputs_shape": list(inputs.shape),
        "targets_shape": list(targets.shape),
    }

print("Saved subjects:", len(saved_subjects))
print("Example summary:", saved_subjects[0] if saved_subjects else None)


Saved subjects: 46
Example summary: sub-NTHC1001


In [6]:
# --- Save bookkeeping files (recommended) ---
subjects_path = os.path.join(OUT_DIR, "subjects_list_tms_fmri.txt")
with open(subjects_path, "w") as f:
    for s in saved_subjects:
        f.write(s + "\n")

summary_path = os.path.join(OUT_DIR, "summary_tms_fmri_task_rest.json")
with open(summary_path, "w") as f:
    json.dump(summary, f, indent=2)

config = {
    "dataset_pickle": TMS_PKL_PATH,
    "task": "task-rest",
    "atlas_order": "Tian50_then_Schaefer400",
    "using_steps": USING_STEPS,
    "remove_points_per_run": REMOVE_POINTS,
    "band_hz": [LOW_HZ, HIGH_HZ],
    "butter_order": ORDER,
    "zscore": ZSCORE,
    "tr_override": TR_OVERRIDE,
}
config_path = os.path.join(OUT_DIR, "preprocess_config_tms_fmri.json")
with open(config_path, "w") as f:
    json.dump(config, f, indent=2)

print("Wrote:", subjects_path)
print("Wrote:", summary_path)
print("Wrote:", config_path)


Wrote: /content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/preprocessed_subjects_tms_fmri/subjects_list_tms_fmri.txt
Wrote: /content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/preprocessed_subjects_tms_fmri/summary_tms_fmri_task_rest.json
Wrote: /content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/preprocessed_subjects_tms_fmri/preprocess_config_tms_fmri.json


### Understanding `signals`, `inputs`, and `targets`

Let:

- **T** = number of time points after trimming/filtering/concatenation (within subject)
- **N** = number of parcels (here **450**)
- **S** = `USING_STEPS`

Then:

- `signals` has shape **(T, N)**
- `inputs` has shape **(T−S, S·N)** and contains flattened windows:  
  `inputs[t] = signals[t : t+S].reshape(-1)`
- `targets` has shape **(T−S, N)** and contains next-step targets:  
  `targets[t] = signals[t+S]`


# Test Correctdness

In [7]:
import os, glob

OUT_DIR = "/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/preprocessed_subjects_tms_fmri"

print("OUT_DIR exists:", os.path.exists(OUT_DIR))
print("npy files:", len(glob.glob(os.path.join(OUT_DIR, "*.npy"))))
print("json files:", len(glob.glob(os.path.join(OUT_DIR, "*.json"))))
print("txt files:", len(glob.glob(os.path.join(OUT_DIR, "*.txt"))))

print("\nExample files:")
for f in sorted(glob.glob(os.path.join(OUT_DIR, "*")))[:10]:
    print(" ", os.path.basename(f))


OUT_DIR exists: True
npy files: 138
json files: 2
txt files: 1

Example files:
  ECts_MLP
  preprocess_config_tms_fmri.json
  sub-NTHC1001_inputs.npy
  sub-NTHC1001_signals.npy
  sub-NTHC1001_targets.npy
  sub-NTHC1003_inputs.npy
  sub-NTHC1003_signals.npy
  sub-NTHC1003_targets.npy
  sub-NTHC1009_inputs.npy
  sub-NTHC1009_signals.npy


In [8]:
import numpy as np, glob, os

sub = os.path.basename(sorted(glob.glob(os.path.join(OUT_DIR, "sub-*_signals.npy")))[0]).split("_signals.npy")[0]
signals = np.load(os.path.join(OUT_DIR, f"{sub}_signals.npy"))
inputs  = np.load(os.path.join(OUT_DIR, f"{sub}_inputs.npy"))
targets = np.load(os.path.join(OUT_DIR, f"{sub}_targets.npy"))

print("Subject:", sub)
print("signals:", signals.shape, signals.dtype)
print("inputs :", inputs.shape, inputs.dtype)
print("targets:", targets.shape, targets.dtype)

# Basic integrity
print("NaNs in signals:", np.isnan(signals).any())
print("NaNs in inputs :", np.isnan(inputs).any())
print("NaNs in targets:", np.isnan(targets).any())

# Dimension consistency
T, N = signals.shape
print("N regions (expected 450):", N)
print("targets rows == inputs rows:", targets.shape[0] == inputs.shape[0])


Subject: sub-NTHC1001
signals: (210, 450) float32
inputs : (207, 1350) float32
targets: (207, 450) float32
NaNs in signals: False
NaNs in inputs : False
NaNs in targets: False
N regions (expected 450): 450
targets rows == inputs rows: True


In [10]:
import numpy as np

# Pick one parcel, compute FFT power ratio in/out of band
tr = 2.4
fs = 1.0 / tr
x = signals[:, 100]  # any parcel

freqs = np.fft.rfftfreq(len(x), d=tr)
p = np.abs(np.fft.rfft(x))**2

band = (freqs >= 0.008) & (freqs <= 0.08)
ratio = p[band].sum() / (p.sum() + 1e-12)

print("Bandpower ratio (0.008–0.08 Hz):", float(ratio))


Bandpower ratio (0.008–0.08 Hz): 0.9618606567382812


In [11]:
import os, glob

subs = sorted({os.path.basename(f).split("_signals.npy")[0]
               for f in glob.glob(os.path.join(OUT_DIR, "sub-*_signals.npy"))})

missing = []
for s in subs:
    for suffix in ["signals", "inputs", "targets"]:
        p = os.path.join(OUT_DIR, f"{s}_{suffix}.npy")
        if not os.path.exists(p):
            missing.append(p)

print("n subjects:", len(subs))
print("missing files:", len(missing))
if missing[:10]:
    print("First missing examples:", missing[:10])


n subjects: 46
missing files: 0


In [12]:
import json, os

summary = os.path.join(OUT_DIR, "summary_tms_fmri_task_rest.json")
cfg     = os.path.join(OUT_DIR, "preprocess_config_tms_fmri.json")

print("summary exists:", os.path.exists(summary))
print("config exists :", os.path.exists(cfg))

if os.path.exists(cfg):
    with open(cfg, "r") as f:
        print("\nConfig:", json.load(f))


summary exists: True
config exists : True

Config: {'dataset_pickle': '/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/TMS_fMRI/dataset_tian50_schaefer400_allruns.pkl', 'task': 'task-rest', 'atlas_order': 'Tian50_then_Schaefer400', 'using_steps': 3, 'remove_points_per_run': 30, 'band_hz': [0.008, 0.08], 'butter_order': 2, 'zscore': True, 'tr_override': None}


In [14]:
import os, json

OUT_DIR = "/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/preprocessed_subjects_tms_fmri"
cfg_path = os.path.join(OUT_DIR, "preprocess_config_tms_fmri.json")

with open(cfg_path, "r") as f:
    cfg = json.load(f)

cfg


{'dataset_pickle': '/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/TMS_fMRI/dataset_tian50_schaefer400_allruns.pkl',
 'task': 'task-rest',
 'atlas_order': 'Tian50_then_Schaefer400',
 'using_steps': 3,
 'remove_points_per_run': 30,
 'band_hz': [0.008, 0.08],
 'butter_order': 2,
 'zscore': True,
 'tr_override': None}

In [16]:
import os, pickle, numpy as np
from src.preprocessing_tms_fmri import concat_runs

PICKLE_PATH = "/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/TMS_fMRI/dataset_tian50_schaefer400_allruns.pkl"
OUT_DIR     = "/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/preprocessed_subjects_tms_fmri"
sub_id      = "sub-NTHC1001"

saved = np.load(os.path.join(OUT_DIR, f"{sub_id}_signals.npy"))

with open(PICKLE_PATH, "rb") as f:
    dataset = pickle.load(f)

rest_runs = [dataset[sub_id]["task-rest"][k]["time series"] for k in sorted(dataset[sub_id]["task-rest"].keys())]

def score(arr):
    diff = saved - arr
    return float(np.max(np.abs(diff))), float(np.mean(np.abs(diff)))

candidates = []
for filter_each_run in [True, False]:
    for zscore in [True, False]:
        for n_drop in [30, 0]:
            rec = concat_runs(
                rest_runs,
                tr=2., n_drop=n_drop,
                low=0.008, high=0.08, order=2,
                zscore=zscore,
                filter_each_run=filter_each_run
            )
            if rec.shape != saved.shape:
                continue
            mx, me = score(rec)
            candidates.append((mx, me, filter_each_run, zscore, n_drop))

candidates = sorted(candidates, key=lambda x: (x[0], x[1]))
print("Best matches (max_abs, mean_abs, filter_each_run, zscore, n_drop):")
for row in candidates[:5]:
    print(row)


  dataset = pickle.load(f)


Best matches (max_abs, mean_abs, filter_each_run, zscore, n_drop):
(0.0, 0.0, True, True, 30)
(0.0, 0.0, False, True, 30)
(2.589588165283203, 0.28962451219558716, True, False, 30)
(2.589588165283203, 0.28962451219558716, False, False, 30)


In [17]:
import os, pickle, numpy as np
from src.preprocessing_tms_fmri import concat_runs

PICKLE_PATH = "/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/TMS_fMRI/dataset_tian50_schaefer400_allruns.pkl"
OUT_DIR     = "/content/drive/MyDrive/Colab Notebooks/Brain_Stim_ANN/data/preprocessed_subjects_tms_fmri"
sub_id      = "sub-NTHC1001"

saved = np.load(os.path.join(OUT_DIR, f"{sub_id}_signals.npy"))

with open(PICKLE_PATH, "rb") as f:
    dataset = pickle.load(f)

rest_runs = [dataset[sub_id]["task-rest"][k]["time series"] for k in sorted(dataset[sub_id]["task-rest"].keys())]

# IMPORTANT: rest TR = 2.0s
recomputed = concat_runs(
    rest_runs,
    tr=2.0,
    n_drop=30,
    low=0.008, high=0.08, order=2,
    zscore=True,
    filter_each_run=True
)

print("Saved shape      :", saved.shape)
print("Recomputed shape :", recomputed.shape)

diff = saved - recomputed
print("Max abs diff :", float(np.max(np.abs(diff))))
print("Mean abs diff:", float(np.mean(np.abs(diff))))
print("Allclose?    :", np.allclose(saved, recomputed, atol=1e-5, rtol=1e-5))


  dataset = pickle.load(f)


Saved shape      : (210, 450)
Recomputed shape : (210, 450)
Max abs diff : 0.0
Mean abs diff: 0.0
Allclose?    : True
