###**cluster_1way**

Run hierarchical models on iEEG timeseries data with one within-channel fixed effect (2 levels), with the random effects of subject and channel nested in subject. Perform cluster-based corrections for multiple comparisons across timepoints, preserving the random effects structure. The outputs are two-tailed t-statistics with uncorrected and cluster-corrected p-values.

Copyright (c) 2025  
EL Johnson, PhD

###Import modules:

In [None]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import mixedlm
import warnings
import pickle
from pathlib import Path
import gdown

# custom modules
from cluster_utils import load_ieeg_data, create_df, cluster_test

###Download sample data file:

Contains the following variables:  
- sid = subject ID
- ch = channel label
- hit_miss = hit (1) or miss (0) within-channel variable
- data = timeseries data with one row per channel per level for the within-channel variable (186 rows x 139 timepoints)

In [None]:
# fetch the sample .mat file from Google Drive into the working directory
fid = '1i170RbUZuRQzY8gzEC99XBdo4uyCbsl4'
output_path = 'lme_data_1way.mat'
url = f'https://drive.google.com/uc?id={fid}'
data_path = Path(output_path)
gdown.download(url, output_path, quiet = False)

# read the file, then arrange its contents as a DataFrame
data_dict = load_ieeg_data(data_path)
df = create_df(data_dict)

# report the DataFrame size and preview the first rows
n_rows, n_cols = df.shape
print(f'\nDataFrame: {n_rows} rows × {n_cols} columns')
print('\nFirst few rows:')
print(df.head())

###Define function to run the models:

In [None]:
def run_lme_1way(df, verbose = True):
  """
  Run LME models across all timepoints.

  Fits one linear mixed-effects model per timepoint column, with hit_miss as
  the fixed effect, a random intercept per subject ('sid'), and a variance
  component for channel nested in subject ('sid:channel').

  Parameters
  ----------
  df : pd.DataFrame
    DataFrame from create_df(). Must contain the columns 'sid', 'channel'
    and 'hit_miss'; every column whose name starts with 't' is treated as
    one timepoint of data.
  verbose : bool, optional
    If True, print progress for each timepoint. Default is True

  Returns
  -------
  dict
    Dictionary with t-statistic and p-value of the hit_miss effect:
    - 't': array (one value per timepoint column)
    - 'p': array (one value per timepoint column)
  """
  # get timepoint column names; the number of models is derived from them
  # so it can never disagree with the columns that are actually indexed
  time_cols = [col for col in df.columns if col.startswith('t')]
  n_time = len(time_cols)

  # initialize dictionary for storing model outputs
  lme = {
    't': np.full(n_time, np.nan),
    'p': np.full(n_time, np.nan)
  }

  # silence warnings only while fitting, then restore the caller's filters
  with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # loop through timepoints
    for t, col in enumerate(time_cols):

      if verbose:
        print(f'Running model on datapoint {t+1}/{n_time}...')

      # create table for model
      tmp_dat = df[['sid', 'channel', 'hit_miss', col]].copy()
      tmp_dat.rename(columns = {col: 'data'}, inplace = True)

      # create sid:channel variable for nested random effect
      tmp_dat['sid_ch'] = tmp_dat['sid'].astype(str) + ':' + tmp_dat['channel'].astype(str)

      # run model: data ~ hit_miss + (1|sid) + (1|sid:channel)
      model = mixedlm(
          formula = 'data ~ hit_miss',
          data = tmp_dat,
          groups = tmp_dat['sid'],
          re_formula = '1',
          vc_formula = {'sid_ch': '0 + C(sid_ch)'}
          )
      result = model.fit(method = 'lbfgs', reml = True)

      # extract model outputs by position: index 0 is the intercept and
      # index 1 the hit_miss term. Converting to an array first avoids
      # integer-key lookup on a label-indexed Series.
      lme['t'][t] = np.asarray(result.tvalues)[1]
      lme['p'][t] = np.asarray(result.pvalues)[1]

  return lme

###Run model per timepoint on observed data:

In [None]:
# fit one model per timepoint on the observed (unshuffled) data;
# progress is printed for every timepoint because verbose defaults to True
lme = run_lme_1way(df)

###Define function to create null distributions for cluster testing:

In [None]:
def create_null_dist_1way(df, lme, nperm = 1000):
  """
  Create null distributions for cluster testing via permutation.

  For each permutation, the hit/miss labels are randomly reassigned within
  every channel (one row of the pair becomes hit, the other miss) and the
  models are re-run on the relabelled data. Random draws come from NumPy's
  global random state (np.random), so call np.random.seed() beforehand for
  reproducible permutations.

  Lower nperm for prototyping (multiple of 10). HPC recommended for implementation.

  Parameters
  ----------
  df : pd.DataFrame
    DataFrame from create_df() containing EEG data with columns:
    'sid', 'channel', 'hit_miss', and timepoint columns starting with 't'.
    Every sid:channel combination must have exactly two rows.
  lme : dict
    Dictionary from run_lme_1way() with 't' and 'p' arrays
  nperm : int, optional
    Number of permutations. Default is 1000

  Returns
  -------
  tuple of (array, array)
    Returns two arrays:
    - hit_miss: observed hit_miss t-statistics
    - hit_miss_null: null distribution for hit_miss (n_timepoints, nperm)

  Raises
  ------
  ValueError
    If any sid:channel combination does not have exactly two rows.
  """

  # get model stats from observed data
  hit_miss = lme['t']

  # create channel-nested-in-subject IDs
  sid_ch = df['sid'].astype(str) + ':' + df['channel'].astype(str)
  codes, uid = pd.factorize(sid_ch)  # unique IDs in order of first appearance

  # the label swap below only makes sense for pairs of rows, so check once
  # up front instead of failing (or mislabelling) part-way through
  counts = np.bincount(codes, minlength = len(uid))
  if np.any(counts != 2):
    bad_ids = list(uid[counts != 2][:5])
    raise ValueError(
      'Each sid:channel must have exactly 2 rows (one per hit_miss level); '
      f'offending IDs include: {bad_ids}'
    )

  # row positions of the 1st and 2nd row of every pair; these do not change
  # between permutations, so compute them once outside the loop
  pair_idx = np.argsort(codes, kind = 'stable').reshape(-1, 2)
  first_idx = pair_idx[:, 0]
  second_idx = pair_idx[:, 1]

  # initialize null distributions
  hit_miss_null = np.zeros((len(hit_miss), nperm))

  # loop through permutations
  for p in range(nperm):
    print(f'Shuffling the data for permutation {p+1}/{nperm}...')

    # one coin flip per channel: heads marks the 1st row of the pair as
    # condition 1 (hit), tails marks the 2nd row; the other row stays 0 (miss)
    flips = np.random.rand(len(uid)) > 0.5
    hit_miss_shuff = np.zeros(len(df), dtype = int)
    hit_miss_shuff[np.where(flips, first_idx, second_idx)] = 1

    # create shuffled df
    df_shuff = df.copy()
    df_shuff['hit_miss'] = hit_miss_shuff.astype(bool)  # convert to boolean

    # run model per timepoint with shuffled labels
    lme_null = run_lme_1way(df_shuff, verbose = False)

    # extract model outputs
    hit_miss_null[:, p] = lme_null['t']

  return hit_miss, hit_miss_null

###Create null distributions for cluster testing:

In [None]:
# build the permutation null distribution of t-statistics;
# nperm = 10 keeps the demo fast (the function's default is 1000)
hit_miss, hit_miss_null = create_null_dist_1way(df, lme, nperm = 10)  # lowered nperm for demo

###Run cluster test:

In [None]:
# cluster_test comes from cluster_utils; only its 2nd return value is kept here
# NOTE(review): tail = 0 presumably selects a two-tailed test — confirm in cluster_utils
_, p, _ = cluster_test(hit_miss, hit_miss_null, tail = 0)
lme['p_clust'] = p.flatten()  # store as a 1-D array next to 't' and 'p'

###Save:

In [None]:
# serialize the results dict ('t', 'p', 'p_clust') to a pickle file
# in the current working directory
with open('lme_clust_1way.pkl', 'wb') as f:
  pickle.dump(lme, f)

To load from Colab Files panel:  
```
with open('lme_clust_1way.pkl', 'rb') as f:
  lme = pickle.load(f)
```

To download from Colab:  
```
from google.colab import files  
files.download('lme_clust_1way.pkl')
```