# Sep 5, 2025: group-averaged graph
unit-grp

Create animal-level FC matrices, Fisher-transform them, average the z-values across animals, apply the inverse Fisher transform to the average, and then threshold the result.

conda env: gt

In [1]:
import os
import glob
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
from itertools import product, combinations
from sklearn.covariance import GraphicalLasso
from scipy.stats import entropy, zscore
from sklearn.metrics import mutual_info_score
from joblib import Parallel, delayed
import graph_tool.all as gt 
import seaborn as sns

In [2]:
class ARGS:
    """Empty attribute container; settings are attached to an instance as plain attributes."""


# Single shared settings object for the notebook.
args = ARGS()
args.SEED = 100

# Seed NumPy's global random state from the stored seed.
np.random.seed(args.SEED)

In [3]:
# Parcellation settings; trailing comments list values used at other times.
args.source = 'allen'  # 'allen'
args.space = 'ccfv2'  # 'ccfv2'
args.brain_div = 'whl'  # 'whl'
args.num_rois = 172  # 216 #334 #162 #172
args.resolution = 200  # 200

# Underscore-joined key-value descriptor of the parcellation, used in result paths.
PARC_DESC = '_'.join([
    f'source-{args.source}',
    f'space-{args.space}',
    f'braindiv-{args.brain_div}',
    f'nrois-{args.num_rois}',
    f'res-{args.resolution}',
])
PARC_DESC

'source-allen_space-ccfv2_braindiv-whl_nrois-172_res-200'

In [4]:
# Directory layout, rooted at $HOME/new_mouse_dataset.
BASE_path = f'{os.environ["HOME"]}/new_mouse_dataset'
PARCELS_path = f'{BASE_path}/parcels'
ROI_path = (
    f'{BASE_path}/roi-results-v3'
    f'/{PARC_DESC}'
)
TS_path = f'{ROI_path}/roi-timeseries'

# Create the directories without going through a shell: os.makedirs copes
# with spaces/special characters in the path and raises on real failures
# instead of returning an ignored exit status. Creating TS_path also
# creates ROI_path, but both calls are kept to make the intent explicit.
os.makedirs(ROI_path, exist_ok=True)
os.makedirs(TS_path, exist_ok=True)

0

In [5]:
# Clipping margin read by fisher_z: correlations are clipped to
# [-1 + eps, 1 - eps] before arctanh is applied.
args.eps = 1e-7

In [6]:
def collect_timeseries(files):
    """Load time-series text files into a DataFrame, one row per file.

    Each file name must look like
    ``sub-SLC01_ses-1_run-11_task-rest_desc-ts.txt``; the ``sub``, ``ses``,
    ``run`` and ``task`` fields are parsed from it and stored as strings.

    Parameters
    ----------
    files : iterable of str
        Paths of the files to load with ``np.loadtxt``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sub``, ``ses``, ``run``, ``task`` and ``ts``; ``ts`` holds
        the loaded array z-scored along axis 0 (NaNs omitted from the
        statistics).

    Raises
    ------
    ValueError
        If a file name does not follow the expected naming pattern.
    """
    pattern = re.compile(
        r"sub-(?P<sub>\w+)_ses-(?P<ses>\d+)_run-(?P<run>\d+)_task-(?P<task>\w+)_desc-ts\.txt"
    )
    # e.g. sub-SLC01_ses-1_run-11_task-rest_desc-ts.txt

    records = []
    for file in tqdm(files):
        file_name = os.path.basename(file)
        match = pattern.match(file_name)
        if match is None:
            # Fail with the offending name rather than an AttributeError on None.
            raise ValueError(
                f'unexpected time-series file name: {file_name!r} '
                f'(expected sub-*_ses-*_run-*_task-*_desc-ts.txt)'
            )
        record = match.groupdict()
        ts = np.loadtxt(file)
        record['ts'] = zscore(ts, axis=0, nan_policy='omit')
        records.append(record)

    return pd.DataFrame(records).reset_index(drop=True)

In [7]:
# Load every entry found directly under TS_path, in sorted path order.
data_df = collect_timeseries(files=sorted(glob.glob(f'{TS_path}/*', recursive=True)))

  0%|          | 0/86 [00:00<?, ?it/s]

100%|██████████| 86/86 [00:00<00:00, 120.17it/s]


In [8]:
# Display the loaded table (one row per time-series file).
data_df

Unnamed: 0,sub,ses,run,task,ts
0,SLC01,1,11,rest,"[[-0.09603879094503522, -0.948000069498075, 0...."
1,SLC01,1,15,rest,"[[0.22999063513993873, -1.755006218258903, -1...."
2,SLC01,1,19,rest,"[[0.17501412665192317, 2.05601323642688, 1.316..."
3,SLC01,2,10,rest,"[[2.1940397338766564, -0.10901452704916718, 1...."
4,SLC01,2,6,rest,"[[0.6019987463186959, -0.13199193256572464, -1..."
...,...,...,...,...,...
81,SLC10,2,9,rest,"[[-1.4239750882089304, 1.0019995779515494, 1.1..."
82,SLC10,3,13,rest,"[[0.12197742984919095, 0.757978449952126, 0.35..."
83,SLC10,3,17,rest,"[[-0.6010203919150929, -0.4650008433734304, -0..."
84,SLC10,3,5,rest,"[[-1.8219869276216611, 0.3349962480630326, 0.6..."


In [9]:
def get_cols(args):
    """Return the identifying column names for the current data unit.

    Parameters
    ----------
    args : object
        Must expose ``DATA_UNIT``, one of ``'ses'``, ``'sub'`` or ``'grp'``.

    Returns
    -------
    list of str
        ``['sub', 'ses', 'task']`` for sessions, ``['sub', 'task']`` for
        subjects, ``['task']`` for the group.

    Raises
    ------
    ValueError
        If ``args.DATA_UNIT`` is not one of the supported units.
    """
    cols_by_unit = {
        'ses': ['sub', 'ses', 'task'],
        'sub': ['sub', 'task'],
        'grp': ['task'],
    }
    unit = args.DATA_UNIT
    if unit not in cols_by_unit:
        # Previously this fell through and raised UnboundLocalError on `cols`.
        raise ValueError(
            f'unknown DATA_UNIT {unit!r}; expected one of {sorted(cols_by_unit)}'
        )
    return list(cols_by_unit[unit])

In [10]:
# normalized mutual information
def optimal_bin_size(ts, method="fd"):
    """Compute a histogram bin count for a time series using a named rule.

    Parameters
    ----------
    ts : array-like
        Samples; ``len(ts)`` is used as the number of time points N, while
        percentiles, standard deviation and range are taken over all values.
    method : {"fd", "sturges", "rice", "scott"}
        Binning rule (Freedman-Diaconis by default).

    Returns
    -------
    int
        Number of bins (at least 1).

    Raises
    ------
    ValueError
        If ``ts`` is empty or ``method`` is not a supported rule.

    Notes
    -----
    For "fd" and "scott", a zero or non-finite bin width (e.g. constant data,
    zero IQR, NaNs) has no well-defined bin count; the Sturges count is
    returned instead of failing on a division by zero.
    """
    N = len(ts)  # Number of time points
    if N == 0:
        raise ValueError('cannot choose a bin count for an empty time series')

    sturges_bins = int(np.ceil(np.log2(N) + 1))

    if method == "sturges":
        return sturges_bins

    if method == "rice":
        return int(np.ceil(2 * N ** (1/3)))

    if method == "fd":  # Freedman-Diaconis Rule
        iqr = np.percentile(ts, 75) - np.percentile(ts, 25)
        bin_width = (2 * iqr) / (N ** (1/3))
    elif method == "scott":  # Scott's Rule
        std_dev = np.std(ts)
        bin_width = (3.5 * std_dev) / (N ** (1/3))
    else:
        # Previously an unknown rule silently returned None.
        raise ValueError(
            f'unknown binning method {method!r}; '
            f'expected "fd", "sturges", "rice" or "scott"'
        )

    data_range = np.max(ts) - np.min(ts)
    if not (np.isfinite(bin_width) and bin_width > 0 and np.isfinite(data_range)):
        # Degenerate width: fall back to a rule that only depends on N.
        return sturges_bins

    return max(1, int(np.ceil(data_range / bin_width)))
    
def compute_joint_density(ts1, ts2, bins=100):
    """Estimate the joint and marginal probability tables of two series.

    The joint table comes from a 2-D histogram of (ts1, ts2); the marginals
    are 1-D histograms built on the same bin edges. Each table is rescaled so
    its entries sum to 1.

    Returns
    -------
    tuple
        ``(p_xy, p_x, p_y)``: joint table, marginal of ts1, marginal of ts2.
    """
    joint, edges_x, edges_y = np.histogram2d(ts1, ts2, bins=bins, density=True)
    marginal_x, _ = np.histogram(ts1, bins=edges_x, density=True)
    marginal_y, _ = np.histogram(ts2, bins=edges_y, density=True)

    def _unit_sum(hist):
        # Rescale a histogram so that its entries add up to one.
        return hist / np.sum(hist)

    return _unit_sum(joint), _unit_sum(marginal_x), _unit_sum(marginal_y)

def compute_nmi(ts1, ts2, bins=100):
    """Normalized mutual information between two series, from histograms.

    Entropies are computed in bits from the tables returned by
    ``compute_joint_density``. The mutual information ``Hx + Hy - Hxy`` is
    divided by ``sqrt(Hx * Hy)``; 0 is returned when either marginal entropy
    is not positive.
    """
    joint, marginal_x, marginal_y = compute_joint_density(ts1, ts2, bins)

    # Entropies in bits; the joint table is flattened so it is treated as one
    # distribution over all (x, y) cells.
    h_joint = entropy(joint.flatten(), base=2)
    h_x = entropy(marginal_x, base=2)
    h_y = entropy(marginal_y, base=2)

    mutual_info = h_x + h_y - h_joint

    if h_x > 0 and h_y > 0:
        return mutual_info / np.sqrt(h_x * h_y)
    return 0

def compute_nmi_matrix(ts, bins=100, n_jobs=10):
    """Pairwise normalized mutual information between the columns of ``ts``.

    Every unordered column pair is evaluated with ``compute_nmi`` through
    joblib; the values are mirrored into a symmetric matrix whose diagonal is
    left at zero.
    """
    n_cols = ts.shape[1]
    pairs = list(combinations(range(n_cols), 2))

    values = Parallel(n_jobs=n_jobs)(
        delayed(compute_nmi)(ts[:, a], ts[:, b], bins)
        for a, b in pairs
    )

    matrix = np.zeros((n_cols, n_cols))
    for (a, b), value in zip(pairs, values):
        matrix[a, b] = value
        matrix[b, a] = value

    return matrix

In [11]:
def compute_fc(args, ts):
    """Compute a functional-connectivity matrix from a time series.

    Parameters
    ----------
    args : object
        Must expose ``GRAPH_METHOD``: ``'pearson'``, ``'partial'`` or
        ``'mutualinfo'``.
    ts : numpy.ndarray
        Time series, shape time x rois.

    Returns
    -------
    numpy.ndarray
        The connectivity matrix with NaN/inf replaced by ``np.nan_to_num``.

    Raises
    ------
    ValueError
        If ``args.GRAPH_METHOD`` is not a supported method.
    """
    method = args.GRAPH_METHOD

    if method == 'pearson':
        # np.corrcoef expects variables in rows, hence the transpose.
        fc = np.corrcoef(ts.T)
    elif method == 'partial':
        model = GraphicalLasso(alpha=0.01)
        model.fit(ts)
        # Negated inverse covariance (precision) matrix.
        # NOTE(review): this is not scaled by the precision diagonal, so the
        # values are not partial correlations in [-1, 1] - confirm this is
        # intended before passing the result to a Fisher transform.
        fc = -model.precision_
    elif method == 'mutualinfo':
        bins = optimal_bin_size(ts)
        fc = compute_nmi_matrix(ts, bins=bins, n_jobs=10)
    else:
        # Previously an unknown method left `fc` unbound (UnboundLocalError).
        raise ValueError(
            f'unknown GRAPH_METHOD {method!r}; '
            f"expected 'pearson', 'partial' or 'mutualinfo'"
        )

    return np.nan_to_num(fc)

def threshold_fc(args, fc_matrix):
    """Keep the strongest edges of a connectivity matrix by percentile.

    Parameters
    ----------
    args : object
        Must expose ``EDGE_DENSITY`` (percentage of edges to keep),
        ``THRESHOLD`` (``'signed'`` ranks edges by value, ``'unsigned'`` ranks
        them by absolute value) and ``EDGE_DEF`` (``'binary'`` or
        ``'weighted'``).
    fc_matrix : numpy.ndarray
        Square connectivity matrix.

    Returns
    -------
    numpy.ndarray
        For ``'binary'``, the boolean keep-mask; for ``'weighted'``, the
        original (signed) values with discarded entries set to 0.

    Raises
    ------
    ValueError
        If ``THRESHOLD`` or ``EDGE_DEF`` is not a supported option.
    """
    if args.THRESHOLD not in ('signed', 'unsigned'):
        raise ValueError(
            f"unknown THRESHOLD {args.THRESHOLD!r}; expected 'signed' or 'unsigned'"
        )
    if args.EDGE_DEF not in ('binary', 'weighted'):
        raise ValueError(
            f"unknown EDGE_DEF {args.EDGE_DEF!r}; expected 'binary' or 'weighted'"
        )

    keep_ratio = args.EDGE_DENSITY / 100

    # Quantity the edges are ranked by. The SAME quantity must be used for
    # the percentile and for the mask: comparing signed values against a
    # cutoff derived from absolute values would drop strong negative edges.
    if args.THRESHOLD == 'unsigned':
        strength = np.abs(fc_matrix)
    else:
        strength = fc_matrix

    # Percentile cutoff over the upper triangle (each edge counted once).
    upper_values = strength[np.triu_indices_from(strength, k=1)]
    percentile_thresh = np.percentile(upper_values, 100 * (1 - keep_ratio))

    mask = strength >= percentile_thresh

    # Construct edges by their definition.
    if args.EDGE_DEF == 'binary':
        return mask
    return fc_matrix * mask

def make_graph(fc):
    """Build an undirected weighted graph-tool graph from a thresholded matrix.

    Only the strict lower triangle of ``fc`` is used, so each edge is added
    once; every non-zero entry becomes an edge whose value is stored in the
    internal ``weight`` edge property (type ``double``).

    The graph always has ``fc.shape[0]`` vertices: building from an edge list
    alone only creates vertices up to the largest index that has an edge, so
    trailing isolated nodes are appended explicitly to keep vertex indices
    aligned with matrix rows.
    """
    n_nodes = fc.shape[0]
    lower = np.tril(fc, k=-1)

    rows, cols = np.where(lower)
    edge_list = list(zip(rows, cols, lower[rows, cols]))

    g = gt.Graph(
        edge_list,
        eprops=[('weight', 'double')],
        directed=False,
    )

    # Pad with isolated vertices that the edge list could not imply.
    missing = n_nodes - g.num_vertices()
    if missing > 0:
        g.add_vertex(missing)

    return g

def save_graph(g, identity, GRAPH_path):
    """Save ``g`` as ``<GRAPH_path>/<identity>_desc-graph.gt.gz`` and return that path."""
    file_name = '_'.join((identity, 'desc-graph.gt.gz'))
    out_path = f'{GRAPH_path}/{file_name}'
    g.save(out_path)
    return out_path

In [12]:
def fisher_z(r, eps=None):
    """Fisher z-transform (arctanh) of correlations, clipped away from +/-1.

    Parameters
    ----------
    r : float or array-like
        Correlation value(s).
    eps : float, optional
        Clipping margin: values are limited to ``[-1 + eps, 1 - eps]`` so
        that arctanh stays finite. Defaults to the notebook-wide ``args.eps``.

    Returns
    -------
    float or numpy.ndarray
        ``arctanh`` of the clipped input.
    """
    if eps is None:
        # Fall back to the global setting only when no margin is supplied.
        eps = args.eps
    clipped = np.clip(r, -1 + eps, 1 - eps)
    return np.arctanh(clipped)

def tidy_corr(R):
    """Return a symmetrized copy of ``R`` with ones on the diagonal.

    The input array is not modified: the average of ``R`` and its transpose
    is a new array, and the diagonal is set on that copy.
    """
    symmetric = 0.5 * (R + R.T)
    symmetric[np.diag_indices_from(symmetric)] = 1.0
    return symmetric

In [13]:
# Option grids for graph construction; trailing comments list alternatives.
GRAPH_DEFS = ['constructed']
GRAPH_METHODS = ['pearson']  # ['pearson', 'mutualinfo']
THRESHOLDINGS = ['signed', 'unsigned']
EDGE_DEFS = ['binary', 'weighted']
EDGE_DENSITIES = [10, 20, 30]  # [10, 15, 20, 25]
LAYER_DEFS = ['individual']  # , 'multilayer']
DATA_UNITS = ['ses', 'sub', 'grp']

In [14]:
# Fixed choices for this notebook; they appear in the output directory names.
args.GRAPH_DEF = 'constructed'
args.GRAPH_METHOD = 'pearson'
args.LAYER_DEF = 'individual'


In [15]:
# Number of time points in each run (computed once; the original cell
# assigned this column twice).
data_df['t_eff'] = data_df['ts'].apply(len)

# Run-level FC matrix and its Fisher z-transform.
data_df['fc'] = data_df['ts'].apply(lambda ts: compute_fc(args, ts))
data_df['z'] = data_df['fc'].apply(fisher_z)

# Averaging weight per run: t_eff - 3, floored just above zero.
# NOTE(review): presumably the inverse variance (n - 3) of a Fisher z - confirm.
data_df['w'] = data_df["t_eff"].clip(lower=3+1e-9) - 3  # ensure >0
data_df

Unnamed: 0,sub,ses,run,task,ts,t_eff,fc,w,z
0,SLC01,1,11,rest,"[[-0.09603879094503522, -0.948000069498075, 0....",533,"[[1.0, -0.10886391766086419, 0.311364838037673...",530,"[[8.40562139102231, -0.10929706343270773, 0.32..."
1,SLC01,1,15,rest,"[[0.22999063513993873, -1.755006218258903, -1....",531,"[[0.9999999999999998, -0.12571264719482755, 0....",528,"[[8.40562139102231, -0.12638123884254845, 0.21..."
2,SLC01,1,19,rest,"[[0.17501412665192317, 2.05601323642688, 1.316...",531,"[[0.9999999999999998, -0.03422790702471271, 0....",528,"[[8.40562139102231, -0.03424128299215276, 0.18..."
3,SLC01,2,10,rest,"[[2.1940397338766564, -0.10901452704916718, 1....",532,"[[0.9999999999999998, 0.03591529175047554, 0.3...",529,"[[8.40562139102231, 0.035930746189347794, 0.35..."
4,SLC01,2,6,rest,"[[0.6019987463186959, -0.13199193256572464, -1...",521,"[[1.0, -0.21089883609353977, 0.220007051311404...",518,"[[8.40562139102231, -0.21411183603152698, 0.22..."
...,...,...,...,...,...,...,...,...,...
81,SLC10,2,9,rest,"[[-1.4239750882089304, 1.0019995779515494, 1.1...",533,"[[1.0, -0.019338181805197915, 0.08170109172096...",530,"[[8.40562139102231, -0.019340592949060504, 0.0..."
82,SLC10,3,13,rest,"[[0.12197742984919095, 0.757978449952126, 0.35...",536,"[[1.0, -0.21659691681487817, 0.260612114253286...",533,"[[8.40562139102231, -0.220082735374864, 0.2667..."
83,SLC10,3,17,rest,"[[-0.6010203919150929, -0.4650008433734304, -0...",537,"[[0.9999999999999998, -0.02282767323078009, 0....",534,"[[8.40562139102231, -0.022831639658123248, 0.1..."
84,SLC10,3,5,rest,"[[-1.8219869276216611, 0.3349962480630326, 0.6...",540,"[[1.0, -0.004312891989673601, 0.22980806545929...",537,"[[8.40562139102231, -0.004312918731393667, 0.2..."


In [16]:
def average_z(group):
    """Weighted average of Fisher-z matrices, plus the back-transformed FC.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to combine; must have a ``'z'`` column of equally shaped 2-D
        arrays and a ``'w'`` column of numeric weights.

    Returns
    -------
    pandas.Series
        ``'z'``: entry-wise weighted mean of the z matrices;
        ``'w'``: sum of the weights;
        ``'fc'``: ``tanh(z)`` passed through ``tidy_corr``.

    Notes
    -----
    NaN entries are ignored per matrix element: the denominator only sums the
    weights of rows whose value at that element is valid. (Dividing by the sum
    of all weights would shrink the average toward zero whenever a value is
    missing.) Elements with no valid contribution are set to 0.
    """
    Z = np.stack(group['z'].to_list()).astype(float)
    # Cast explicitly: a weight column with object dtype would otherwise
    # produce object arrays that np.tanh cannot handle.
    W = group['w'].to_numpy(dtype=float)[:, None, None]

    weighted = W * Z
    valid = ~np.isnan(weighted)
    numerator = np.where(valid, weighted, 0.0).sum(axis=0)
    denominator = np.where(valid, W, 0.0).sum(axis=0)

    z = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )

    fc = tidy_corr(np.tanh(z))
    return pd.Series({
        'z': z,
        'w': group['w'].sum(),
        'fc': fc,
    })

In [17]:
# Session level: weighted average of the run-level z matrices.
# average_z only reads the 'z' and 'w' columns, so the grouping columns are
# excluded from the frames it receives (include_groups=False); this avoids
# the pandas deprecation warning, and reset_index() still restores
# 'sub', 'ses' and 'task' from the group keys.
ses_df = (
    data_df
    .groupby(by=['sub', 'ses', 'task'])
    .apply(average_z, include_groups=False)
    .reset_index()
)
ses_df

  ses_df = data_df.groupby(by=['sub', 'ses', 'task']).apply(average_z, include_groups=True).reset_index()


Unnamed: 0,sub,ses,task,z,w,fc
0,SLC01,1,rest,"[[8.40562139102231, -0.08999756314505507, 0.24...",1586,"[[1.0, -0.08975536752411069, 0.236645312971455..."
1,SLC01,2,rest,"[[8.40562139102231, -0.08777704520550716, 0.29...",1047,"[[1.0, -0.08755230267737592, 0.283925135085443..."
2,SLC01,3,rest,"[[8.405621391022311, 0.014810554526199664, 0.2...",2098,"[[1.0, 0.014809471710361085, 0.213537706881693..."
3,SLC03,1,rest,"[[8.40562139102231, 0.0037629145263606578, 0.2...",1592,"[[1.0, 0.0037628967660995596, 0.25966124935463..."
4,SLC03,2,rest,"[[8.40562139102231, 0.06687237863989279, 0.361...",2118,"[[1.0, 0.06677287409485314, 0.3466844735034332..."
5,SLC03,3,rest,"[[8.405621391022311, 0.04501265705516393, 0.37...",2127,"[[1.0, 0.04498228103559874, 0.3605289791271654..."
6,SLC04,1,rest,"[[8.40562139102231, -0.06939270583361283, 0.25...",2041,"[[1.0, -0.06928153662236834, 0.249713141357357..."
7,SLC04,2,rest,"[[8.40562139102231, -0.07243721355486878, 0.26...",2103,"[[1.0, -0.07231078260296601, 0.254387262054745..."
8,SLC04,3,rest,"[[8.40562139102231, 0.09644310645099868, 0.417...",2094,"[[1.0, 0.09614520021777095, 0.3950022559328045..."
9,SLC05,1,rest,"[[8.40562139102231, 0.07974200673437051, 0.383...",1074,"[[1.0, 0.07957341470886012, 0.3661388463823281..."


In [18]:
# Animal level: weighted average of the session-level z matrices.
# average_z only reads 'z' and 'w', so the grouping columns are excluded
# (include_groups=False) to avoid the pandas deprecation warning;
# reset_index() restores 'sub' and 'task' from the group keys.
sub_df = (
    ses_df
    .groupby(by=['sub', 'task'])
    .apply(average_z, include_groups=False)
    .reset_index()
)
# Every animal gets the same weight in any later averaging.
sub_df['w'] = 1
sub_df

  sub_df = ses_df.groupby(by=['sub', 'task']).apply(average_z, include_groups=True).reset_index()


Unnamed: 0,sub,task,z,w,fc
0,SLC01,rest,"[[8.40562139102231, -0.043028145863930764, 0.2...",1,"[[1.0, -0.0430016111054574, 0.2370529275201532..."
1,SLC03,rest,"[[8.40562139102231, 0.041694051643240145, 0.34...",1,"[[1.0, 0.04166990820248833, 0.3286138315102526..."
2,SLC04,rest,"[[8.405621391022311, -0.014750578359073672, 0....",1,"[[1.0, -0.01474950864404698, 0.301686553438661..."
3,SLC05,rest,"[[8.40562139102231, 0.10473049241848142, 0.306...",1,"[[1.0, 0.10434925367537456, 0.2968628019133200..."
4,SLC06,rest,"[[8.40562139102231, -0.06184230126016711, 0.33...",1,"[[1.0, -0.06176358366593771, 0.322724632269617..."
5,SLC07,rest,"[[8.40562139102231, 0.057454669679421505, 0.25...",1,"[[1.0, 0.057391533008341995, 0.250147832964476..."
6,SLC08,rest,"[[8.40562139102231, 0.10044777385990537, 0.353...",1,"[[1.0, 0.10011130061576282, 0.3394851638533646..."
7,SLC09,rest,"[[8.40562139102231, -0.05760458042311011, 0.24...",1,"[[1.0, -0.05754094869117965, 0.240640629634531..."
8,SLC10,rest,"[[8.40562139102231, -0.034450925176802955, 0.2...",1,"[[1.0, -0.034437302097535614, 0.21292048177397..."


In [19]:
def save_graphs(args, df, ITERS, idx_boot):
    """Threshold, build and save one graph per row of ``df`` for each setting.

    Parameters
    ----------
    args : object
        Settings object. ``THRESHOLD``, ``EDGE_DEF`` and ``EDGE_DENSITY`` are
        overwritten on it for every combination; ``GRAPH_DEF``,
        ``GRAPH_METHOD``, ``LAYER_DEF`` and ``DATA_UNIT`` are read to build
        the output directory.
    df : pandas.DataFrame
        Must have an ``'fc'`` column of connectivity matrices.
    ITERS : iterable
        ``(THRESHOLD, EDGE_DEF, EDGE_DENSITY)`` triples.
    idx_boot : int
        Bootstrap index; the file name is ``boot-<idx:03d>_desc-graph.gt.gz``.

    Notes
    -----
    The file name depends only on ``idx_boot``, so if ``df`` has several rows
    each one overwrites the previous file in the same directory.
    """
    identity = f'boot-{idx_boot:03d}'

    for THRESHOLD, EDGE_DEF, EDGE_DENSITY in ITERS:
        args.THRESHOLD = THRESHOLD
        args.EDGE_DEF = EDGE_DEF
        args.EDGE_DENSITY = EDGE_DENSITY

        ROI_RESULTS_path = (
            f'{ROI_path}'
            f'/graph-{args.GRAPH_DEF}/method-{args.GRAPH_METHOD}'
            f'/threshold-{args.THRESHOLD}/edge-{args.EDGE_DEF}/density-{args.EDGE_DENSITY}'
            f'/layer-{args.LAYER_DEF}/unit-{args.DATA_UNIT}-boot'
        )
        GRAPH_path = f'{ROI_RESULTS_path}/graphs'
        # Create the directory without a shell: safe for any path and raises
        # on failure instead of returning an ignored exit status.
        os.makedirs(GRAPH_path, exist_ok=True)

        for fc in df['fc']:
            g = make_graph(threshold_fc(args, fc))
            save_graph(g, identity, GRAPH_path)

In [20]:
# Bootstrap over animals: resample the rows of sub_df with replacement
# (seeded by the bootstrap index), average them into one group-level row,
# then save a graph for every thresholding combination.
for idx_boot in tqdm(range(500)):
    args.DATA_UNIT = 'grp'

    boot_df = sub_df.sample(frac=1, replace=True, random_state=idx_boot)
    grp_df = average_z(boot_df).to_frame().T

    ITERS = product(THRESHOLDINGS, EDGE_DEFS, EDGE_DENSITIES)
    save_graphs(args, grp_df, ITERS, idx_boot)

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [01:10<00:00,  7.10it/s]
