# Sep 19, 2025: reduce size of bootstrap estimates
Sample 1,000 partitions in total (allocated across the modes according to each mode's weight `omega`) from the partition modes, which were built from 15,000 partitions.

conda env: gt

In [1]:
import csv
import os
import sys
import numpy as np
import pandas as pd
import scipy as sp 
import dill as pickle 
from scipy import stats
from os.path import join as pjoin
from itertools import product
from tqdm import tqdm
from copy import deepcopy
from pathlib import Path
import re

import glob
import random

from itertools import product, combinations
import multiprocessing as mp
from functools import partial
from joblib import Parallel, delayed

from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from munkres import Munkres

# networks
import graph_tool.all as gt

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.cm import rainbow

# Plot styling. The font settings must be written to `plt.rcParams` (the active
# configuration); assigning to `plt.rcParamsDefault` only edits the stored
# defaults and leaves the figures drawn in this session unchanged.
plt.rcParams['font.family'] = "sans-serif"
# Prefer Arial, but keep the existing sans-serif fallbacks so a machine without
# Arial still resolves a font. Filtering out "Arial" first keeps this cell
# idempotent when it is re-run.
plt.rcParams['font.sans-serif'] = ["Arial"] + [
    name for name in plt.rcParams['font.sans-serif'] if name != "Arial"
]
plt.rcParams['font.size'] = 14
plt.rcParams["errorbar.capsize"] = 0.5

import colorcet as cc

# Silence warnings. NOTE: with no `category` argument this ignores every
# warning category, not only UserWarning (the category filter is commented out).
import warnings
warnings.filterwarnings("ignore") #, category=UserWarning)

In [2]:
class ARGS:
    """Empty attribute container; run settings are attached to an instance as plain attributes."""


# Single settings object for the notebook.
args = ARGS()
args.SEED = 100

In [3]:
# Parcellation settings (values tried previously are noted inline).
args.source = 'allen'    # previously also: 'spatial'
args.space = 'ccfv2'
args.brain_div = 'whl'
args.num_rois = 172      # previously also: 162
args.resolution = 200

# Descriptor string built from the settings above, as `key-value` pairs joined by '_'.
PARC_DESC = '_'.join([
    f'source-{args.source}',
    f'space-{args.space}',
    f'braindiv-{args.brain_div}',
    f'nrois-{args.num_rois}',
    f'res-{args.resolution}',
])
PARC_DESC

'source-allen_space-ccfv2_braindiv-whl_nrois-172_res-200'

In [4]:
# Graph-construction settings; each one becomes a component of the results path below.
args.GRAPH_DEF = 'constructed'
args.GRAPH_METHOD = 'pearson'
args.THRESHOLD = 'signed'
args.EDGE_DEF = 'binary'
args.EDGE_DENSITY = 20
args.LAYER_DEF = 'individual'
args.DATA_UNIT = 'grp-boot'

# Directory layout, rooted at $HOME/new_mouse_dataset.
BASE_path = f'{os.environ["HOME"]}/new_mouse_dataset'
PARCELS_path = f'{BASE_path}/parcels'
ROI_path = (
    f'{BASE_path}/roi-results-v3'
    f'/{PARC_DESC}'
)
TS_path = f'{ROI_path}/roi_timeseries'
ROI_RESULTS_path = (
    f'{ROI_path}'
    f'/graph-{args.GRAPH_DEF}/method-{args.GRAPH_METHOD}'
    f'/threshold-{args.THRESHOLD}/edge-{args.EDGE_DEF}/density-{args.EDGE_DENSITY}'
    f'/layer-{args.LAYER_DEF}/unit-{args.DATA_UNIT}'
)
GRAPH_path = f'{ROI_RESULTS_path}/graphs'
SBM_path = f'{ROI_RESULTS_path}/model-fits'
DIAG_path = f'{ROI_RESULTS_path}/diagnostics'
ESTIM_path = f'{ROI_RESULTS_path}/estimates'

# Create the output directories. os.makedirs is used instead of shelling out to
# `mkdir -p`: it handles paths containing spaces or shell metacharacters and
# raises on failure instead of returning an ignored exit status.
for out_dir in (
    GRAPH_path,
    SBM_path,
    DIAG_path,
    f'{ESTIM_path}/individual',
    f'{ESTIM_path}/group',
):
    os.makedirs(out_dir, exist_ok=True)

0

In [5]:
# Model selection: `dc` picks the 'dc'/'nd' tag in the model name and `sbm` is the
# model code; the code 'h' is the one treated as nested.
args.dc = False
args.sbm = 'd'
args.nested = (args.sbm == 'h')

# Iteration budget; half as many draws as forced iterations.
args.force_niter = 100000
args.num_draws = int((1/2) * args.force_niter)

# threshold KSD for convergence
args.epsilon = 0.4
# one hundredth of the iteration budget, rounded up
args.delta = np.ceil(args.force_niter / 100).astype(int)

def sbm_name(args):
    """Return the model tag ``'sbm-<dc>-<sbm>'`` for the given settings.

    ``<dc>`` is ``'dc'`` when ``args.dc`` is truthy and ``'nd'`` otherwise,
    except for the model codes ``'a'`` and ``'m'``, where it is left empty.
    ``<sbm>`` is ``args.sbm`` itself.
    """
    use_dc = args.dc
    if args.sbm in ['a', 'm']:
        prefix = f''
    elif use_dc:
        prefix = f'dc'
    else:
        prefix = f'nd'
    return f'sbm-{prefix}-{args.sbm}'

# Model tag for the current settings; it is embedded in the estimate filenames.
SBM = sbm_name(args)
SBM

'sbm-nd-d'

In [6]:
# Load one graph: the first file in GRAPH_path in sorted filename order.
# (`recursive=True` was dropped: it only has an effect with a `**` pattern.)
graph_files = sorted(glob.glob(f'{GRAPH_path}/*'))
if not graph_files:
    # fail with a clear message instead of an IndexError on [0]
    raise FileNotFoundError(f'no graph files found in {GRAPH_path}')
gfile = graph_files[0]
g = gt.load_graph(gfile)
g

<Graph object, undirected, with 172 vertices and 2942 edges, 1 internal edge property, at 0x7efb5d665e80>

In [7]:
def _sample_from_modes(indests_df, partitions_of):
    """Draw partitions without replacement from every mode listed in `indests_df`.

    Parameters
    ----------
    indests_df : pandas.DataFrame
        One row per mode, with columns 'mode' (the mode object), 'boot' and
        'num_samples' (how many partitions to draw from that mode).
    partitions_of : callable
        Given a mode object, returns a dict whose values are its stored partitions.

    Returns
    -------
    pandas.DataFrame
        Columns 'boot', 'mode_id' (the index label of the source row) and 'b'
        (one sampled partition per row). Empty, with the same columns, when
        `indests_df` has no rows.
    """
    boots, mode_ids, sampled = [], [], []
    for idx, row in tqdm(indests_df.iterrows()):
        pool = list(partitions_of(row['mode']).values())
        # random.sample raises ValueError when asked for more items than exist,
        # so never request more partitions than the mode actually holds.
        k = min(int(row['num_samples']), len(pool))
        bs = random.sample(pool, k)
        boots += [row['boot']] * len(bs)
        mode_ids += [idx] * len(bs)
        sampled += bs
    # build the frame once instead of concatenating one small frame per mode
    return pd.DataFrame(dict(boot=boots, mode_id=mode_ids, b=sampled))


def sample_partitions(args, indests_df):
    """Sample flat partitions from each mode via `mode.get_partitions()`.

    `args` is not used; it is kept so the call signature stays unchanged.
    At most `num_samples` partitions are drawn per mode, capped at the number
    of partitions the mode holds. Returns a DataFrame with columns
    'boot', 'mode_id' and 'b'.
    """
    return _sample_from_modes(indests_df, lambda mode: mode.get_partitions())


def sample_nested_partitions(args, indests_df):
    """Sample nested partitions from each mode via `mode.get_nested_partitions()`.

    `args` is not used; it is kept so the call signature stays unchanged.
    At most `num_samples` partitions are drawn per mode, capped at the number
    of partitions the mode holds. Returns a DataFrame with columns
    'boot', 'mode_id' and 'b'.
    """
    return _sample_from_modes(indests_df, lambda mode: mode.get_nested_partitions())

In [8]:
def create_reduced_indiv_estim(args, indiv_file):
    """Build and save a reduced partition-mode table for one bootstrap.

    Loads the pickled mode table at `indiv_file`, draws a limited number of
    partitions from every mode (about `omega * 1000` each, or 1 when
    `omega <= 0.01`), rebuilds one `gt.PartitionModeState` per mode from the
    drawn partitions, and pickles the resulting table to
    `{ESTIM_path}/individual/boot-<id>/partition-modes-reduced/{SBM}_desc-df.pkl`.

    Parameters
    ----------
    args : object
        Settings; reads `args.sbm` and `args.nested`, reads `args.SEED` when
        present, and sets `args.total_samples = 1000`.
    indiv_file : str
        Path of the input pickle; must contain a `boot-<digits>` component.
        The loaded table is expected to have the columns 'boot', 'sbm',
        'mode_id', 'mode', 'omega' and 'sigma'.

    Returns
    -------
    str or None
        Path of the written pickle on success. On any failure the error is
        reported on stderr and None is returned, so one bad bootstrap does
        not abort a batch run.
    """
    try:
        # the bootstrap id is encoded in the path as `boot-<digits>`
        match = re.search(r'boot-(\d+)', indiv_file)
        if match is None:
            raise ValueError(f'no boot-<id> component in path: {indiv_file}')
        boot, = match.groups()

        # Seed the sampler per bootstrap so results are reproducible even when
        # the calls run in separate worker processes; a string seed keeps the
        # streams of different bootstraps distinct.
        seed = getattr(args, 'SEED', None)
        if seed is not None:
            random.seed(f'{seed}-boot-{boot}')

        # load individual estimates
        # NOTE(review): unpickling can execute arbitrary code; only load files
        # produced by this pipeline.
        with open(indiv_file, 'rb') as f:
            df = pickle.load(f)

        # make output folder
        red_indiv_path = f'{ESTIM_path}/individual/boot-{boot}/partition-modes-reduced'
        os.makedirs(red_indiv_path, exist_ok=True)

        # sample partitions per mode
        args.total_samples = 1000
        df['num_samples'] = df['omega'].apply(
            lambda x: np.round(x * args.total_samples).astype(int) if x > 0.01 else 1
        )
        if args.sbm in ['m', 'a', 'd']:
            all_bs_df = sample_partitions(args, df)
        elif args.sbm in ['h']:
            all_bs_df = sample_nested_partitions(args, df)
        else:
            raise ValueError(f'unsupported args.sbm: {args.sbm!r}')

        # create the indiv_estim_df
        # NOTE(review): the samples are labelled with df's index, while the
        # lookup below uses the 'mode_id' column -- assumes the two coincide;
        # confirm against the code that writes the input table.
        red_df = []
        for mode_id, group in all_bs_df.groupby('mode_id'):
            mode = gt.PartitionModeState(group['b'], relabel=False, nested=args.nested, converge=False)
            r = df[df['mode_id'] == mode_id].reset_index(drop=True)
            row = pd.DataFrame(dict(
                boot=[r['boot'][0]],
                sbm=[r['sbm'][0]],
                mode_id=[mode_id],
                mode=[mode],
                omega=[r['omega'][0]],
                sigma=[r['sigma'][0]],
            ))
            red_df += [row]
        red_df = pd.concat(red_df).reset_index(drop=True)

        # save the df
        out_file = f'{red_indiv_path}/{SBM}_desc-df.pkl'
        with open(out_file, 'wb') as f:
            pickle.dump(red_df, f)
        return out_file
    except Exception as err:
        # Best-effort per file, but never silent: say which input failed and why.
        print(
            f'create_reduced_indiv_estim failed for {indiv_file}: '
            f'{type(err).__name__}: {err}',
            file=sys.stderr,
        )
        return None

In [9]:

# Collect the per-bootstrap partition-mode tables for the current model tag, in sorted order.
indiv_pattern = f'{ESTIM_path}/individual/boot-*/partition-modes/{SBM}_desc-df.pkl'
indiv_files = sorted(glob.glob(indiv_pattern))
len(indiv_files)

200

In [10]:
# Reduce every bootstrap file using 10 parallel joblib workers; `results` holds
# one entry per input file, in input order.
reduce_one = delayed(create_reduced_indiv_estim)
results = Parallel(n_jobs=10)(
    reduce_one(args, path) for path in indiv_files
)

14it [00:00, 298.49it/s]
5it [00:00, 100.36it/s]
7it [00:00, 139.90it/s]
12it [00:00, 271.67it/s]
15it [00:00, 229.54it/s]
16it [00:00, 356.78it/s]
7it [00:00, 143.50it/s]
4it [00:00, 79.04it/s]
6it [00:00, 114.08it/s]
2it [00:00, 37.40it/s]
9it [00:00, 198.32it/s]
4it [00:00, 79.58it/s]
6it [00:00, 113.10it/s]
9it [00:00, 173.42it/s]
13it [00:00, 281.57it/s]
10it [00:00, 202.30it/s]
17it [00:00, 371.59it/s]
5it [00:00, 96.95it/s]
5it [00:00, 101.65it/s]
11it [00:00, 229.22it/s]
7it [00:00, 132.85it/s]
12it [00:00, 243.08it/s]
9it [00:00, 144.20it/s]
10it [00:00, 212.26it/s]
3it [00:00, 58.29it/s]
4it [00:00, 76.87it/s]
10it [00:00, 210.11it/s]
7it [00:00, 136.10it/s]
6it [00:00, 114.13it/s]
15it [00:00, 314.69it/s]
7it [00:00, 143.06it/s]
4it [00:00, 76.31it/s]
10it [00:00, 209.91it/s]
5it [00:00, 104.28it/s]
6it [00:00, 116.72it/s]
5it [00:00, 98.21it/s]
5it [00:00, 107.76it/s]
3it [00:00, 58.10it/s]
11it [00:00, 233.91it/s]
14it [00:00, 293.25it/s]
10it [00:00, 212.40it/s]
8it [00:0