In [None]:
# Run-mode switch read by the loading cell below:
#   True  -> process every file in the time-series folder
#   False -> stop early (the loading loop breaks at directory-listing index 20) for a faster trial run
batch_mode = True

In [None]:
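# Load each subject's time-series CSV and keep exactly 140 time points per subject:
# recordings shorter than 140 rows are skipped, longer ones are truncated to their first 140 rows.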

import os 
import pandas as pd
import re

folder_path = "abide_timeseries/aal_csv" # can also be cc200

# Number of leading time points kept per subject; shorter recordings are skipped.
n_timepoints = 140
# With batch_mode off, only directory entries up to this listing index (inclusive) are visited.
quick_run_last_index = 20

# subj_id (string without leading zeros) -> DataFrame with exactly `n_timepoints` rows
subjects = dict()

# Diagnosis labels (DX_GROUP) keyed by subject id as a string.
# Kept under its own name so the per-file frames below do not overwrite it.
phenotypic_df = pd.read_csv("abide_timeseries/phenotypic.csv")
label_dict = dict(zip(phenotypic_df["SUB_ID"].astype(str), phenotypic_df["DX_GROUP"]))

for index, filename in enumerate(sorted(os.listdir(folder_path))):
    # Early stop for quick runs. It is checked before anything else so that a
    # non-CSV entry or a read error at the cut-off index cannot skip the stop
    # (the previous `index == 20` test lived inside the CSV/try branch).
    if not batch_mode and index > quick_run_last_index:
        break

    if not filename.endswith(".csv"):
        continue

    file_path = os.path.join(folder_path, filename)
    try:
        df = pd.read_csv(file_path)
        df = df.drop(index=0).reset_index(drop=True) # skips second row which is metadata

        if len(df) < n_timepoints:
            continue

        # Subject id is the first run of digits starting with "00" in the file name.
        match = re.search(r'00\d+', filename)
        if match:
            subj_id = str(int(match.group()))  # remove leading zeros

            trimmed_df = df.iloc[:n_timepoints]
            subjects[subj_id] = trimmed_df

    except Exception as e:
        # Best effort: report the problematic file and keep loading the others.
        print(f"Error reading {filename}: {e}")

print("finished getting subject data frames (140 timepoints)")
print(f"subject_dfs length: {len(subjects)}")


In [None]:
# Sanity check: show the key, type/shape and a short preview of the first `subjects` entry
if not subjects:
    print("subjects is empty (build it in the earlier cell first)")
else:
    first_key, first_value = next(iter(subjects.items()))
    print(f"First subject key: {first_key}")

    # Only objects exposing `.shape` get the shape suffix
    shape_suffix = f" shape={first_value.shape}" if hasattr(first_value, "shape") else ""
    print(f"Value type: {type(first_value)}{shape_suffix}")

    print("Preview:")
    try:
        print(first_value.head())
    except Exception:
        # Values without a usable `.head()` are printed as they are
        print(first_value)


In [None]:
# get windows per subject df, total per subject is 11 slices/windows
# (140 rows, window length 40, step 10 -> starts at 0, 10, ..., 100)
window_length = 40  # rows per window
window_step = 10    # offset between consecutive window starts

subject_df_windows = dict() # subj_id -> list of window DataFrames

for subj_id, df in subjects.items():
    # Start offsets come from the frame's own length rather than a hard-coded 140,
    # so the windows always match the rows that are actually present.
    last_start = len(df) - window_length
    windows = [
        df.iloc[start:start + window_length].copy()
        for start in range(0, last_start + 1, window_step)
    ]
    subject_df_windows[subj_id] = windows

print("finished getting slices from dfs")

if subject_df_windows:
    avg_windows_length = sum(len(df_windows) for df_windows in subject_df_windows.values())
    print(f"avg windows per df: {avg_windows_length/len(subject_df_windows)}\n")

    first_subject_windows = next(iter(subject_df_windows.values()))
    if first_subject_windows:
        print("sample window from first df:")
        print(first_subject_windows[0])
else:
    # Without this guard the average below would divide by zero.
    print("subject_df_windows is empty (no subjects were loaded)")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# subj_id -> list of Pearson correlation (FC) DataFrames, one per window
subject_fcs = {
    subj_id: [
        # NaN correlations are replaced with 0 (zero connectivity assumption)
        df_window.corr(method='pearson').fillna(0)
        for df_window in subject_windows
    ]
    for subj_id, subject_windows in subject_df_windows.items()
}

print("finished getting functional connectivity matrices")

total_fc_matrices = sum(len(fc_list) for fc_list in subject_fcs.values())
print(f"avg fc matrices per subj: {total_fc_matrices/len(subject_fcs.values())}\n")

# First window's FC matrix of the first subject, used for both the printout and the plot
first_fc_matrix = next(iter(next(iter(subject_fcs.values()))))

print("sample first fc matrix from first subject:")
print(first_fc_matrix)

print("visualization of fc matrix:")
sns.heatmap(first_fc_matrix, cmap='coolwarm', center=0)
plt.title("Subject 0 - Window 0 FC Matrix")
plt.show()


In [None]:
# edges for the GCN (global z-score across subjects)
import numpy as np

subject_adjacency_matrices = dict()
subject_order = []


def _raw_adjacency(fc_stack, eps=1e-8):
    """Blend mean FC strength with lag-1 window-to-window stability into one edge-weight matrix.

    `fc_stack` is the stacked FC matrices of one subject (windows along axis 0).
    The returned matrix has its diagonal set to 0.
    """
    # Mean FC over windows: sum along the window axis divided by the window count
    strength = np.sum(fc_stack, axis=0) / fc_stack.shape[0]

    # Per-edge correlation between the window series and the same series shifted by one window
    earlier = fc_stack[:-1, :, :]
    later = fc_stack[1:, :, :]
    cov = np.mean((earlier - np.mean(earlier, axis=0)) * (later - np.mean(later, axis=0)), axis=0)
    stability = cov / (np.std(earlier, axis=0) * np.std(later, axis=0) + eps)

    # Raw weights per subject (no per-subject min–max)
    adjacency = (strength + stability) / 2
    np.fill_diagonal(adjacency, 0)
    return adjacency


for subj_id, subject_fc in subject_fcs.items():
    adj_matrix = _raw_adjacency(np.stack(subject_fc))

    if np.isnan(adj_matrix).any():
        print('has nan for subject', subj_id)

    subject_adjacency_matrices[subj_id] = adj_matrix
    subject_order.append(subj_id)

print("finished computing raw adjacency matrices per subject")

# All subjects stacked along a new leading axis: shape [S, N, N]
all_adj = np.stack([subject_adjacency_matrices[sid] for sid in subject_order], axis=0)

# Z-score every edge with the mean/std of that edge across subjects
global_mean = np.mean(all_adj, axis=0)
global_std = np.std(all_adj, axis=0)
eps_z = 1e-8
all_adj_z = (all_adj - global_mean) / (global_std + eps_z)

# Replace each subject's raw matrix with its z-scored slice
subject_adjacency_matrices.update(zip(subject_order, all_adj_z))

print("applied global z-score normalization across subjects [S,N,N]")

first_adj_z = next(iter(subject_adjacency_matrices.values()))

print("sample adjacency matrix from first subject (z-scored):")
print(first_adj_z)

print("visualization of adjacency matrix (global z-score):")
sns.heatmap(first_adj_z, cmap='coolwarm', center=0)
plt.title("Subject 0 - Adjacency Matrix (global z-score)")
plt.show()


In [None]:
# np.save does not create missing directories, so make sure the output folder exists first.
os.makedirs("gcn_input", exist_ok=True)
# Rows follow the insertion order of `subject_adjacency_matrices`.
np.save("gcn_input/subject_adjacency_matrices.npy", np.array(list(subject_adjacency_matrices.values())))

In [None]:
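# feed the fc matrices to the random walker plainly
# each step is intralayer when the current region has at least one connection above the threshold in the
# current window (the next region is picked uniformly at random among those strong neighbours);
# otherwise the walker makes an interlayer jump to the next window, staying on the same region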
import random

def random_walk(matrices, num_walks=200, walk_length=100, threshold=0.7):
    """Generate random walks over a sequence of connectivity matrices (one layer per window).

    Each walk starts at a random layer and a random region. At every step the
    current region is recorded; then, if the region has neighbours whose
    connection in the current layer is strictly above `threshold`, the walker
    moves to one of them chosen uniformly at random (intralayer step).
    Otherwise it moves to the next layer while staying on the same region
    (interlayer step, which shows up as a repeated region id in the walk).
    A walk ends after `walk_length` recorded regions, or when an interlayer
    step is required from the last layer.

    The last layer is explored like any other one. Previously a walk stopped as
    soon as it reached the last layer, so that layer's connections were never
    used and walks starting there contained a single region.

    Args:
        matrices: sequence of square connectivity matrices (DataFrames, arrays
            or nested lists), all over the same regions.
        num_walks: number of walks to generate.
        walk_length: maximum number of regions recorded per walk.
        threshold: a connection must be strictly greater than this value to be
            followed.

    Returns:
        A list of `num_walks` walks, each a list of integer region indices.
        An empty `matrices` yields an empty list.

    Note:
        Randomness comes from the global `np.random` and `random` states; seed
        both for reproducible walks.
    """
    # Convert once up front: plain array row indexing replaces a pandas `.iloc`
    # lookup on every step.
    layers = [np.asarray(matrix) for matrix in matrices]
    if not layers:
        return []

    n_layers = len(layers)
    n_regions = layers[0].shape[0]

    walks = []
    for _ in range(num_walks):
        t = np.random.randint(0, n_layers)   # random starting layer
        i = np.random.randint(0, n_regions)  # random starting region
        walk = []

        for _ in range(walk_length):
            walk.append(i)

            row = layers[t][i]
            # Regions strongly connected to the current one, excluding itself
            strong_neighbours = [
                index for index in np.flatnonzero(row > threshold).tolist() if index != i
            ]

            if strong_neighbours:
                i = random.choice(strong_neighbours)
            else:
                t += 1
                if t == n_layers:
                    # No layer left to jump to: the walk is finished
                    break

        walks.append(walk)
    return walks


In [None]:
# nodes for the GCN (global, label-aware Word2Vec + per-subject fine-tune)
from gensim.models import Word2Vec
from copy import deepcopy

subject_embedding_matrices = {}  # subj_id -> node-embedding matrix, filled by the fine-tuning loop in this cell
subject_ids = list()

# 1) Build global corpus with label tokens
#    - Each subject's walk becomes a sentence of ROI ids as strings
#    - We inject a label token so the embedding learns class-aware co-occurrences
#      ABIDE DX_GROUP: 1=ASD, 2=Control (map others conservatively)

def _label_token_for_subject(subj_id: str) -> str:
    """Return the corpus label token for a subject, looked up in `label_dict`.

    The id is converted to a string before the lookup. Values 1/"1" map to the
    ASD token and 2/"2" to the control token; any other value (including None
    for an id missing from `label_dict`) is embedded verbatim in the token.
    """
    raw = label_dict.get(str(subj_id))
    known_tokens = (
        ((1, "1"), "__LABEL_ASD__"),
        ((2, "2"), "__LABEL_CTL__"),
    )
    for accepted_values, token in known_tokens:
        if raw in accepted_values:
            return token
    # Unexpected value: build the token from whatever was found
    return f"__LABEL_{raw}__"

# Walks are generated once per subject and kept, so the fine-tuning step reuses
# exactly the sentences that went into the global corpus
subject_sentences = dict()
global_sentences = []

print("Building global corpus for Word2Vec (with label tokens)...")
for subj_id, subject_fc in subject_fcs.items():
    subj_label_tok = _label_token_for_subject(subj_id)
    subject_walks = random_walk(subject_fc)

    # One sentence per non-empty walk: ROI ids as strings, wrapped in the
    # subject's label token at both ends
    sentences = [
        [subj_label_tok, *map(str, walk), subj_label_tok]
        for walk in subject_walks
        if len(walk) > 0
    ]

    subject_sentences[subj_id] = sentences
    global_sentences += sentences

print(f"Total sentences in global corpus: {len(global_sentences)}")

# 2) One Word2Vec model trained on the sentences of all subjects (label tokens included)
word2vec_params = dict(
    vector_size=128,
    window=2,
    min_count=0,
    sg=1,
    workers=4,
    epochs=10,
    negative=5,
)
global_model = Word2Vec(sentences=global_sentences, **word2vec_params)

# 3) Per-subject fine-tuning to get subject-specific node embeddings
#    We start from the global weights, then adapt on that subject's sentences.
#    Finally, we extract per-ROI embeddings (n_rois x vector_size) for the GCN.

print("Fine-tuning per subject and extracting node embeddings...")
for subj_id, sentences in subject_sentences.items():
    print(f"subject: {subj_id}")

    # Make a subject-specific copy and fine-tune briefly
    model_subj = deepcopy(global_model)
    if len(sentences) > 0:
        model_subj.train(sentences, total_examples=len(sentences), epochs=5)

    # Take the ROI count from this subject's FC matrices and the width from the
    # model instead of hard-coding 116 x 128, so the node features always have
    # one row per node of the adjacency matrix (e.g. when another atlas is used).
    n_rois = subject_fcs[subj_id][0].shape[0]
    embedding_dim = model_subj.wv.vector_size

    # Build embedding matrix for ROI tokens '0'..str(n_rois - 1);
    # ROIs missing from the vocabulary keep their all-zero row.
    embedding_matrix = np.zeros((n_rois, embedding_dim), dtype=np.float32)
    for roi_idx in range(n_rois):
        token = str(roi_idx)
        if token in model_subj.wv:
            embedding_matrix[roi_idx] = model_subj.wv[token]

    subject_embedding_matrices[subj_id] = embedding_matrix

# 4) Save outputs for downstream
# np.save does not create missing directories, so make sure the output folder exists first.
os.makedirs("gcn_input", exist_ok=True)
# Both arrays follow the insertion order of `subject_embedding_matrices`, so row k of the
# embeddings belongs to subject_ids[k].
np.save("gcn_input/subject_embedding_matrices.npy", np.array(list(subject_embedding_matrices.values())))
np.save("gcn_input/subject_ids.npy", np.array(list(subject_embedding_matrices.keys())))

print(next(iter(subject_adjacency_matrices.values())).shape)