# BrainGraph 86-node set — minimal EDA

This notebook:
- Loads `.graphml` brain connectomes (86-node set) with **NetworkX**.
- Inspects node/edge attributes.
- Summarizes per-graph stats across many files.
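- Plots simple distributions.
- Converts the graphs to PyTorch Geometric `Data` objects and trains a small GraphSAGE model to predict subject age.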

> Update `DATA_DIR` to your folder (defaults to `data/HCP/86_nodes`).

In [None]:
from pathlib import Path
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch_geometric.data import Data


In [None]:
# Folder holding the .graphml connectomes; edit this to match your local layout.
DATA_DIR = Path("data/HCP/86_nodes")
assert DATA_DIR.exists(), f"DATA_DIR does not exist: {DATA_DIR}"

# Sorted so the file order (and any slice taken from it later) is the same on every run.
files = sorted(DATA_DIR.glob("*.graphml"))

print(f"Found {len(files)} .graphml files in {DATA_DIR}")
print("First 5 files:")
for graph_file in files[:5]:
    print("  ", graph_file.name)

Found 1064 .graphml files in data/HCP/86_nodes
First 5 files:
   100206_repeated10_scale33.graphml
   100307_repeated10_scale33.graphml
   100408_repeated10_scale33.graphml
   100610_repeated10_scale33.graphml
   101006_repeated10_scale33.graphml


In [None]:
# HCP phenotype table (CSV); fail fast if it is not where we expect it.
PHENO_PATH = Path("data/HCP/HCP_phenotypes.csv")
assert PHENO_PATH.exists(), f"PHENO_PATH does not exist: {PHENO_PATH}"

pheno = pd.read_csv(PHENO_PATH)

# Sanity check: is the age column present in the table?
has_age_column = "Age_in_Yrs" in pheno.columns
print(has_age_column)

True


In [None]:
# Preview the first five rows of the phenotype table.
pheno.head(n=5)

Unnamed: 0,Subject,Release,Acquisition,Gender,Age,Age_in_Yrs,HasGT,ZygositySR,ZygosityGT,Family_ID,...,MOV2_TRACKFRAC,MOV2_TRFRAC,MOV3_TRACKFRAC,MOV3_TRFRAC,MOV4_TRACKFRAC,MOV4_TRFRAC,MOV_EYETRACK_COMPL,REST_TRACKFRAC_MIN,REST_TRFRAC_MIN,REST_EYETRACK_COMPL
0,100004,S900,Q06,M,22-25,24,True,NotTwin,,52259_82122,...,,,,,,,,,,
1,100206,S900,Q11,M,26-30,27,True,NotTwin,,56037_85858,...,,,,,,,,,,
2,100307,Q1,Q01,F,26-30,27,True,NotMZ,MZ,51488_81352,...,,,,,,,,,,
3,100408,Q3,Q03,M,31-35,33,True,MZ,MZ,51730_81594,...,,,,,,,,,,
4,100610,S900,Q08,M,26-30,27,True,NotMZ,DZ,52813_82634,...,,,,,,,,,,


In [None]:
# Read the first connectome and report its size
example_path = files[0]
g = nx.read_graphml(example_path)
print("loaded:", example_path.name)
print("nodes:", g.number_of_nodes(), "edges:", g.number_of_edges())

# Attribute dict stored on the first node
n0 = list(g.nodes())[0]
print("\nexample node:", n0)
print("node attrs:", g.nodes[n0])

# Attribute dict stored on the first edge
e0 = list(g.edges())[0]
print("\nexample edge:", e0)
print("edge attrs:", g.edges[e0])

loaded: 100206_repeated10_scale33.graphml
nodes: 83 edges: 725

example node: 1
node attrs: {'dn_position_x': 34.0889628925, 'dn_position_y': 83.5585156993, 'dn_position_z': 8.72454804948, 'dn_correspondence_id': '1', 'dn_region': 'cortical', 'dn_fsname': 'lateralorbitofrontal', 'dn_name': 'rh.lateralorbitofrontal', 'dn_hemisphere': 'right'}

example edge: ('1', '34')
edge attrs: {'fiber_length_mean': 13.3253736496, 'FA_mean': 0.202591825277, 'number_of_fibers': 10.125}


## Create a smaller subjects CSV containing only subject ID and age

In [None]:
# Reduce the phenotype table to subject ID + age under shorter column names.
meta = (
    pheno[['Subject', 'Age_in_Yrs']]
    .rename(columns={'Subject': 'subject_id', 'Age_in_Yrs': 'age'})
    # the raw data stores IDs as integers; convert them to strings
    .astype({'subject_id': str})
)

# Persist the reduced table, then show its (rows, columns) as the cell output.
meta.to_csv("data/HCP/HCP_subjects_age_only.csv", index=False)
meta.shape


(1206, 2)

In [None]:
def graph_to_data(graph_path):
    """Convert one ``.graphml`` connectome into a PyTorch Geometric ``Data`` object.

    Parameters
    ----------
    graph_path : str or pathlib.Path
        Path to the GraphML file. The subject ID is the part of the file name
        before the first underscore.

    Returns
    -------
    torch_geometric.data.Data
        ``x``: float32 tensor of shape ``(num_nodes, 3)`` holding the
        ``dn_position_x/y/z`` node attributes.
        ``edge_index``: int64 tensor with two rows in which every edge of the
        graph appears once in each direction.
        ``y``: float32 tensor of shape ``(1,)`` with the subject's age, looked
        up in the notebook-level ``meta`` table.

    Raises
    ------
    IndexError
        If ``meta`` has no row for the subject ID derived from the file name.
    """
    graph_path = Path(graph_path)  # also accept plain string paths
    G = nx.read_graphml(graph_path)
    nodes = list(G.nodes())
    idx = {n: i for i, n in enumerate(nodes)}

    # Store each edge in both directions so message passing is symmetric.
    src, dst = [], []
    for u, v in G.edges():
        ui, vi = idx[u], idx[v]
        src += [ui, vi]
        dst += [vi, ui]
    edge_index = torch.tensor([src, dst], dtype=torch.long)

    # Node features: 3-D position of every region.
    # The attributes must come from the graph loaded in this call (G), not from
    # any notebook-level graph variable; otherwise every subject would receive
    # the coordinates of whichever graph that variable happens to hold.
    coords = np.zeros((len(nodes), 3))
    for n in nodes:
        attrs = G.nodes[n]
        coords[idx[n]] = [
            float(attrs['dn_position_x']),
            float(attrs['dn_position_y']),
            float(attrs['dn_position_z']),
        ]

    x = torch.tensor(coords, dtype=torch.float32)

    # Label (age)
    subject_id = graph_path.name.split('_')[0]
    ages = meta.loc[meta.subject_id == subject_id, "age"].values
    if len(ages) == 0:
        # Same exception type as an out-of-range `.values[0]`, but with a
        # message that names the offending subject and file.
        raise IndexError(
            f"no age found in meta for subject_id {subject_id!r} "
            f"(file: {graph_path.name})"
        )
    y = torch.tensor([ages[0]], dtype=torch.float32)

    return Data(x=x, edge_index=edge_index, y=y)

In [None]:
# Try the conversion on the first N files before scaling up to the full set.
N = 200
subset_files = files[:N]

data_list = list(map(graph_to_data, subset_files))
len(data_list)


200

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the graphs for testing, then split 20% of the remainder off as validation.
all_indices = range(len(data_list))
train_idx, test_idx = train_test_split(all_indices, test_size=0.2, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.2, random_state=42)

# Materialise the three index lists into lists of graphs.
train_data, val_data, test_data = (
    [data_list[i] for i in split]
    for split in (train_idx, val_idx, test_idx)
)


In [None]:
import torch.nn as nn
from torch_geometric.nn import SAGEConv

In [None]:
class GraphSAGERegressor(nn.Module):
    """Two-layer GraphSAGE network that predicts one scalar for a single graph.

    Parameters
    ----------
    in_channels : int, default 3
        Number of input features per node. The default matches the three
        position coordinates that ``graph_to_data`` stores in ``data.x``.
    hidden_channels : int, default 16
        Width of both GraphSAGE layers.
    """

    def __init__(self, in_channels=3, hidden_channels=16):
        super().__init__()
        # The first layer must accept as many features as data.x has columns;
        # a mismatch fails with "mat1 and mat2 shapes cannot be multiplied".
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.readout = nn.Linear(hidden_channels, 1)

    def forward(self, data):
        """Return the prediction for one graph as a tensor of shape ``(1,)``.

        ``data`` must provide ``x`` (node features) and ``edge_index``.
        Pooling averages over all rows of ``x``, so ``data`` is treated as a
        single graph, not a mini-batch of several graphs.
        """
        x, edge_index = data.x, data.edge_index
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        x = x.mean(dim=0)  # graph level mean pooling
        return self.readout(x)

In [None]:
# Seed PyTorch before the model is built so weight initialisation is repeatable
# from run to run (same seed value as the train/test split).
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GraphSAGERegressor().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.L1Loss()  # L1 loss = mean absolute error

def run_batch(data_list, train=True):
    """Run one pass over ``data_list`` and return the mean per-graph loss.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``.

    Parameters
    ----------
    data_list : sequence of graph data objects
        Graphs to process one at a time (each graph is its own step).
    train : bool, default True
        If True, take an optimizer step after every graph. If False, only
        evaluate: the model is put in eval mode and no gradients are tracked.

    Returns
    -------
    float
        Sum of the per-graph losses divided by ``len(data_list)``.

    Raises
    ------
    ValueError
        If ``data_list`` is empty (the mean loss would be undefined).
    """
    if len(data_list) == 0:
        raise ValueError("run_batch() needs at least one graph; got an empty data_list")

    # train(True) enables training mode, train(False) is equivalent to eval().
    model.train(train)
    total_loss = 0.0
    # Skip autograd bookkeeping entirely during evaluation passes.
    with torch.set_grad_enabled(train):
        for data in data_list:
            data = data.to(device)
            pred = model(data)
            loss = loss_fn(pred, data.y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
    return total_loss / len(data_list)

# Train for 30 epochs; report train/validation MAE on every fifth epoch.
for epoch in range(1, 31):
    train_loss = run_batch(train_data, train=True)
    val_loss = run_batch(val_data, train=False)
    if epoch % 5 != 0:
        continue
    print(f"epoch {epoch:3d} | train MAE {train_loss:.3f} | val MAE {val_loss:.3f}")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (83x3 and 1x16)

In [None]:
# Final evaluation on the held-out test graphs (no optimizer steps).
test_mae = run_batch(data_list=test_data, train=False)
print(f"Test MAE: {test_mae:.3f} years")

Test MAE: 3.084 years
