# Imports

In [1]:
from rbclib import RBCPath
from pathlib import Path
import pandas as pd
import numpy as np

# Load data


In [2]:
# Shared directory that holds the RBC participant tables.
rbcdata_path = Path('/home/jovyan/shared/data/RBC')
train_filepath = rbcdata_path.joinpath('train_participants.tsv')
test_filepath = rbcdata_path.joinpath('test_participants.tsv')

# Both tables are tab-separated; each is read through an explicitly opened
# text handle that is closed as soon as parsing finishes.
with train_filepath.open('r') as f:
    train_data = pd.read_csv(f, delimiter='\t')
with test_filepath.open('r') as f:
    test_data = pd.read_csv(f, delimiter='\t')

# Stack the training rows on top of the test rows. The row labels of each
# table are kept as they are, so labels may repeat in the combined frame.
all_data = pd.concat((train_data, test_data))


In [3]:
def load_fsdata(participant_id, local_cache_dir=(Path.home() / 'cache')):
    """Load a PNC participant's FreeSurfer regional surface statistics.

    Parameters
    ----------
    participant_id
        Participant identifier without the ``sub-`` prefix; it is
        interpolated into the ``sub-{participant_id}`` directory and file
        names.
    local_cache_dir : path-like or None, optional
        Directory handed to ``RBCPath`` as its ``local_cache_dir``. When it
        is not ``None`` it is created if missing, including any missing
        parent directories. ``None`` is passed through to ``RBCPath``
        unchanged. Defaults to a ``cache`` directory inside the home
        directory (resolved once, when the function is defined).

    Returns
    -------
    pandas.DataFrame
        The participant's ``sub-{participant_id}_regionsurfacestats.tsv``
        file parsed as a tab-separated table.
    """
    # Make sure the cache directory exists. `parents=True` lets callers pass
    # a nested location whose parent directories have not been created yet;
    # without it, mkdir raises FileNotFoundError for such paths.
    if local_cache_dir is not None:
        local_cache_dir = Path(local_cache_dir)
        local_cache_dir.mkdir(parents=True, exist_ok=True)

    # Root of the PNC FreeSurfer data. All paths made from this object use
    # the same cache directory.
    pnc_freesurfer_path = RBCPath(
        'rbc://PNC_FreeSurfer/freesurfer',
        local_cache_dir=local_cache_dir)
    participant_path = pnc_freesurfer_path / f'sub-{participant_id}'
    tsv_path = participant_path / f'sub-{participant_id}_regionsurfacestats.tsv'

    # Parse the tab-separated statistics file and hand back the table.
    with tsv_path.open('r') as f:
        return pd.read_csv(f, sep='\t')

# Option A: average every numeric column within each atlas


In [69]:
def load_features_by_region(participant_id, regions):
    """Build one flat feature vector for a participant.

    For every entry of `regions`, the rows of the participant's FreeSurfer
    table whose ``atlas`` column equals that entry are selected, and each
    numeric column is averaged over those rows. The averages are labelled
    ``"{column}_{region}"`` and concatenated in the order of `regions`.

    Parameters
    ----------
    participant_id
        Identifier forwarded to ``load_fsdata``; also used as the name of
        the returned Series.
    regions : str or iterable of str
        Values to match against the ``atlas`` column. A single string is
        treated as one name rather than iterated character by character.

    Returns
    -------
    pandas.Series
        Feature vector named `participant_id`. An entry of `regions` that
        matches no rows contributes NaN for every numeric column. An empty
        `regions` yields an empty Series.
    """
    data = load_fsdata(participant_id)

    # Guard against a bare string, which would otherwise be looped over one
    # character at a time.
    if isinstance(regions, str):
        regions = [regions]

    # The numeric part of the table does not depend on the region, so it is
    # selected once instead of on every iteration.
    numeric_data = data.select_dtypes(include='number')
    numeric_cols = numeric_data.columns

    region_vectors = []
    for region in regions:
        row_mask = data['atlas'] == region

        if row_mask.any():
            vec = numeric_data.loc[row_mask].mean()
        else:
            # No matching rows: keep the feature layout, filled with NaN.
            vec = pd.Series(float('nan'), index=numeric_cols, dtype=float)

        # Suffix every feature with the name it was selected by.
        vec.index = [f"{col}_{region}" for col in vec.index]
        region_vectors.append(vec)

    if region_vectors:
        full_vector = pd.concat(region_vectors)
    else:
        # pd.concat raises ValueError when given nothing to concatenate.
        full_vector = pd.Series(dtype=float)
    full_vector.name = participant_id
    return full_vector

In [None]:
# Atlas names handed to load_features_by_region for every participant.
regions = ['aparc']
X_rows = []
participant_ids = []

# Only the id is needed, so walk the id column directly. DataFrame.iterrows()
# builds a new Series for every row and does not preserve column dtypes,
# which could alter the id before it is formatted into a file name.
for pid in all_data['participant_id']:
    try:
        feats = load_features_by_region(pid, regions)
    except Exception as e:
        # Best effort: report the participant that failed and carry on, so
        # one failing participant does not abort the whole run.
        print(f"Error loading participant {pid}: {e}")
    else:
        X_rows.append(feats)
        participant_ids.append(pid)

# One row per successfully loaded participant, indexed by participant id.
X_matrix = pd.DataFrame(X_rows, index=participant_ids)

# Option B: mean GrayVol per structure in the aparc atlas

In [86]:
def get_features_subject(data, atlas='aparc', measure='GrayVol'):
    """Summarise one subject's FreeSurfer table as a single feature row.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``atlas``, ``StructName``, ``subject_id`` and `measure`
        columns, holding the rows of exactly one subject.
    atlas : str, optional
        Value of the ``atlas`` column to keep. Defaults to ``'aparc'``.
    measure : str, optional
        Column averaged within each ``StructName``. Defaults to
        ``'GrayVol'``.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per ``StructName`` (the mean of
        `measure` over that structure's rows) followed by a ``subject_id``
        column, so that rows of several subjects can be concatenated.

    Raises
    ------
    ValueError
        If `data` does not contain exactly one distinct ``subject_id``.
    """
    # The result has exactly one row, so it can carry exactly one id.
    subject_ids = data['subject_id'].unique()
    if len(subject_ids) != 1:
        raise ValueError(
            "expected data for exactly one subject, "
            f"found {len(subject_ids)} distinct subject_id values")

    # Keep the requested atlas, average the measure per structure, and turn
    # the resulting column into a single wide row.
    df_ = (data
     .loc[data['atlas'] == atlas]
     .groupby('StructName')
     [measure]
     .mean()
     .to_frame()
     .transpose()
    )
    # Tidy the labels: name the column axis and drop the row label that
    # transpose() carried over from the measure's column name.
    df_.columns.name = 'index'
    df_ = df_.reset_index(drop=True)
    # Tag the row with its subject so frames can be concatenated later on.
    df_['subject_id'] = subject_ids[0]
    return df_


In [87]:
# Distinct participant ids, in order of first appearance.
participants = list(all_data['participant_id'].unique())

# Load and summarise only the first three participants as a quick preview.
feature_df_list = [
    get_features_subject(load_fsdata(participant))
    for participant in participants[:3]
]

# One row per participant; displayed as the cell's output.
feature_df = pd.concat(feature_df_list)
feature_df

index,bankssts,caudalanteriorcingulate,caudalmiddlefrontal,cuneus,entorhinal,frontalpole,fusiform,inferiorparietal,inferiortemporal,insula,...,precuneus,rostralanteriorcingulate,rostralmiddlefrontal,superiorfrontal,superiorparietal,superiortemporal,supramarginal,temporalpole,transversetemporal,subject_id
0,2636.0,2001.0,6526.5,4226.0,2625.5,1433.0,9963.5,16145.0,11816.5,7008.5,...,11686.5,2612.0,17090.5,25891.5,14106.5,15273.0,11645.5,2926.0,1261.5,sub-1000393599
0,2561.0,1496.5,5812.0,3069.0,1506.0,1046.0,9044.5,13748.5,10105.5,8036.0,...,9481.0,2166.5,15741.5,25257.0,12508.5,14181.5,12747.5,2141.0,1019.0,sub-1001970838
0,2882.5,2223.5,6976.0,3144.0,2025.0,1154.0,11627.5,18001.0,11854.5,6900.0,...,10801.5,2456.0,17091.0,23280.5,14245.5,13120.0,12955.0,2532.0,1003.5,sub-1007995238


index,bankssts,caudalanteriorcingulate,caudalmiddlefrontal,cuneus,entorhinal,frontalpole,fusiform,inferiorparietal,inferiortemporal,insula,...,precuneus,rostralanteriorcingulate,rostralmiddlefrontal,superiorfrontal,superiorparietal,superiortemporal,supramarginal,temporalpole,transversetemporal,subject_id
0,2636.0,2001.0,6526.5,4226.0,2625.5,1433.0,9963.5,16145.0,11816.5,7008.5,...,11686.5,2612.0,17090.5,25891.5,14106.5,15273.0,11645.5,2926.0,1261.5,sub-1000393599


In [None]:
# Look up the 'pfactor' value of every participant present in X_matrix, in
# the same order as the rows of X_matrix.
y_train = (
    all_data
    .set_index('participant_id')
    ['pfactor']
    .loc[X_matrix.index]
)
