In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from rbclib import RBCPath

In [2]:
# Paths
rbcdata_path = Path('/home/jovyan/shared/data/RBC')
train_filepath = rbcdata_path / 'train_participants.tsv'
test_filepath = rbcdata_path / 'test_participants.tsv'

# Load the PNC participants TSV files...
with train_filepath.open('r') as f:
    train_data = pd.read_csv(f, sep='\t')
with test_filepath.open('r') as f:
    test_data = pd.read_csv(f, sep='\t')

# We can also concatenate the two datasets into a single dataset of all study participants:
all_data = pd.concat([train_data, test_data])

# Display the full dataframe:
all_data

Unnamed: 0,participant_id,study,study_site,session_id,wave,age,sex,race,ethnicity,bmi,handedness,participant_education,parent_1_education,parent_2_education,p_factor,internalizing_mcelroy_harmonized_all_samples,externalizing_mcelroy_harmonized_all_samples,attention_mcelroy_harmonized_all_samples,cubids_acquisition_group
0,1000393599,PNC,PNC1,PNC1,1,15.583333,Male,Black,not Hispanic or Latino,22.15,Right,9th Grade,Complete primary,Complete secondary,0.589907,-0.449373,-0.630780,-1.842178,1
1,1001970838,PNC,PNC1,PNC1,1,17.833333,Male,Other,Hispanic or Latino,23.98,Right,11th Grade,Complete tertiary,Complete tertiary,-0.659061,0.531072,0.392751,0.190706,1
2,1007995238,PNC,PNC1,PNC1,1,13.750000,Female,Other,not Hispanic or Latino,23.77,Right,6th Grade,Complete tertiary,Complete primary,-1.608375,-0.744118,-0.314187,-0.432662,1
3,1011497669,PNC,PNC1,PNC1,1,16.666667,Male,White,not Hispanic or Latino,29.68,Right,9th Grade,Complete tertiary,Complete tertiary,-1.233807,-0.896835,-0.449099,0.111167,1
4,1017092387,PNC,PNC1,PNC1,1,18.666667,Female,Black,not Hispanic or Latino,23.24,Right,11th Grade,Complete primary,Complete primary,-0.923100,-0.313455,2.204168,-0.782266,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,969649154,PNC,PNC1,PNC1,1,12.333333,Male,White,not Hispanic or Latino,17.38,Right,5th Grade,Complete tertiary,Complete secondary,,-0.148520,0.556444,0.024228,1
530,970890500,PNC,PNC1,PNC1,1,18.166667,Female,White,not Hispanic or Latino,30.89,Right,11th Grade,Complete secondary,Complete secondary,,0.993806,1.578177,-0.373470,1
531,975856179,PNC,PNC1,PNC1,1,11.000000,Male,White,not Hispanic or Latino,15.67,Right,4th Grade,Complete primary,Complete secondary,,-1.026645,-0.582212,1.333857,1
532,984757368,PNC,PNC1,PNC1,1,13.416667,Male,Black,not Hispanic or Latino,16.66,Right,5th Grade,Complete primary,,,0.360029,-0.515655,1.509584,114


In [3]:
# Load function for brain structures (FreeSurfer SurfaceStats)
def load_fsdata(participant_id, local_cache_dir=(Path.home() / 'cache')):
    if local_cache_dir is not None:
        local_cache_dir = Path(local_cache_dir)
        local_cache_dir.mkdir(exist_ok=True)
    pnc_freesurfer_path = RBCPath('rbc://PNC_FreeSurfer/freesurfer', local_cache_dir=local_cache_dir)
    participant_path = pnc_freesurfer_path / f'sub-{participant_id}'
    tsv_path = participant_path / f'sub-{participant_id}_regionsurfacestats.tsv'
    with tsv_path.open('r') as f:
        data = pd.read_csv(f, sep='\t')
    return data

In [4]:
from ipywidgets import IntProgress
from IPython.display import display 

In [5]:
# Collect aparc features for each participant
all_vars = {'participant_id': [], 'p_factor': []}
progress = IntProgress(min=0, max=len(all_data))
display(progress)

all_records = []
for (ii, row) in all_data.iterrows():
    participant_id = row['participant_id']
    p_factor = row.get('p_factor', np.nan)
    try:
        data = load_fsdata(participant_id)
        if 'atlas' in data.columns:
            data = data[data['atlas'] == 'aparc']
        else:
            continue

        record = {'participant_id': participant_id, 'p_factor': p_factor}

        for struct_name in data['StructName'].unique():
            row_mask = (data['StructName'] == struct_name)
            for metric in ['SurfArea', 'GrayVol', 'ThickAvg']:
                if metric in data.columns:
                    values = data.loc[row_mask, metric]
                    if not values.empty:
                        value = values.sum() if metric != 'ThickAvg' else values.mean()
                        record[f"{struct_name}_{metric}"] = value

        all_records.append(record)

    except FileNotFoundError:
        continue
    progress.value += 1

IntProgress(value=0, max=1601)

In [6]:
all_vars = pd.DataFrame(all_records)

train_vars = all_vars[~np.isnan(all_vars['p_factor'])]
test_vars = all_vars[np.isnan(all_vars['p_factor'])]

all_vars

Unnamed: 0,participant_id,p_factor,bankssts_SurfArea,bankssts_GrayVol,bankssts_ThickAvg,caudalanteriorcingulate_SurfArea,caudalanteriorcingulate_GrayVol,caudalanteriorcingulate_ThickAvg,caudalmiddlefrontal_SurfArea,caudalmiddlefrontal_GrayVol,...,frontalpole_ThickAvg,temporalpole_SurfArea,temporalpole_GrayVol,temporalpole_ThickAvg,transversetemporal_SurfArea,transversetemporal_GrayVol,transversetemporal_ThickAvg,insula_SurfArea,insula_GrayVol,insula_ThickAvg
0,1000393599,0.589907,2024,5272,2.8115,1186,4002,2.8760,4332,13053,...,3.1460,1001,5852,3.7405,925,2523,2.5080,4490,14017,3.0450
1,1001970838,-0.659061,2003,5122,2.6610,1120,2993,2.6140,3953,11624,...,2.9440,772,4282,3.5675,783,2038,2.3645,5200,16072,3.0055
2,1007995238,-1.608375,2138,5765,2.8985,1368,4447,2.7920,4306,13952,...,2.6575,846,5064,3.6650,730,2007,2.5880,4093,13800,3.3325
3,1011497669,-1.233807,2086,5387,2.6495,1564,5125,2.7680,4451,14348,...,2.8965,754,4401,3.3695,717,1820,2.4845,5082,15933,2.9905
4,1017092387,-0.923100,1753,4499,2.6755,1259,4348,2.8755,4183,13355,...,2.9625,945,5285,3.5570,696,1984,2.5270,4817,15034,3.0325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,969649154,,2190,5636,2.7400,1554,4868,2.8835,5386,15811,...,2.6820,915,5029,3.5720,852,2655,2.6130,4978,16383,3.1765
1588,970890500,,1609,3906,2.6330,804,2875,3.3575,4061,13061,...,2.6760,741,4282,3.4825,565,1577,2.4760,3525,11338,3.1790
1589,975856179,,2488,6656,2.8785,1563,5023,2.8210,5528,18063,...,2.3380,1014,5371,3.6040,941,2831,2.9020,4511,15248,3.3425
1590,984757368,,2395,6443,2.7775,1054,2819,2.5405,5182,15640,...,2.7155,973,4059,3.0540,946,2564,2.2855,5815,16367,2.9140


In [7]:
# Correlation
missing_thresh = 0.3
missing_frac = train_vars.isna().mean()
feature_cols = missing_frac[(missing_frac < missing_thresh)].index.difference(['participant_id', 'p_factor'])
corr = train_vars[feature_cols.tolist() + ['p_factor']].corr()
top_corr = corr['p_factor'].drop('p_factor').abs().sort_values(ascending=False).head(15)
top_corr

precuneus_ThickAvg              0.193352
superiorparietal_GrayVol        0.190804
precuneus_GrayVol               0.190091
rostralmiddlefrontal_GrayVol    0.181274
postcentral_GrayVol             0.176104
paracentral_ThickAvg            0.176056
superiorfrontal_GrayVol         0.173695
posteriorcingulate_GrayVol      0.168816
postcentral_ThickAvg            0.165617
precentral_GrayVol              0.164963
inferiorparietal_GrayVol        0.161817
bankssts_ThickAvg               0.160552
caudalmiddlefrontal_GrayVol     0.159332
cuneus_ThickAvg                 0.158056
lateralorbitofrontal_GrayVol    0.156392
Name: p_factor, dtype: float64

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

In [9]:
# ML Pipeline with ElasticNet 
from sklearn.linear_model import ElasticNetCV

param_grid = {
    'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', ElasticNetCV(
        l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
        alphas=[0.001, 0.01, 0.1, 1.0, 10.0],
        cv=5,
        max_iter=10000,
        random_state=42
    ))
])

# Drop rows with missing values for model training
X = train_vars[feature_cols.tolist()]
#X = X.dropna()
y = train_vars['p_factor']

pipeline.fit(X, y)

print("Best alpha:", pipeline.named_steps['model'].alpha_)
print("Best l1_ratio:", pipeline.named_steps['model'].l1_ratio_)

# Mean R² score across folds 
mean_r2 = pipeline.named_steps['model'].mse_path_.mean(axis=1)
print("Mean MSEs:", mean_r2)

Best alpha: 0.1
Best l1_ratio: 0.3
Mean MSEs: [[0.87416538 0.96007875 0.80520173 0.8238343  0.85075443]
 [0.87304279 0.95934031 0.80181027 0.82797947 0.85039408]
 [0.87234875 0.95800649 0.79973502 0.82876898 0.84815405]
 [0.87209664 0.95742954 0.79857729 0.83026935 0.84758791]
 [0.87195371 0.95722843 0.79786799 0.83204081 0.84670892]]


In [10]:
# Optional
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f'Final R²: {r2_score(y_test, y_pred):.3f}')
print(f'MAE: {mean_absolute_error(y_test, y_pred):.3f}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}')

Final R²: 0.030
MAE: 0.769
RMSE: 0.925


In [11]:
# Predict on test data
X_final = test_vars[X.columns]
test_okrows = np.all(~np.isnan(X_final.values), axis=1)
X_valid = X_final.loc[test_okrows]
predictions = pipeline.predict(X_valid)

# Apply predictions to subset of test_vars
pred_ids = test_vars.loc[test_okrows, 'participant_id']
results = pd.DataFrame({
    'participant_id': pred_ids,
    'p_factor_pred': predictions
})

test_data.loc[test_data['participant_id'].isin(pred_ids), 'p_factor'] = predictions

group_name = 'miniproj-group4'  
Path("results").mkdir(parents=True, exist_ok=True)
test_data.to_csv(f'results/{group_name}.tsv', sep='\t', index=False)