In [1]:
#import relevant libraries
import os
import subprocess
import numpy as np
import pandas as pd
import sys; sys.path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import explained_variance_score, r2_score
from sklearn.linear_model import Ridge

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, GroupKFold, GroupShuffleSplit

In [2]:
# read and clean HCP data
# set base directories
HCP_base_dir   = '/Users/elvishadhamala/Documents/yale/HCP/'


def _require_same_subjects(label, ids, reference_ids):
    """Raise ValueError unless `ids` matches `reference_ids` row for row.

    The tables in this cell are paired with each other purely by row position,
    so a file with a different subject order (or a different number of
    subjects) would otherwise pair one participant's brain measures with
    another participant's scores without any error being raised.
    """
    if not np.array_equal(np.asarray(ids), np.asarray(reference_ids)):
        raise ValueError('%s: subject IDs do not match the behavioural data row for row' % label)


# load dataframes for the cortical areal-level properties
# surface area, gray matter volume, and cortical thickness
HCP_surf_area_df = pd.read_csv(os.path.join(HCP_base_dir, 'group_net_surfarea_native_freesurfer.csv'))
HCP_gray_vol_df = pd.read_csv(os.path.join(HCP_base_dir, 'group_net_grayvol_native_freesurfer.csv'))
HCP_thick_df = pd.read_csv(os.path.join(HCP_base_dir, 'group_net_thickavg_native_freesurfer.csv'))

# load subj behavioural, family and freesurfer data
HCP_subj_data_df = pd.read_csv(os.path.join(HCP_base_dir, 'hcp_behaviour.csv'))
HCP_subj_fam_df = pd.read_csv(os.path.join(HCP_base_dir, 'hcp_restricted.csv'))
HCP_subj_fs_df = pd.read_csv(os.path.join(HCP_base_dir, 'hcp_freesurfer.csv'))

# keep only the subjects that have the cortical areal-level properties
HCP_mask_data=np.isin(HCP_subj_data_df.Subject, HCP_surf_area_df.id)
HCP_subj_data_df = HCP_subj_data_df[HCP_mask_data]

HCP_mask_fam=np.isin(HCP_subj_fam_df.Subject, HCP_surf_area_df.id)
HCP_subj_fam_df = HCP_subj_fam_df[HCP_mask_fam]

HCP_mask_fs=np.isin(HCP_subj_fs_df.Subject, HCP_surf_area_df.id)
HCP_subj_fs_df = HCP_subj_fs_df[HCP_mask_fs]

# every table is about to be given the behavioural table's row labels by
# position, so make sure they really list the same subjects in the same order
for label, ids in (('surface area', HCP_surf_area_df.id),
                   ('gray matter volume', HCP_gray_vol_df.id),
                   ('cortical thickness', HCP_thick_df.id),
                   ('family data', HCP_subj_fam_df.Subject),
                   ('freesurfer data', HCP_subj_fs_df.Subject)):
    _require_same_subjects(label, ids, HCP_subj_data_df.Subject)

HCP_surf_area_df = HCP_surf_area_df.set_index(HCP_subj_data_df.index)
HCP_gray_vol_df = HCP_gray_vol_df.set_index(HCP_subj_data_df.index)
HCP_thick_df = HCP_thick_df.set_index(HCP_subj_data_df.index)
HCP_subj_fs_df = HCP_subj_fs_df.set_index(HCP_subj_data_df.index)

# intracranial volume, used below to normalise the areal measures
HCP_icv = pd.DataFrame(HCP_subj_fs_df.FS_InterCranial_Vol)

# drop the id and 'none' columns
HCP_surf_area_df = HCP_surf_area_df.drop(columns=['id', 'lh_None', 'rh_None'])
HCP_gray_vol_df = HCP_gray_vol_df.drop(columns=['id', 'lh_None', 'rh_None'])
HCP_thick_df = HCP_thick_df.drop(columns=['id', 'lh_None', 'rh_None'])

# cognitive measures, renamed by position to readable labels
HCP_cog = HCP_subj_data_df[["CogFluidComp_Unadj", "CogEarlyComp_Unadj", "CogTotalComp_Unadj", "CogCrystalComp_Unadj",
                  "PicSeq_Unadj", "CardSort_Unadj", "Flanker_Unadj", "PMAT24_A_CR", "ReadEng_Unadj", 
                  "PicVocab_Unadj", "ProcSpeed_Unadj", "DDisc_AUC_40K", "VSPLOT_TC", "SCPT_SEN", "SCPT_SPEC", 
                  "IWRD_TOT", "ListSort_Unadj", "MMSE_Score", "Language_Task_Math_Avg_Difficulty_Level", 
                  "Language_Task_Story_Avg_Difficulty_Level", "Relational_Task_Acc", "WM_Task_Acc"]] 

col_headers_main = ['Fluid Composite', 'Early Composite', 'Total Composite',
               'Crystal Composite', 'Visual Episodic Memory', 'Cognitive Flexibility (Card Sort)',
               'Inhibition (Flanker)', 'Fluid Intelligence (PMAT)', 'Reading Decoding', 
               'Vocabulary Comprehension', 'Processing Speed', 'Delay Discounting',
               'Spatial Orientation', 'Sustained Attention - Sens.', 
               'Sustained Attention - Spec.', 'Verbal Episodic Memory', 
               'Working Memory (List Sorting)', 'Cognitive Status', 'Arithmetic',
               'Story Comprehension', 'Relational Processing', 'Working Memory (N-Back)']

HCP_cog.columns = col_headers_main

# keep only the measures that are predicted later on
HCP_cog = HCP_cog[['Fluid Composite', 'Total Composite', 'Crystal Composite',
                     'Visual Episodic Memory', 'Cognitive Flexibility (Card Sort)',
                     'Inhibition (Flanker)', 'Reading Decoding', 'Vocabulary Comprehension', 
                     'Processing Speed', 'Working Memory (List Sorting)']]

# get rid of all the subjects with nans in any of the kept measures
# 1-D positional mask (True = complete case) used for the row selections below
HCP_complete = (~HCP_cog.isna().any(axis=1)).values
# same information in the (1, n_subjects) shape this variable has always had
HCP_mask = np.asarray([HCP_complete])

HCP_surf_area = HCP_surf_area_df[HCP_complete]
HCP_gray_vol = HCP_gray_vol_df[HCP_complete]
HCP_thick = HCP_thick_df[HCP_complete]
HCP_icv = HCP_icv[HCP_complete]
HCP_cog = HCP_cog[HCP_complete]
HCP_fam = HCP_subj_fam_df.loc[HCP_complete]
HCP_subj = HCP_subj_data_df.loc[HCP_complete]

# get normalised measures (each subject's row divided by that subject's icv),
# labelled with the same rows as the behavioural data
HCP_surf_area_norm = pd.DataFrame(HCP_surf_area.values/HCP_icv.values, columns=HCP_surf_area.columns, index=HCP_subj.index)
HCP_gray_vol_norm = pd.DataFrame(HCP_gray_vol.values/HCP_icv.values, columns=HCP_gray_vol.columns, index=HCP_subj.index)
HCP_thick_norm = pd.DataFrame(HCP_thick.values/HCP_icv.values, columns=HCP_thick.columns, index=HCP_subj.index)

# the family table still carries the row labels of its own file; give it the
# behavioural row labels so boolean masks built from HCP_subj line up with it
HCP_fam = HCP_fam.set_index(HCP_subj.index)

In [3]:
# read and clean up ABCD data
# set base directories
ABCD_base_dir   = '/Users/elvishadhamala/Documents/yale/ABCD'

# cortical areal-level properties (surface area, gray matter volume, cortical
# thickness) and intracranial volume; none of these files has a header row
ABCD_surf_area_df, ABCD_gray_vol_df, ABCD_thick_df, ABCD_icv_df = [
    pd.read_csv(os.path.join(ABCD_base_dir, file_name), header=None)
    for file_name in ('ABCD_group_surfarea.csv', 'ABCD_group_grayvol.csv',
                      'ABCD_group_thickavg.csv', 'ABCD_icv.csv')
]

# subject demographics and cognitive scores
ABCD_subj = pd.read_csv(os.path.join(ABCD_base_dir, 'ABCD_1823_demo_cog.csv'))

# the areal files are transposed before use
# (presumably stored as features x subjects -- TODO confirm)
ABCD_surf_area = ABCD_surf_area_df.transpose()
ABCD_gray_vol = ABCD_gray_vol_df.transpose()
ABCD_thick = ABCD_thick_df.transpose()

# get normalised measures (divided by icv)
ABCD_surf_area_norm, ABCD_gray_vol_norm, ABCD_thick_norm = [
    pd.DataFrame(measure.values/ABCD_icv_df.values, columns=measure.columns)
    for measure in (ABCD_surf_area, ABCD_gray_vol, ABCD_thick)
]

# everything that is not an ID / demographic column is treated as a cognitive score
ABCD_cog = ABCD_subj.drop(columns=['subjectkey', 'src_subject_id', 'sex', 'race_ethnicity', 'site_id_l'])

# readable labels, assigned by column position
col_headers_main = [
    'Vocabulary Comprehension',
    'Inhibition (Flanker)',
    'Working Memory (List Sorting)',
    'Cognitive Flexibility (Card Sort)',
    'Processing Speed',
    'Visual Episodic Memory',
    'Reading Decoding',
    'Fluid Composite',
    'Crystal Composite',
    'Total Composite',
    'RAVLT - Trial VI Correct',
    'RAVLT - Trial VII Correct',
    'WISC-V - Total Raw Score',
    'LMT - % Correct',
    'LMT - RT Correct',
    'LMT Efficiency',
]
ABCD_cog.columns = col_headers_main

# keep the same ten measures, in the same order, as the HCP cognition table
ABCD_cog = ABCD_cog[[
    'Fluid Composite', 'Total Composite', 'Crystal Composite',
    'Visual Episodic Memory', 'Cognitive Flexibility (Card Sort)',
    'Inhibition (Flanker)', 'Reading Decoding', 'Vocabulary Comprehension',
    'Processing Speed', 'Working Memory (List Sorting)',
]]

In [4]:
# split every HCP table by sex
# males
mask_sex = HCP_subj.Gender=='M'
(HCP_subj_m, HCP_surfarea_m, HCP_grayvol_m, HCP_thick_m,
 HCP_surfarea_norm_m, HCP_grayvol_norm_m, HCP_thick_norm_m,
 HCP_fam_m, HCP_cog_m) = [
    table[mask_sex]
    for table in (HCP_subj, HCP_surf_area, HCP_gray_vol, HCP_thick,
                  HCP_surf_area_norm, HCP_gray_vol_norm, HCP_thick_norm,
                  HCP_fam, HCP_cog)
]

# females
mask_sex = HCP_subj.Gender=='F'
(HCP_subj_f, HCP_surfarea_f, HCP_grayvol_f, HCP_thick_f,
 HCP_surfarea_norm_f, HCP_grayvol_norm_f, HCP_thick_norm_f,
 HCP_fam_f, HCP_cog_f) = [
    table[mask_sex]
    for table in (HCP_subj, HCP_surf_area, HCP_gray_vol, HCP_thick,
                  HCP_surf_area_norm, HCP_gray_vol_norm, HCP_thick_norm,
                  HCP_fam, HCP_cog)
]



In [5]:
# split every ABCD table by sex
# males
mask_sex = ABCD_subj.sex=='M'
(ABCD_subj_m, ABCD_surfarea_m, ABCD_grayvol_m, ABCD_thick_m,
 ABCD_surfarea_norm_m, ABCD_grayvol_norm_m, ABCD_thick_norm_m,
 ABCD_cog_m) = [
    table[mask_sex]
    for table in (ABCD_subj, ABCD_surf_area, ABCD_gray_vol, ABCD_thick,
                  ABCD_surf_area_norm, ABCD_gray_vol_norm, ABCD_thick_norm,
                  ABCD_cog)
]

# females
mask_sex = ABCD_subj.sex=='F'
(ABCD_subj_f, ABCD_surfarea_f, ABCD_grayvol_f, ABCD_thick_f,
 ABCD_surfarea_norm_f, ABCD_grayvol_norm_f, ABCD_thick_norm_f,
 ABCD_cog_f) = [
    table[mask_sex]
    for table in (ABCD_subj, ABCD_surf_area, ABCD_gray_vol, ABCD_thick,
                  ABCD_surf_area_norm, ABCD_gray_vol_norm, ABCD_thick_norm,
                  ABCD_cog)
]


In [9]:
# display the cognitive scores of the male HCP participants
HCP_cog_m

Unnamed: 0,Fluid Composite,Total Composite,Crystal Composite,Visual Episodic Memory,Cognitive Flexibility (Card Sort),Inhibition (Flanker),Reading Decoding,Vocabulary Comprehension,Processing Speed,Working Memory (List Sorting)
1,134.34,137.66,117.33,125.07,119.14,130.42,113.54600,119.8914,138.72,112.89
3,116.64,137.53,134.34,125.71,111.14,121.18,131.81000,134.2400,107.08,108.06
4,124.23,153.36,144.96,109.04,129.84,126.53,141.31660,140.8151,111.11,117.39
6,117.37,126.37,120.73,105.60,119.76,107.04,119.24340,119.8459,112.27,130.38
8,107.14,112.60,115.59,102.89,99.76,113.67,106.91730,123.3551,105.28,117.39
...,...,...,...,...,...,...,...,...,...,...
1198,124.93,117.89,103.71,109.96,122.18,112.21,94.23567,112.9724,146.95,104.06
1199,129.94,140.97,125.92,105.63,136.10,126.53,125.39000,123.1400,119.54,121.89
1201,111.07,109.50,107.68,89.89,126.37,124.64,108.69000,106.0900,114.31,96.99
1203,100.46,122.98,133.94,87.10,112.17,123.22,141.31660,122.3772,83.25,108.06


In [None]:
# ---- resampling scheme ----
rep = 100                  # number of repeated train/test splits
k = 3                      # number of folds in the cross-validation
train_size = .66           # proportion of the data used for training ...
test_size = 1-train_size   # ... and the remainder for testing

# ---- model ----
# ridge regression; alpha is tuned by the grid search
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell needs an older scikit-learn to run
regr = Ridge(solver='lsqr', max_iter=1000000, normalize=True)

# hyperparameter values to search through for the model
# adapted from the Thomas Yeo Lab Github:
# ThomasYeoLab/CBIG/blob/master/stable_projects/predict_phenotypes/He2019_KRDNN/KR_HCP/CBIG_KRDNN_KRR_HCP.m
alphas = [
    0, 0.00001, 0.0001, 0.001, 0.004, 0.007, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7,
    1, 1.5, 2, 2.5, 3, 3.5, 4, 5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100, 150,
    200, 300, 500, 700, 1000, 10000,
]
paramGrid = dict(alpha=alphas)

# ---- inputs (X) and targets (Y) for each cohort ----
# change the right-hand sides to use a different brain measure as input
X_HCP_m, X_HCP_f = HCP_surfarea_norm_m, HCP_surfarea_norm_f
X_ABCD_m, X_ABCD_f = ABCD_surfarea_norm_m, ABCD_surfarea_norm_f

Y_HCP_m, Y_HCP_f = HCP_cog_m, HCP_cog_f
Y_ABCD_m, Y_ABCD_f = ABCD_cog_m, ABCD_cog_f

# ---- sizes used to allocate the result arrays ----
n_cog = len(Y_HCP_m.columns)    # number of cognitive measures to predict
n_feat = len(X_HCP_m.columns)   # number of input features
n_test = 4                      # number of test sets each model is scored on

In [None]:
#create arrays to store variables

def _new_result_arrays():
    """Return one zero-filled set of result arrays for a single training cohort.

    In order:
      r2        [rep, n_cog, n_test]  r^2 - coefficient of determination
      var       [rep, n_cog, n_test]  explained variance
      corr      [rep, n_cog, n_test]  correlation between true and predicted
                                      (aka prediction accuracy)
      opt_alpha [rep, n_cog]          optimised alpha (hyperparameter)
      featimp   [rep, n_feat, n_cog]  feature importance extracted from the model
      haufe     [rep, n_feat, n_cog]  Haufe-inverted feature weights
    """
    per_test_shape = [rep, n_cog, n_test]
    per_feature_shape = [rep, n_feat, n_cog]
    return (
        np.zeros(per_test_shape),
        np.zeros(per_test_shape),
        np.zeros(per_test_shape),
        np.zeros([rep, n_cog]),
        np.zeros(per_feature_shape),
        np.zeros(per_feature_shape),
    )


# results of the models trained on ABCD males
(r2_ABCD_m, var_ABCD_m, corr_ABCD_m, opt_alpha_ABCD_m,
 featimp_ABCD_m, featimp_haufe_ABCD_m) = _new_result_arrays()

# results of the models trained on ABCD females
(r2_ABCD_f, var_ABCD_f, corr_ABCD_f, opt_alpha_ABCD_f,
 featimp_ABCD_f, featimp_haufe_ABCD_f) = _new_result_arrays()

# results of the models trained on HCP males
(r2_HCP_m, var_HCP_m, corr_HCP_m, opt_alpha_HCP_m,
 featimp_HCP_m, featimp_haufe_HCP_m) = _new_result_arrays()

# results of the models trained on HCP females
(r2_HCP_f, var_HCP_f, corr_HCP_f, opt_alpha_HCP_f,
 featimp_HCP_f, featimp_haufe_HCP_f) = _new_result_arrays()

In [None]:
#train one ridge model per cohort, per cognitive measure and per repetition,
#and score every model on the held-out test set of all four cohorts

#one entry per cohort: its inputs (X), targets (Y), the grouping variable that
#must not be split between train and test (family for HCP, site for ABCD) and
#the arrays its results are written into.
#The ORDER of this list fixes the meaning of the last axis of the r2/var/corr
#arrays: 0 = HCP males, 1 = HCP females, 2 = ABCD males, 3 = ABCD females
cohorts = [
    {'X': X_HCP_m, 'Y': Y_HCP_m, 'groups': HCP_fam_m['Family_ID'],
     'r2': r2_HCP_m, 'var': var_HCP_m, 'corr': corr_HCP_m, 'alpha': opt_alpha_HCP_m,
     'featimp': featimp_HCP_m, 'featimp_haufe': featimp_haufe_HCP_m},
    {'X': X_HCP_f, 'Y': Y_HCP_f, 'groups': HCP_fam_f['Family_ID'],
     'r2': r2_HCP_f, 'var': var_HCP_f, 'corr': corr_HCP_f, 'alpha': opt_alpha_HCP_f,
     'featimp': featimp_HCP_f, 'featimp_haufe': featimp_haufe_HCP_f},
    {'X': X_ABCD_m, 'Y': Y_ABCD_m, 'groups': ABCD_subj_m['site_id_l'],
     'r2': r2_ABCD_m, 'var': var_ABCD_m, 'corr': corr_ABCD_m, 'alpha': opt_alpha_ABCD_m,
     'featimp': featimp_ABCD_m, 'featimp_haufe': featimp_haufe_ABCD_m},
    {'X': X_ABCD_f, 'Y': Y_ABCD_f, 'groups': ABCD_subj_f['site_id_l'],
     'r2': r2_ABCD_f, 'var': var_ABCD_f, 'corr': corr_ABCD_f, 'alpha': opt_alpha_ABCD_f,
     'featimp': featimp_ABCD_f, 'featimp_haufe': featimp_haufe_ABCD_f},
]

#iterate through number of models
for p in range(rep):
    #print model # you're on
    print('Model %d' %(p+1))

    #group-wise train/test split of every cohort; the repetition number is the
    #seed, so each repetition is reproducible
    splits = []
    for cohort in cohorts:
        splitter = GroupShuffleSplit(test_size=1-train_size, n_splits=1, random_state = p)
        train_inds, test_inds = next(splitter.split(cohort['X'], groups=cohort['groups']))

        x_train = cohort['X'].iloc[train_inds].values
        splits.append({
            'x_train': x_train,
            'x_test': cohort['X'].iloc[test_inds].values,
            'cog_train': cohort['Y'].iloc[train_inds].values,
            'cog_test': cohort['Y'].iloc[test_inds].values,
            'groups_train': cohort['groups'].iloc[train_inds],
            #feature covariance of the training set, needed for the Haufe
            #inversion; it does not depend on the behaviour being predicted,
            #so it is computed once per split
            'cov_x': np.cov(np.transpose(x_train)),
        })

    #iterate through the cognitive metrics you want to predict
    for cog in range (n_cog):

        #print cognitive metric being predicted
        print ("Behaviour: %s" % Y_HCP_m.columns[cog])

        for cohort, split in zip(cohorts, splits):
            x_train = split['x_train']
            y_train = split['cog_train'][:,cog]

            #choose alpha by grid search with group k-fold CV inside the
            #training set, using the same grouping variable as the outer split
            gridSearch = GridSearchCV(estimator=regr, param_grid=paramGrid, n_jobs=1, verbose=0,
                                      cv=GroupKFold(n_splits=k), scoring='explained_variance')
            gridSearch.fit(x_train, y_train, groups=split['groups_train'])

            #save optimised alpha value
            cohort['alpha'][p,cog] = gridSearch.best_params_['alpha']

            #fit model using optimised hyperparameter
            #NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
            #removed in 1.2, so this needs an older scikit-learn to run
            model = Ridge(alpha = cohort['alpha'][p,cog], normalize=True, max_iter=1000000, solver='lsqr')
            model.fit(x_train, y_train)

            #score the model on the test set of every cohort (own cohort included)
            for t, test_split in enumerate(splits):
                x_test = test_split['x_test']
                y_test = test_split['cog_test'][:,cog]
                preds = model.predict(x_test).ravel()

                #r^2 (coefficient of determination)
                cohort['r2'][p,cog,t] = model.score(x_test, y_test)
                #explained variance
                cohort['var'][p,cog,t] = explained_variance_score(y_test, preds)
                #correlation between true and predicted (prediction accuracy)
                cohort['corr'][p,cog,t] = np.corrcoef(y_test.ravel(), preds)[1,0]

            #extract feature importance
            cohort['featimp'][p,:,cog] = model.coef_
            #compute Haufe-inverted feature weights: cov(X) . w, scaled by
            #the inverse of the variance of the training targets
            #NOTE(review): confirm that scaling by var(y_train) is the intended
            #normalisation for the Haufe transform
            cov_y = np.cov(y_train)
            cohort['featimp_haufe'][p,:,cog] = np.matmul(split['cov_x'], cohort['featimp'][p,:,cog])*(1/cov_y)
        
 

In [None]:
# save results
base_dir  = '/Users/elvishadhamala/Documents/yale/HCP_ABCD_preds_results'

# create the output folder if needed, so that a missing directory cannot make
# the very first np.save fail after the whole analysis has finished
os.makedirs(base_dir, exist_ok=True)

# label used in the file name for each array of a cohort, in the order the
# arrays are listed below
metric_labels = ('r2', 'var', 'corr', 'alpha', 'featimp', 'featimp_haufe')

results_by_cohort = [
    ('HCP_m', (r2_HCP_m, var_HCP_m, corr_HCP_m, opt_alpha_HCP_m, featimp_HCP_m, featimp_haufe_HCP_m)),
    ('HCP_f', (r2_HCP_f, var_HCP_f, corr_HCP_f, opt_alpha_HCP_f, featimp_HCP_f, featimp_haufe_HCP_f)),
    ('ABCD_m', (r2_ABCD_m, var_ABCD_m, corr_ABCD_m, opt_alpha_ABCD_m, featimp_ABCD_m, featimp_haufe_ABCD_m)),
    ('ABCD_f', (r2_ABCD_f, var_ABCD_f, corr_ABCD_f, opt_alpha_ABCD_f, featimp_ABCD_f, featimp_haufe_ABCD_f)),
]

# one file per array, named surfarea_norm_<metric>_<cohort>.npy
for cohort_label, arrays in results_by_cohort:
    for metric_label, values in zip(metric_labels, arrays):
        file_name = 'surfarea_norm_%s_%s.npy' % (metric_label, cohort_label)
        np.save(os.path.join(base_dir, file_name), values)
