In [None]:
#import relevant libraries

import os
import sys
#only need to run these if packages haven't been installed yet
#!{sys.executable} -m pip install numpy
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install scikit-learn
#!{sys.executable} -m pip install matplotlib
#(datetime is part of the Python standard library and does not need to be installed)
#!{sys.executable} -m pip install seaborn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.metrics import explained_variance_score, r2_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, GroupKFold, GroupShuffleSplit

import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides library deprecation warnings - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Read the ABCD input tables: functional connectivity (fc), subject keys,
# subject (demographic) data and clinical data.

# set base directory (placeholder - point this at the folder holding the input files)
ABCD_base_dir   = 'base_directory_path'

# row label of the duplicated header row that is dropped from the subject/clinical tables
header_row = 0

# load subj fc data; both files are read without a header row
ABCD_fc_df = pd.read_csv(os.path.join(ABCD_base_dir, 'fc_data.csv'), header=None)
ABCD_fc_subj = pd.read_csv(os.path.join(ABCD_base_dir, 'fc_subj_data.txt'), header=None)
# transposed copy of the fc table
# (presumably one row per subject afterwards, since subject keys are attached row-wise later - TODO confirm file layout)
ABCD_fc = ABCD_fc_df.transpose()

# load subj demo and clinical data, dropping the duplicate header row straight away
# NOTE(review): both frames are read from the same 'clin_subj_data.csv' - confirm this is intended
ABCD_subj = pd.read_csv(os.path.join(ABCD_base_dir, 'clin_subj_data.csv')).drop(header_row)
ABCD_clin = pd.read_csv(os.path.join(ABCD_base_dir, 'clin_subj_data.csv')).drop(header_row)

In [None]:
#add subj key data to fc data and sort
ABCD_fc_subj.columns = ['subjectkey']

# One subject key per fc row is required: insert() aligns the keys on the index, so a
# length mismatch would leave missing keys instead of raising an error.
assert len(ABCD_fc_subj) == len(ABCD_fc), (
    "number of subject keys (%d) does not match number of fc rows (%d)"
    % (len(ABCD_fc_subj), len(ABCD_fc)))

# ABCD_fc is modified in place. The insert is skipped when the key column is already
# present so that this cell can be re-run: inserting with allow_duplicates=True a second
# time would create two 'subjectkey' columns, which sort_values(by='subjectkey') rejects.
if 'subjectkey' not in ABCD_fc.columns:
    ABCD_fc.insert(0, 'subjectkey', ABCD_fc_subj['subjectkey'])

# fc rows ordered by subject key; the other tables are sorted the same way further down
ABCD_fc_sorted = ABCD_fc.sort_values(by='subjectkey', ascending=True)


In [None]:
#clean and sort clinical data:
#keep rows of subjects that have fc data, then only the baseline event, ordered by subject key
mask = ABCD_clin['subjectkey'].isin(ABCD_fc_sorted['subjectkey'])
ABCD_clin_subjs = ABCD_clin.loc[mask]
ABCD_clin_baseline = ABCD_clin_subjs.loc[ABCD_clin_subjs['eventname'].eq('baseline_year_1_arm_1')]
ABCD_clin_sorted = ABCD_clin_baseline.sort_values('subjectkey')

In [None]:
#clean and sort subject (demographic) data:
#keep rows of subjects that have fc data, then only the baseline event, ordered by subject key
mask = ABCD_subj['subjectkey'].isin(ABCD_fc_sorted['subjectkey'])
ABCD_subj_incl = ABCD_subj.loc[mask]
ABCD_subj_baseline = ABCD_subj_incl.loc[ABCD_subj_incl['eventname'].eq('baseline_year_1_arm_1')]
ABCD_subj_sorted = ABCD_subj_baseline.sort_values('subjectkey')
#second name for the same object (no copy is made)
ABCD_subj_data = ABCD_subj_sorted

In [None]:
#isolate and clean clinical variable to be predicted
#select the raw CBCL score columns and renumber the rows 0..n-1
ABCD_clin_data = ABCD_clin_sorted.loc[:, [
    'cbcl_scr_syn_anxdep_r', 'cbcl_scr_syn_withdep_r',
    'cbcl_scr_syn_somatic_r', 'cbcl_scr_syn_social_r',
    'cbcl_scr_syn_thought_r', 'cbcl_scr_syn_attention_r',
    'cbcl_scr_syn_rulebreak_r', 'cbcl_scr_syn_aggressive_r',
    'cbcl_scr_syn_internal_r',  'cbcl_scr_syn_external_r',
    'cbcl_scr_syn_totprob_r', 'cbcl_scr_dsm5_depress_r',
    'cbcl_scr_dsm5_anxdisord_r', 'cbcl_scr_dsm5_somaticpr_r',
    'cbcl_scr_dsm5_adhd_r', 'cbcl_scr_dsm5_opposit_r',
    'cbcl_scr_dsm5_conduct_r', 'cbcl_scr_07_sct_r',
    'cbcl_scr_07_ocd_r', 'cbcl_scr_07_stress_r',
]].reset_index(drop=True)

#short labels, in the same order as the columns selected above
#NOTE: these exact spellings are used in the result file names, so they are left unchanged
ABCD_clin_labels = ['AnxDep', 'WithDep', 'Somatic', 'Social', 'Thought', 'Attention',
                    'RuleBreak', 'Aggresive', 'Internal', 'External', 'TotProb', 'Depress',
                    'AnxDiscord', 'SomaticPr', 'ADHD', 'Opposit', 'Conduct', 'Sluggish', 
                    'OCD', 'Stress']
ABCD_clin_data.columns = ABCD_clin_labels

#clean fc data: remove the key column and renumber the rows 0..n-1
ABCD_fc_data = ABCD_fc_sorted.drop(columns=['subjectkey']).reset_index(drop=True)

#clean subj data
#The reset is kept in place: ABCD_subj_data is the same object as ABCD_subj_sorted
#(assigned without a copy in an earlier cell), so ABCD_subj_sorted is renumbered 0..n-1
#as well and gains an 'index' column. The sex-specific cell reads ABCD_subj_sorted.
ABCD_subj_data.reset_index(inplace=True)
ABCD_subj_data = ABCD_subj_data.drop(columns=['index'])



In [None]:
#get sex-specific variables
#
# The subject, clinical and fc tables are matched row-by-row by position (each one was
# sorted by subjectkey), so they must have the same number of rows.
assert len(ABCD_subj_sorted) == len(ABCD_clin_data) == len(ABCD_fc_data), \
    "subject, clinical and fc tables have different numbers of rows"

# The masks are applied by position (.values) rather than by index label. Label alignment
# only works when ABCD_subj_sorted carries the same 0..n-1 index as the cleaned tables,
# which depends on an in-place reset done through another name in a different cell.
# Rows whose sex is neither 'M' nor 'F' end up in neither group.
mask_m = ABCD_subj_sorted.sex=='M'
ABCD_subj_m = ABCD_subj_sorted[mask_m.values]
ABCD_clin_m = ABCD_clin_data[mask_m.values]
ABCD_fc_m = ABCD_fc_data[mask_m.values]

mask_f = ABCD_subj_sorted.sex=='F'
ABCD_subj_f = ABCD_subj_sorted[mask_f.values]
ABCD_clin_f = ABCD_clin_data[mask_f.values]
ABCD_fc_f = ABCD_fc_data[mask_f.values]

In [None]:
# Analysis settings

#number of repetitions you want to perform (one train/test split and one model pair each)
rep = 100
#number of folds you want in the cross-validation
k = 3
#proportion of data you want in your training set and test set
train_size = .66
test_size = 1-train_size

#regression model type
#NOTE(review): the `normalize` argument of Ridge was deprecated in scikit-learn 1.0 and
#removed in 1.2, so this call needs an older scikit-learn release - confirm the environment
regr = Ridge(solver='lsqr', max_iter=1000000, normalize=True)

#set x data to be the input variable you want to use (fc features, per sex)
X_m, X_f = ABCD_fc_m, ABCD_fc_f

#targets: the clinical variables, per sex
Y_m, Y_f = ABCD_clin_m, ABCD_clin_f

#number of variables 
#iterating through all of the clinical variables
n_beh = len(Y_f.columns)

#number of features 
n_feat = len(X_f.columns)

#test within sex only here
#NOTE(review): n_test is not referenced by the cells shown here - confirm it is still needed
n_test = 1

In [None]:
#create arrays to store variables
#
#Every score array is zero-filled with one row per repetition and one column per clinical
#variable. Suffixes read <sex the model was trained on><sex it is tested on>,
#e.g. mf = male model scored on female test data.

#r^2 - coefficient of determination
r2_mm, r2_mf, r2_fm, r2_ff = (np.zeros((rep, n_beh)) for _ in range(4))

#explained variance
var_mm, var_mf, var_fm, var_ff = (np.zeros((rep, n_beh)) for _ in range(4))

#correlation between true and predicted (aka prediction accuracy)
corr_mm, corr_mf, corr_fm, corr_ff = (np.zeros((rep, n_beh)) for _ in range(4))

#feature importance extracted from the model: one row per repetition, one column per feature
featimp_m, featimp_f = (np.zeros((rep, n_feat)) for _ in range(2))
#for when the feat weights get haufe-inverted
#featimp_haufe_m = np.zeros([rep,n_feat])
#featimp_haufe_f = np.zeros([rep,n_feat])

In [None]:
#directory holding the tuned alphas (placeholder - set before running)
results_dir   = 'results_directory_path'

#iterate through train behaviors 1 at a time:
#beh_train is the position in ABCD_clin_labels of the clinical variable the models are
#trained on; change it by hand to train on another variable
beh_train = 0

clin_var = ABCD_clin_labels[beh_train]
print(clin_var)

#load in optimized alpha from when models were trained
#(the model loop indexes these arrays with the repetition number, so presumably there is
# one alpha per repetition - TODO confirm against the script that wrote them)
opt_alpha_m = np.load('%s/fc_alpha_m_%s.npy' % (results_dir, clin_var))
opt_alpha_f = np.load('%s/fc_alpha_f_%s.npy' % (results_dir, clin_var))


In [None]:
#iterate through number of models
#
# Each repetition p:
#   1. splits the male and the female sample into train/test, grouped by site
#   2. fits one ridge model per sex on clinical variable number `beh_train`
#   3. scores both models on every clinical variable, in both sexes' test sets
#   4. rewrites the result files
# Array suffixes read <sex the model was trained on><sex it is tested on>.

# output directory; it does not change between repetitions, so it is set once here
results_dir   = 'results_directory_path'

# score arrays of each model/test pairing, in (r2, explained variance, correlation) order
score_store = {'mm': (r2_mm, var_mm, corr_mm),
               'mf': (r2_mf, var_mf, corr_mf),
               'fm': (r2_fm, var_fm, corr_fm),
               'ff': (r2_ff, var_ff, corr_ff)}

for p in range(rep):
    #print model # you're on
    print('Model %d' %(p+1))

    #print time
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)

    #split data into train/test, grouped by site; the repetition number is the random seed
    train_inds_m, test_inds_m = next(GroupShuffleSplit(test_size=1-train_size, n_splits=1, random_state = p).split(X_m, groups=ABCD_subj_m['site_id_l']))
    train_inds_f, test_inds_f = next(GroupShuffleSplit(test_size=1-train_size, n_splits=1, random_state = p).split(X_f, groups=ABCD_subj_f['site_id_l']))

    #set x values based on indices from split
    x_train_m = X_m.iloc[train_inds_m].values
    x_test_m = X_m.iloc[test_inds_m].values
    x_train_f = X_f.iloc[train_inds_f].values
    x_test_f = X_f.iloc[test_inds_f].values

    #set y values (all clinical variables) based on indices from split
    beh_train_m = Y_m.iloc[train_inds_m].values
    beh_test_m = Y_m.iloc[test_inds_m].values
    beh_train_f = Y_f.iloc[train_inds_f].values
    beh_test_f = Y_f.iloc[test_inds_f].values

    #convert y values to double; only the training behaviour is used for fitting
    y_train_m = np.double(beh_train_m)[:, beh_train]
    y_train_f = np.double(beh_train_f)[:, beh_train]
    #test targets are converted once per repetition instead of once per tested behaviour
    y_all_test_m = np.double(beh_test_m)
    y_all_test_f = np.double(beh_test_f)

    #fit model to train data, using the alpha stored for this repetition
    #NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2 -
    #confirm the environment; the stored alphas were presumably tuned with this setting
    model_m = Ridge(alpha = opt_alpha_m[p], normalize=True, max_iter=1000000, solver='lsqr')
    model_m.fit(x_train_m, y_train_m)

    model_f = Ridge(alpha = opt_alpha_f[p], normalize=True, max_iter=1000000, solver='lsqr')
    model_f.fit(x_train_f, y_train_f)

    #extract feature importance (does not depend on the behaviour being tested)
    featimp_m[p,:] = model_m.coef_
    featimp_f[p,:] = model_f.coef_
    #compute Haufe-inverted feature weights
    #cov_x_m = np.cov(np.transpose(x_train_m))
    #cov_y_m = np.cov(y_train_m)
    #featimp_haufe_m[p,:] = np.matmul(cov_x_m,featimp_m[p,:])*(1/cov_y_m)

    #cov_x_f = np.cov(np.transpose(x_train_f))
    #cov_y_f = np.cov(y_train_f)
    #featimp_haufe_f[p,:] = np.matmul(cov_x_f,featimp_f[p,:])*(1/cov_y_f)

    #generate predictions within and between sexes
    #(the models and test features are fixed within a repetition, so predict once here)
    preds_mm = model_m.predict(x_test_m).ravel()
    preds_mf = model_m.predict(x_test_f).ravel()
    preds_fm = model_f.predict(x_test_m).ravel()
    preds_ff = model_f.predict(x_test_f).ravel()

    #iterate through all of the clinical variables to see if models generalize across sexes and across behaviors
    for beh_test in range(n_beh):
        print ("Testing Behaviour: %s" % Y_m.columns[beh_test])

        y_test_m = y_all_test_m[:,beh_test]
        y_test_f = y_all_test_f[:,beh_test]

        for tag, y_true, y_pred in (('mm', y_test_m, preds_mm),
                                    ('mf', y_test_f, preds_mf),
                                    ('fm', y_test_m, preds_fm),
                                    ('ff', y_test_f, preds_ff)):
            r2_out, var_out, corr_out = score_store[tag]
            #r^2 from the cached predictions; same quantity a regressor's score() reports
            r2_out[p,beh_test] = r2_score(y_true, y_pred)
            #compute explained variance
            var_out[p,beh_test] = explained_variance_score(y_true, y_pred)
            #compute correlation between true and predicted (prediction accuracy)
            corr_out[p,beh_test] = np.corrcoef(y_true, y_pred)[1,0]

    # save results
    # (done inside the repetition loop: the files are rewritten after every model and
    #  hold the repetitions finished so far, with the remaining rows still zero)
    np.save((results_dir + '/fc_featimp_m_' + clin_var + '.npy'),featimp_m)
    np.save((results_dir + '/fc_featimp_f_' + clin_var + '.npy'),featimp_f)

    for tag, (r2_out, var_out, corr_out) in score_store.items():
        np.save((results_dir + '/fc_crossbehav_r2_' + tag + '_' + clin_var + '.npy'),r2_out)
        np.save((results_dir + '/fc_crossbehav_var_' + tag + '_' + clin_var + '.npy'),var_out)
        np.save((results_dir + '/fc_crossbehav_corr_' + tag + '_' + clin_var + '.npy'),corr_out)

    
    
    
        
        