In [None]:
#import relevant libraries

import os
import sys
#only need to run these if packages haven't been installed yet
#!{sys.executable} -m pip install -U cupy
#!{sys.executable} -m pip install cuda-python
#!{sys.executable} -m pip install -U statistics

import pandas as pd
import sys; sys.path
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime


import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the raw ABCD tables from disk and strip the redundant header row.

# set base directories: every input file below lives under this folder
ABCD_base_dir   = 'base_directory_path'

# Functional-connectivity (fc) values and the matching subject keys.
# Both files are read without a header row.
ABCD_fc_df = pd.read_csv(os.path.join(ABCD_base_dir, 'fc_data.csv'), header=None)
ABCD_fc_subj = pd.read_csv(os.path.join(ABCD_base_dir, 'fc_subj_data.txt'), header=None)
# Transposed copy of the fc table; the next cell attaches one subject key per row of it.
ABCD_fc = ABCD_fc_df.transpose()

# Subject (demographic) and clinical tables.
# NOTE(review): both are read from the same file name - confirm this is intended.
# The row labelled 0 is dropped from each; the original author describes it as a
# duplicate header row.
header_row = 0
ABCD_subj = pd.read_csv(os.path.join(ABCD_base_dir, 'clin_subj_data.csv')).drop(header_row)
ABCD_clin = pd.read_csv(os.path.join(ABCD_base_dir, 'clin_subj_data.csv')).drop(header_row)

In [None]:
#add subj key data to fc data and sort
# Attach each subject key to its row of fc values, then order the rows by key.
ABCD_fc_subj.columns = ['subjectkey']

# Keys are attached by position, so there must be exactly one key per fc row.
# (Passing the whole DataFrame to insert() would align on index labels instead and
# silently fill NaN / drop keys when the two tables disagree.)
if len(ABCD_fc_subj) != len(ABCD_fc):
    raise ValueError(
        'ABCD_fc_subj has %d rows but ABCD_fc has %d rows; cannot attach subject keys'
        % (len(ABCD_fc_subj), len(ABCD_fc)))

# Insert only once so the cell can be re-run: a second 'subjectkey' column would
# make the sort below fail because the column label is no longer unique.
if 'subjectkey' not in ABCD_fc.columns:
    ABCD_fc.insert(0, 'subjectkey', ABCD_fc_subj['subjectkey'].to_numpy())

ABCD_fc_sorted = ABCD_fc.sort_values(by='subjectkey', ascending=True)

In [None]:
#clean and sort clinical data
# 1) keep only rows whose subject key also appears in the fc table
mask = ABCD_clin['subjectkey'].isin(ABCD_fc_sorted['subjectkey'])
ABCD_clin_subjs = ABCD_clin.loc[mask]

# 2) keep only the baseline visit
ABCD_clin_baseline = ABCD_clin_subjs.loc[
    ABCD_clin_subjs['eventname'] == 'baseline_year_1_arm_1'
]

# 3) order by subject key (ascending), the same ordering used for the fc table
ABCD_clin_sorted = ABCD_clin_baseline.sort_values('subjectkey')

In [None]:
#clean and sort subject data
# 1) keep only rows whose subject key also appears in the fc table
mask = ABCD_subj['subjectkey'].isin(ABCD_fc_sorted['subjectkey'])
ABCD_subj_incl = ABCD_subj.loc[mask]

# 2) keep only the baseline visit
ABCD_subj_baseline = ABCD_subj_incl.loc[
    ABCD_subj_incl['eventname'] == 'baseline_year_1_arm_1'
]

# 3) order by subject key (ascending).
# Both names are bound to the SAME DataFrame object (no copy is made), so an
# in-place change made through one name is visible through the other.
ABCD_subj_data = ABCD_subj_sorted = ABCD_subj_baseline.sort_values('subjectkey')

In [None]:
#isolate and clean clinical variable to be predicted
# Source column in the clinical table -> short label used from here on.
# The label spellings are kept exactly as they are because later cells build
# result file names from ABCD_clin_labels.
cbcl_to_label = {
    'cbcl_scr_syn_anxdep_r': 'AnxDep',
    'cbcl_scr_syn_withdep_r': 'WithDep',
    'cbcl_scr_syn_somatic_r': 'Somatic',
    'cbcl_scr_syn_social_r': 'Social',
    'cbcl_scr_syn_thought_r': 'Thought',
    'cbcl_scr_syn_attention_r': 'Attention',
    'cbcl_scr_syn_rulebreak_r': 'RuleBreak',
    'cbcl_scr_syn_aggressive_r': 'Aggresive',
    'cbcl_scr_syn_internal_r': 'Internal',
    'cbcl_scr_syn_external_r': 'External',
    'cbcl_scr_syn_totprob_r': 'TotProb',
    'cbcl_scr_dsm5_depress_r': 'Depress',
    'cbcl_scr_dsm5_anxdisord_r': 'AnxDiscord',
    'cbcl_scr_dsm5_somaticpr_r': 'SomaticPr',
    'cbcl_scr_dsm5_adhd_r': 'ADHD',
    'cbcl_scr_dsm5_opposit_r': 'Opposit',
    'cbcl_scr_dsm5_conduct_r': 'Conduct',
    'cbcl_scr_07_sct_r': 'Sluggish',
    'cbcl_scr_07_ocd_r': 'OCD',
    'cbcl_scr_07_stress_r': 'Stress',
}
ABCD_clin_labels = list(cbcl_to_label.values())

# Select the columns above, rename them to the short labels, and replace the
# row index with a fresh 0..n-1 index.
ABCD_clin_data = (
    ABCD_clin_sorted[list(cbcl_to_label)]
    .rename(columns=cbcl_to_label)
    .reset_index()
    .drop(columns=['index'])
)

#clean fc data
# Same treatment for the fc table: drop the key column, renumber the rows.
ABCD_fc_data = (
    ABCD_fc_sorted
    .drop(columns=['subjectkey'])
    .reset_index()
    .drop(columns=['index'])
)

#clean subj data
# Deliberately in place: when this cell first runs, ABCD_subj_data is the same
# object as ABCD_subj_sorted, so this also renumbers ABCD_subj_sorted (and leaves
# an 'index' column in it). The sex-split cell builds its masks from
# ABCD_subj_sorted and applies them to the renumbered frames above, so it relies
# on this side effect.
ABCD_subj_data.reset_index(inplace=True)
ABCD_subj_data = ABCD_subj_data.drop('index', axis=1)

In [None]:
#get sex-specific variables
# The subject, clinical and fc tables are matched purely by row position, so first
# confirm that they really hold the same subjects in the same order. Without this,
# a table with missing or extra subjects would be split with the wrong rows.
_fc_keys = ABCD_fc_sorted['subjectkey'].to_numpy()
_subj_keys = ABCD_subj_sorted['subjectkey'].to_numpy()
_clin_keys = ABCD_clin_sorted['subjectkey'].to_numpy()
if not (np.array_equal(_fc_keys, _subj_keys) and np.array_equal(_fc_keys, _clin_keys)):
    raise ValueError(
        'subject keys differ between tables (fc: %d rows, subj: %d rows, clin: %d rows); '
        'the fc, subject and clinical data must list the same subjects in the same order'
        % (len(_fc_keys), len(_subj_keys), len(_clin_keys)))

# Boolean masks are taken from the subject table and applied to the other two
# tables by position (.to_numpy()), so the split does not depend on the three
# frames happening to carry identical index labels.
mask_m = ABCD_subj_sorted.sex=='M'
ABCD_subj_m = ABCD_subj_sorted[mask_m]
ABCD_clin_m = ABCD_clin_data[mask_m.to_numpy()]
ABCD_fc_m = ABCD_fc_data[mask_m.to_numpy()]

mask_f = ABCD_subj_sorted.sex=='F'
ABCD_subj_f = ABCD_subj_sorted[mask_f]
ABCD_clin_f = ABCD_clin_data[mask_f.to_numpy()]
ABCD_fc_f = ABCD_fc_data[mask_f.to_numpy()]

In [None]:
# Settings shared by the Haufe-transform cells below.

#number of repetitions you want to perform
rep = 100

# directory that holds the saved train indices / feature importances and
# receives the transformed weights
results_dir   = 'results_directory_path'

#set x data to be the input variable you want to use (here: the fc data per sex)
X_m, X_f = ABCD_fc_m, ABCD_fc_f

# targets: the clinical scores per sex
Y_m, Y_f = ABCD_clin_m, ABCD_clin_f

#number of variables (columns of Y) and number of features (columns of X)
n_beh = len(Y_f.columns)
n_feat = len(X_f.columns)

#test within sex only here
#n_test = 1

In [None]:
# Output buffer for the Haufe-inverted feature weights, indexed as
# [behaviour, repetition, feature]; every entry starts at zero.
#
# Iterate through one sex at a time to prevent crashes: per the original author
# this requires ~100GB RAM on a single core to run without crashing.
# For a male run allocate the male buffer instead:
#featimp_haufe_m = np.zeros((n_beh, rep, n_feat))
featimp_haufe_f = np.zeros((n_beh, rep, n_feat))

In [None]:
#iterate through number of models
# For every repetition p: load that split's training indices, compute the feature
# covariance of the training data, then for each behaviour turn the stored model
# weights w into Haufe-transformed weights  cov(X_train) @ w / var(y_train).
#
# Only the female data is processed here. The male run is the same code with each
# `_f` name replaced by its `_m` counterpart (and featimp_haufe_m allocated).
for p in range(rep):
    #print model # you're on
    print('Model %d' %(p+1))

    # Rebinding drops the reference to the previous repetition's covariance matrix
    # before the next one is built (presumably to limit peak memory).
    cov_x_f=[]

    #print time
    print("Current Time =", datetime.now().strftime("%H:%M:%S"))

    #load train indices
    # str() instead of np.str: the np.str alias was removed in NumPy 1.24
    train_inds_f = np.load(os.path.join(results_dir, 'fc_traininds_f_' + str(p) + '.npy'))

    #set x values based on indices from split
    x_train_f = X_f.iloc[train_inds_f].values

    #set y values based on indices from split
    # Converted to float64 once per repetition rather than once per behaviour.
    beh_train_f = np.asarray(Y_f.iloc[train_inds_f].values, dtype=np.float64)

    #compute Haufe-inverted feature weights

    #calculate covariance of x train data (features x features)
    # NOTE(review): cov(X) @ w equals Xc.T @ (Xc @ w) / (n - 1), with Xc the
    # column-centred training data, which would avoid materialising this
    # n_feat x n_feat matrix. Left unchanged to keep results bit-identical.
    cov_x_f = np.cov(x_train_f.T)

    #printing this just to keep track of it as it's running
    print("Covariance Calculation Completed")

    #iterate through the different behaviors
    for beh in range(n_beh):
        # label taken from the female targets, matching the data being processed
        print ("Behaviour: %s" % Y_f.columns[beh])

        y_train_f = beh_train_f[:,beh]

        #load in feat imp data from when models were trained
        # row p of this array is used as repetition p's weight vector
        featimp_f = np.load(os.path.join(results_dir, 'fc_featimp_f_' + ABCD_clin_labels[beh] + '.npy'))

        #calculate covariance of y train data and then complete the matrix multiplication to
        #compute the haufe-transformed feature weights
        # np.cov of a 1-D vector is its sample variance; a constant y_train_f makes
        # this 0 and the result inf/nan (warnings are silenced at the top of the file).
        cov_y_f = np.cov(y_train_f)
        featimp_haufe_f[beh,p,:] = np.matmul(cov_x_f,featimp_f[p,:])*(1/cov_y_f)

        #save results
        # Rewritten after every repetition, so the file always holds the rows
        # finished so far; rows of later repetitions are still zero.
        np.save(os.path.join(results_dir, 'fc_featimp_haufe_f_' + ABCD_clin_labels[beh] + '.npy'),
                featimp_haufe_f[beh,:,:])
    

In [None]:
# Display the Haufe-transformed weights array as the cell output
# (allocated with shape [n_beh, rep, n_feat]).
featimp_haufe_f