In [1]:
import pandas as pd
import numpy as np
import os
import random
from scipy.stats import pearsonr,spearmanr

In [2]:
# Load the regional brain measures and the functional connectivity table.

# Main phenotype table; the regional CSA and CT columns are both read from it.
UKB_MAIN_CSV = '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv'

# CSA: the first column of Area_items.csv lists the field ids; '-2.0' is
# appended to each id to form the column name requested from the main table.
area_items = pd.read_csv('raw_data/Area_items.csv')
s1 = ['eid'] + [str(field_id) + '-2.0' for field_id in area_items.iloc[:, 0]]
# Keep only subjects with a value in every requested CSA column.
data1 = pd.read_csv(UKB_MAIN_CSV, usecols=s1).dropna(axis=0, how='any')

# CT: same procedure, driven by Thickness_items.csv.
thickness_items = pd.read_csv('raw_data/Thickness_items.csv')
s2 = ['eid'] + [str(field_id) + '-2.0' for field_id in thickness_items.iloc[:, 0]]
# Keep only subjects with a value in every requested CT column.
data2 = pd.read_csv(UKB_MAIN_CSV, usecols=s2).dropna(axis=0, how='any')

# Functional connectivity between 21 networks.
temp_FC_data = pd.read_csv('raw_data/ukbb_FC2.csv')

In [5]:
# load global brain measures (rsfMRI head motion, total gray matter volume, ICV,
# total CSA, average CT, scanning sites)
global_brain_cols = ['eid','25741-2.0','25005-2.0','26521-2.0','26721-2.0','26822-2.0','26755-2.0','26856-2.0','54-2.0']
cov1 = pd.read_csv('/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',usecols=global_brain_cols)
cov1.dropna(how='any',inplace=True)

# Dummy-code the three scanning sites from column '54-2.0'.
# Each indicator is computed for the whole column at once, so every row always
# receives a value: a row whose site code is none of the three listed below gets
# 0 in all three indicators instead of making the column assignment fail with a
# length mismatch.
for site_col, site_code in (('site1', 11025), ('site2', 11026), ('site3', 11027)):
    cov1[site_col] = (cov1['54-2.0'] == site_code).astype(int)

In [6]:
# Select the target trait. Available options:
#   'Age', 'BMI' (body mass index), 'IQ' (fluid intelligence), 'NM' (numeric memory),
#   'alcohol' (alcohol consumption), 'NS' (neuroticism), 'BM' (birth month)
trait = 'NS'
pheno_path = 'raw_data/ukbb_phenos/'
pheno = pd.read_csv(os.path.join(pheno_path, trait + '.csv'))
# The second column of the phenotype file holds the trait values.
pheno_colname = pheno.columns.values[1]
# Keep only the rows whose trait value is non-negative.
pheno = pheno.loc[pheno[pheno_colname] >= 0]

# Age/sex covariates: when the trait is 'Age' only sex is loaded; otherwise
# age and sex are joined on the subject id.
if trait != 'Age':
    Age = pd.read_csv('raw_data/ukbb_phenos/Age.csv').set_index('eid')
    Sex = pd.read_csv('raw_data/Sex.csv').set_index('eid')
    cov2 = pd.concat([Age, Sex], axis=1, join="inner").reset_index()
else:
    cov2 = pd.read_csv('raw_data/Sex.csv')

In [7]:
# overlap datasets by subject ids
data11 = data1.set_index('eid')
data22 = data2.set_index('eid')
temp_FC_data2 = temp_FC_data.set_index('eid')
cov11 = cov1.set_index('eid')
cov22 = cov2.set_index('eid')
pheno2 = pheno.set_index('eid')

# Subjects present in every table. The ids are sorted so that the row order of
# all aligned tables (and of anything saved from them) is explicit rather than
# whatever order iterating over a set happens to produce.
l = sorted(set(data11.index) & set(data22.index) & set(temp_FC_data2.index) & set(cov11.index) & \
           set(cov22.index) & set(pheno2.index))

# Select the same subjects, in the same order, from every table.
CSA_data = data11.loc[l]
CT_data = data22.loc[l]
FC_data = temp_FC_data2.loc[l]
global_data = cov11.loc[l]
age_and_sex = cov22.loc[l]
Y = pheno2.loc[l]

In [8]:
# Write the ids of the retained subjects (one per line, no header) to a
# trait-specific text file.
subj_file = 'raw_data/ukbb_subjs/' + trait + '_subjs.txt'
subj_list = pd.DataFrame(Y.index)
subj_list.to_csv(subj_file, header=None, index=False)

In [9]:
# Give the global brain measures readable column names, indexed by subject id.
# TCSA and ACT are each the sum of two source columns.
global_data2 = pd.DataFrame(
    {
        'motion': global_data['25741-2.0'].values,
        'TCSA': global_data['26721-2.0'].values + global_data['26822-2.0'].values,
        'ACT': global_data['26755-2.0'].values + global_data['26856-2.0'].values,
        'site1': global_data['site1'].values,
        'site2': global_data['site2'].values,
        'site3': global_data['site3'].values,
    },
    index=pd.Index(global_data.index.values, name='eid'),
)

In [10]:
# build a function to get the covariates
# info1 is the matrix including global brain measures
# info2 is the matrix including age and sex
def get_covariates(info1,info2,cotype):
    """Assemble the covariate table for one data type.

    Parameters
    ----------
    info1 : pandas.DataFrame
        Global brain measures; must contain the columns selected for `cotype`.
    info2 : pandas.DataFrame
        Additional covariates (age and sex), indexed like `info1`.
    cotype : str
        One of 'CSA', 'CT', 'FC' or 'pheno'; selects which columns of `info1`
        are used.

    Returns
    -------
    pandas.DataFrame
        The selected `info1` columns followed by all `info2` columns, joined on
        the index (inner join).

    Raises
    ------
    ValueError
        If `cotype` is not one of the supported types.
    """
    items_by_type = {
        'CSA': ['TCSA','site1','site2','site3'],
        'CT': ['ACT','site1','site2','site3'],
        'FC': ['motion','site1','site2','site3'],
        'pheno': ['TCSA','ACT','motion','site1','site2','site3'],
    }
    # Fail with a clear message instead of hitting an unbound local variable.
    if cotype not in items_by_type:
        raise ValueError(
            "unknown cotype %r; expected one of %s" % (cotype, sorted(items_by_type))
        )

    info11 = info1[items_by_type[cotype]]
    info = pd.concat([info11,info2], axis=1,join="inner")
    return info

In [11]:
# regressing out the effects of covariates
from sklearn.preprocessing import StandardScaler
def regression_covariant(covariant_matrix, y, standard_scale=True):
    """Remove the linear contribution of the covariates from `y`.

    A least-squares model of `y` on the covariates plus a constant column is
    fitted. Only the covariate part of the fit is subtracted from `y`; the
    fitted constant term is not subtracted.

    Parameters
    ----------
    covariant_matrix : 2-D array (one row per sample, one column per covariate)
    y : array with one value per sample
    standard_scale : bool
        When True the residual is passed through `StandardScaler` before being
        returned.

    Returns
    -------
    (residual, w) : the residual as float64, and the fitted coefficients
        (covariate coefficients first, constant term last).
    """
    n_rows = covariant_matrix.shape[0]
    intercept_col = np.ones((n_rows, 1))
    design = np.hstack((covariant_matrix, intercept_col))
    w, *_ = np.linalg.lstsq(design, y, rcond=None)

    # w[-1] belongs to the constant column and is deliberately left out here.
    covariate_fit = covariant_matrix.dot(w[:-1])
    residual = (y - covariate_fit).astype('float64')

    if not standard_scale:
        return residual, w

    scaled = StandardScaler().fit_transform(residual.reshape(-1,1)).flatten()
    return scaled, w

# data is the neuroimaging measures or phenotypes
# co is the covariates
def regress_data(data,co):
    """Regress the covariates out of every column of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Neuroimaging measures or phenotypes, one column per measure.
    co : pandas.DataFrame
        Covariates, one row per row of `data`.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `data`, holding the unscaled residual of
        each column.
    """
    covariates = co.values
    n_columns = data.shape[1]
    residuals = np.zeros(data.shape)
    for col in range(n_columns):
        column_values = data.iloc[:, col].values
        # The fitted coefficients are not needed here, only the residual.
        residuals[:, col], _ = regression_covariant(
            covariates, column_values, standard_scale=False
        )
    return residuals

In [12]:
# Build the covariate table for each data type, then remove the covariate
# effects from the brain measures and from the trait.
CSA_co = get_covariates(global_data2, age_and_sex, 'CSA')
CT_co = get_covariates(global_data2, age_and_sex, 'CT')
FC_co = get_covariates(global_data2, age_and_sex, 'FC')
pheno_co = get_covariates(global_data2, age_and_sex, 'pheno')

reg_CSA_data = regress_data(CSA_data, CSA_co)
reg_CT_data = regress_data(CT_data, CT_co)
reg_FC_data = regress_data(FC_data, FC_co)

# For trait 'BM' the raw values are used as they are; no covariates are regressed out.
reg_var_data = Y.values if trait == 'BM' else regress_data(Y, pheno_co)

## calculate_correlations_in_full_sample: The main function to calculate correlations between brain measures and a variable in the full sample
### input parameters: img_data, variable, mytype, output_path
**img_data:** Neuroimaging measures (size = *n* x *d*, where *n* is the sample size and *d* is the number of brain measures)

**variable:** The variable matrix (size = *n* x 1)

**mytype:** The imaging type: "CSA", "CT", or "FC"

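**output_path:** The folder where the full-sample correlation results are saved. It is prepended directly to the file name, so it must end with a path separator (e.g. `correlations_full_sample/`)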

In [13]:
def calculate_correlations_in_full_sample(img_data,variable,mytype,output_path):
    """Correlate every brain measure with the variable and save the result as CSV.

    Parameters
    ----------
    img_data : numpy.ndarray, shape (n, d)
        Brain measures, one column per measure.
    variable : numpy.ndarray, shape (n, 1)
        The variable; only its first column is used.
    mytype : str
        Imaging type. For 'FC' the rows are labelled 1..d in a column 'FCs';
        for any other value they are labelled with the last column of
        'raw_data/Area_items.csv' in a column 'regions'.
    output_path : str
        Prefix of the output file; it is concatenated directly with the file
        name, so a folder must end with a path separator.

    Writes `output_path + mytype + '_' + trait + '.csv'` with the columns
    'r', 'p' (Spearman correlation and p-value) plus the label column. `trait`
    is read from the notebook's global namespace.
    """
    s = img_data.shape
    corr_data = np.zeros((s[1],2))
    y = variable[:,0]
    for i in range(s[1]):
        x = img_data[:,i]
        r,p = spearmanr(x,y)
        corr_data[i,0] = r
        corr_data[i,1] = p

    # A flat list of names gives ordinary columns; the nested list used before
    # silently created a one-level MultiIndex.
    d = pd.DataFrame(data=corr_data,columns=['r','p'])
    if mytype == 'FC':
        # Label by the actual number of columns rather than a fixed count.
        d['FCs'] = np.arange(s[1]) + 1
    else:
        # The region names are only needed (and only read) for regional measures.
        regions = pd.read_csv('raw_data/Area_items.csv').iloc[:,-1].values.tolist()
        d['regions'] = regions

    file_name = output_path+mytype+'_'+trait+'.csv'
    d.to_csv(file_name,index=False)

In [14]:
# Full-sample correlation analysis, one output file per imaging type.
full_sample_dir = 'correlations_full_sample/'
for modality, reg_img_data in (('CSA', reg_CSA_data), ('CT', reg_CT_data), ('FC', reg_FC_data)):
    calculate_correlations_in_full_sample(reg_img_data, reg_var_data, modality, full_sample_dir)

## calculate_correlations: The main function to calculate correlations between brain measures and a variable using bootstrapping methods
### input parameters: img_data, variable
**img_data:** Neuroimaging measures (size = *n* x *d*, where *n* is the sample size and *d* is the number of brain measures)

**variable:** The variable matrix (size = *n* x 1)

#### Subsample sizes: 100, 200, 300, 400, ..., up to n/2
#### Number of random selections per subsample size: 100

In [15]:
def calculate_correlations(img_data, variable, random_num=100, seed=None):
    """Spearman correlations in pairs of non-overlapping random subsamples.

    For each subsample size 100, 200, ..., up to half of the sample, two
    disjoint random subsamples of that size are drawn `random_num` times, and
    every brain measure is correlated with the variable in each of them.

    Parameters
    ----------
    img_data : numpy.ndarray, shape (n, d)
        Brain measures, one column per measure.
    variable : numpy.ndarray, shape (n, 1)
        The variable; only its first column is used.
    random_num : int, optional
        Number of random draws per subsample size (default 100).
    seed : int or None, optional
        When given, the draws come from a private `random.Random(seed)` so the
        result is reproducible. When None (default) the module-level `random`
        generator is used.

    Returns
    -------
    (random_img1, random_img2) : two arrays of shape
        (number of subsample sizes, random_num, d, 2) for the first and second
        subsample; the last axis holds (r, p).
    """
    s = img_data.shape
    # the subsample sizes:100,200,300,400,...s[0]/2
    subsampling_times = int(s[0]/200)

    # `random` (module) and `random.Random` both provide `sample`.
    rng = random if seed is None else random.Random(seed)

    # create empty arrays
    random_img1 = np.zeros((subsampling_times,random_num,s[1],2))
    random_img2 = np.zeros((subsampling_times,random_num,s[1],2))

    # The full index list and its set never change, so build them once.
    total_list = np.arange(s[0]).tolist()
    total_set = set(total_list)

    for i in range(subsampling_times):
        print(i)
        #subsample size
        random_sample = 100*(i+1)

        for j in range(random_num):
            # random select two non-overlapping subsamples
            random_inds1 = rng.sample(total_list,random_sample)
            rest_total_list = list(total_set - set(random_inds1))
            random_inds2 = rng.sample(rest_total_list,random_sample)
            X1 = img_data[random_inds1,:]
            X2 = img_data[random_inds2,:]
            m1 = variable[random_inds1,0]
            m2 = variable[random_inds2,0]

            # Spearman's correlation analysis
            for k in range(s[1]):
                r1,p1 = spearmanr(X1[:,k],m1)
                r2,p2 = spearmanr(X2[:,k],m2)
                random_img1[i,j,k,0] = r1
                random_img1[i,j,k,1] = p1
                random_img2[i,j,k,0] = r2
                random_img2[i,j,k,1] = p2

    return random_img1,random_img2

In [1]:
# Subsample-based correlation analysis for each imaging type (run in the order CT, FC, CSA).
random_data_CT1, random_data_CT2 = calculate_correlations(
    img_data=reg_CT_data, variable=reg_var_data
)
random_data_FC1, random_data_FC2 = calculate_correlations(
    img_data=reg_FC_data, variable=reg_var_data
)
random_data_CSA1, random_data_CSA2 = calculate_correlations(
    img_data=reg_CSA_data, variable=reg_var_data
)

In [19]:
# save the results of correlation analysis
save_path = 'boostrap_correlations/'+trait
# makedirs with exist_ok=True also creates a missing parent folder and does not
# fail when the trait folder already exists.
os.makedirs(save_path, exist_ok=True)

CSA_file_name1 = save_path+'/random_data_CSA1.npy'
CSA_file_name2 = save_path+'/random_data_CSA2.npy'
CT_file_name1 = save_path+'/random_data_CT1.npy'
CT_file_name2 = save_path+'/random_data_CT2.npy'
FC_file_name1 = save_path+'/random_data_FC1.npy'
FC_file_name2 = save_path+'/random_data_FC2.npy'

# One .npy file per imaging type and subsample.
for result_file, result_array in (
    (CSA_file_name1, random_data_CSA1),
    (CSA_file_name2, random_data_CSA2),
    (CT_file_name1, random_data_CT1),
    (CT_file_name2, random_data_CT2),
    (FC_file_name1, random_data_FC1),
    (FC_file_name2, random_data_FC2),
):
    np.save(result_file, result_array)