In [1]:
import pandas as pd
import numpy as np
import os
import random
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr,spearmanr
from sklearn.cross_decomposition import PLSRegression

In [2]:
# Load the CSA measures.
# The first column of Area_items.csv supplies the field identifiers; each one
# gets the '-2.0' suffix to form a column name of the main table.
area_items = pd.read_csv('raw_data/Area_items.csv')
s1 = ['eid'] + [str(field_id) + '-2.0' for field_id in area_items.iloc[:, 0]]
data1 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',
    usecols=s1,
)
# keep only the rows where every requested column is present
data1.dropna(axis=0, how='any', inplace=True)

# Load the CT measures the same way, driven by Thickness_items.csv.
thickness_items = pd.read_csv('raw_data/Thickness_items.csv')
s2 = ['eid'] + [str(field_id) + '-2.0' for field_id in thickness_items.iloc[:, 0]]
data2 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',
    usecols=s2,
)
data2.dropna(axis=0, how='any', inplace=True)

# Load functional connectivity between 21 networks.
temp_FC_data = pd.read_csv('raw_data/ukbb_FC2.csv')

In [3]:
# load global brain measures (rsfMRI head motion,total gray matter volume,ICV, total CT, average CT, scanning sites)
global_brain_cols = ['eid','25741-2.0','25005-2.0','26521-2.0','26721-2.0','26822-2.0','26755-2.0','26856-2.0','54-2.0']
cov1 = pd.read_csv('/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',usecols=global_brain_cols)
# keep only rows where every global measure is present
cov1.dropna(how='any',inplace=True)

# Code the three scanning sites as 0/1 indicator columns derived from the
# centre code in column '54-2.0'. The comparison is vectorized over the whole
# column, so each indicator always has exactly one value per row of cov1.
# A row whose centre code matches none of the three codes gets 0 in every
# indicator instead of breaking the column assignment.
for site_column, site_code in (('site1', 11025), ('site2', 11026), ('site3', 11027)):
    cov1[site_column] = (cov1['54-2.0'] == site_code).astype('int64')

In [4]:
# build a function to get the covariates
# info1 is the matrix including global brain measures
# info2 is the matrix including age and sex
def get_covariates(info1,info2,cotype):
    """Assemble the covariate table for one kind of measure.

    Parameters
    ----------
    info1 : pandas.DataFrame
        Global brain measures; must contain the columns selected for `cotype`
        (drawn from 'TCSA', 'ACT', 'motion', 'site1', 'site2', 'site3').
    info2 : pandas.DataFrame
        Additional covariates (the caller passes age and sex) that are
        appended column-wise.
    cotype : str
        One of 'CSA', 'CT', 'FC' or 'pheno'; selects which columns of `info1`
        are used.

    Returns
    -------
    pandas.DataFrame
        The selected `info1` columns followed by the columns of `info2`,
        restricted to the index labels present in both (inner join).

    Raises
    ------
    ValueError
        If `cotype` is not one of the supported names.
    """
    columns_by_type = {
        'CSA': ['TCSA','site1','site2','site3'],
        'CT': ['ACT','site1','site2','site3'],
        'FC': ['motion','site1','site2','site3'],
        'pheno': ['TCSA','ACT','motion','site1','site2','site3'],
    }
    if cotype not in columns_by_type:
        # Fail with a clear message instead of an unbound local variable.
        raise ValueError(
            "unknown cotype %r; expected one of %s" % (cotype, sorted(columns_by_type))
        )

    info11 = info1[columns_by_type[cotype]]
    info = pd.concat([info11,info2], axis=1,join="inner")
    return info

In [5]:
# regressing out the effects of covariates
from sklearn.preprocessing import StandardScaler
def regression_covariant(covariant_matrix, y, standard_scale=True):
    """Remove the linear contribution of the covariates from `y`.

    A least-squares model `y ~ covariant_matrix + intercept` is fitted, and
    the fitted covariate part (slopes only) is subtracted from `y`. The
    intercept term is not subtracted, so without standardisation the result
    still carries the fitted constant offset.

    Parameters
    ----------
    covariant_matrix : 2-D numpy array, one row per sample.
    y : 1-D numpy array with one value per sample.
    standard_scale : bool
        When True the residual is passed through `StandardScaler` before
        being returned.

    Returns
    -------
    (residual, w) : the float64 residual vector and the full least-squares
    solution, whose last entry is the intercept coefficient.
    """
    n_samples = covariant_matrix.shape[0]
    intercept_column = np.ones((n_samples, 1))
    design = np.hstack((covariant_matrix, intercept_column))
    w = np.linalg.lstsq(design, y, rcond=None)[0]

    # Only the covariate slopes are removed; w[-1] is the intercept.
    slopes = w[:-1]
    residual = (y - covariant_matrix.dot(slopes)).astype('float64')

    if not standard_scale:
        return residual, w

    scaled = StandardScaler().fit_transform(residual.reshape(-1,1)).flatten()
    return scaled, w

# data is the neuroimaging measures or phenotypes
# co is the covariates
def regress_data(data,co):
    """Residualise every column of `data` against the covariates in `co`.

    Parameters
    ----------
    data : pandas.DataFrame
        Measures to clean; columns are processed one at a time by position.
    co : pandas.DataFrame
        Covariates; its `.values` array is used as the covariate matrix, so
        its rows are expected to be in the same order as those of `data`.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `data`, holding for each column the
        residual returned by `regression_covariant(..., standard_scale=True)`.
    """
    covariate_values = co.values
    out_shape = data.shape
    residualised = np.zeros(out_shape)
    for col in range(out_shape[1]):
        column_values = data.iloc[:, col].values
        # the fitted coefficients are not needed here
        column_residual, _ = regression_covariant(
            covariate_values, column_values, standard_scale=True
        )
        residualised[:, col] = column_residual
    return residualised

In [31]:
# bootstrapping PLS regression analysis
# img_data is the neuroimaging measures (X)
# variable is the phenotype measure (y)
from tqdm import tqdm
 
def bootstrap_pls_regression(img_data, variable,mytrait,img_type,output_path,seed=None):
    """Repeated split-half, one-component PLS regression over growing subsample sizes.

    For each subsample size 100, 200, ..., 100*int(n_rows/200), two disjoint
    random sets of rows are drawn 100 times. A one-component PLSRegression is
    fitted on the first set and evaluated on the second; a second model is
    fitted on the second set only to record its X weights.

    Parameters
    ----------
    img_data : 2-D numpy array
        Imaging measures (X); rows are samples, indexed as img_data[rows, :].
    variable : 2-D numpy array
        Target measure; only column 0 is used (y).
    mytrait : str
        Name of the sub-directory created under `output_path`; also shown in
        the progress-bar label.
    img_type : str
        Prefix of the output file names; also shown in the progress-bar label.
    output_path : str
        Concatenated directly with `mytrait`, so it should end with a path
        separator.
    seed : optional
        When given, the row draws come from a private `random.Random(seed)`
        and are repeatable. When None (default) the module-level `random`
        generator is used.

    Files written to output_path+mytrait+'/'
    ----------------------------------------
    <img_type>_r2_score.csv        mean and std (over the 100 draws) of
                                   pls.score on the second set, one row per
                                   subsample size.
    <img_type>_pls1_corr.npy       [r1, p1, r2, p2]: Pearson correlation of
                                   X- and y-scores on the first and second set.
    <img_type>_pls1_componenets.npy  X-scores of the first (index 0) and
                                   second (index 1) set; only the first
                                   `subsample size` entries along the sample
                                   axis are filled, the rest stay 0.
    <img_type>_pls1_weights.npy    X weights of the model fitted on the first
                                   (index 0) and on the second (index 1) set.
    """
    s = img_data.shape
    # the subsample sizes:100,200,300,400,...s[0]/2
    subsampling_times = int(s[0]/200)
    # the random selection times
    random_num = 100

    # Private generator when a seed is supplied, otherwise the shared module state.
    rng = random.Random(seed) if seed is not None else random

    # create empty arrays
    bootstrap_r2_score = np.zeros((subsampling_times,2))
    bootstrap_pls1_weights = np.zeros((subsampling_times,random_num,s[1],2))
    # NOTE: the sample axis is allocated at full length s[0] although at most
    # 100*subsampling_times entries are ever written; kept so the saved array
    # shape does not change for downstream readers.
    bootstrap_pls1_components = np.zeros((subsampling_times,random_num,s[0],2))
    bootstrap_pls1_corr = np.zeros((subsampling_times,random_num,4))

    # The pool of row indices does not change between draws: build it once.
    total_list = np.arange(s[0]).tolist()

    for i in tqdm (range(subsampling_times), desc=mytrait + ':' + img_type + " Bootstrapping..."):
        #subsample size
        random_sample = 100*(i+1)
        tmp_r2_scores = []
        for j in range(random_num):
            # Draw 2*k distinct rows at once and split them in half: this gives
            # two non-overlapping uniformly random subsamples of size k without
            # rebuilding a set difference on every draw.
            drawn = rng.sample(total_list,2*random_sample)
            random_inds1 = drawn[:random_sample]
            random_inds2 = drawn[random_sample:]
            X1 = img_data[random_inds1,:]
            X2 = img_data[random_inds2,:]
            m1 = variable[random_inds1,0]
            m2 = variable[random_inds2,0]

            # PLS regression
            pls = PLSRegression(n_components=1)
            pls.fit(X1, m1)
            X1_r, m1_r = pls.transform(X1, m1)
            X2_r, m2_r = pls.transform(X2, m2)
            # The y-scores may come back with a trailing singleton axis;
            # flatten them so pearsonr always receives two 1-D vectors.
            r1,p1 = pearsonr(X1_r[:,0],np.ravel(m1_r))
            r2,p2 = pearsonr(X2_r[:,0],np.ravel(m2_r))
            bootstrap_pls1_corr[i,j,:] = [r1,p1,r2,p2]
            bootstrap_pls1_components[i,j,:random_sample,0] = X1_r[:,0]
            bootstrap_pls1_components[i,j,:random_sample,1] = X2_r[:,0]
            bootstrap_pls1_weights[i,j,:,0] = pls.x_weights_[:,0]
            tmp_r2_scores.append(pls.score(X2,m2))

            # second model, fitted on the other half, only for its weights
            pls2 = PLSRegression(n_components=1)
            pls2.fit(X2, m2)
            bootstrap_pls1_weights[i,j,:,1] = pls2.x_weights_[:,0]

        bootstrap_r2_score[i,0] = np.mean(tmp_r2_scores)
        bootstrap_r2_score[i,1] = np.std(tmp_r2_scores)

    # Create the trait directory together with any missing parents; an
    # already existing directory is not an error.
    os.makedirs(output_path+mytrait, exist_ok=True)

    r2_score_file = output_path+mytrait+'/'+ img_type +'_r2_score.csv'
    df = pd.DataFrame(data=bootstrap_r2_score)
    df.to_csv(r2_score_file,index=False)

    pls1_corr_file = output_path+mytrait+'/'+ img_type +'_pls1_corr.npy'
    np.save(pls1_corr_file,bootstrap_pls1_corr)

    # file name spelling kept as is: existing outputs and readers use it
    pls1_componenets_file = output_path+mytrait+'/'+ img_type +'_pls1_componenets.npy'
    np.save(pls1_componenets_file,bootstrap_pls1_components)

    pls1_weights_file = output_path+mytrait+'/'+ img_type +'_pls1_weights.npy'
    np.save(pls1_weights_file,bootstrap_pls1_weights)
    return

In [None]:
# define the target traits ('Age','BMI','IQ','NM','alcohol','NS','BM')
traits = ['Age','BMI','IQ','NM','alcohol','NS','BM']
for trait in traits:
    # Load the trait table; its second column holds the trait values and
    # rows with a negative value are discarded.
    pheno_path = 'raw_data/ukbb_phenos/'
    pheno = pd.read_csv(os.path.join(pheno_path, trait + '.csv'))
    pheno_colname = pheno.columns.values[1]
    pheno = pheno[pheno[pheno_colname] >= 0]

    # Age/sex covariates: when age itself is the target only sex is used.
    if trait != 'Age':
        Sex = pd.read_csv('raw_data/Sex.csv').set_index('eid')
        Age = pd.read_csv('raw_data/ukbb_phenos/Age.csv').set_index('eid')
        cov2 = pd.concat([Age, Sex], axis=1, join="inner").reset_index()
    else:
        cov2 = pd.read_csv('raw_data/Sex.csv')

    # Index every table by subject id so they can be aligned.
    data11 = data1.set_index('eid')
    data22 = data2.set_index('eid')
    temp_FC_data2 = temp_FC_data.set_index('eid')
    cov11 = cov1.set_index('eid')
    cov22 = cov2.set_index('eid')
    pheno2 = pheno.set_index('eid')

    # Subject ids present in all six tables (intersected left to right).
    shared_ids = set(data11.index)
    for other_table in (data22, temp_FC_data2, cov11, cov22, pheno2):
        shared_ids = shared_ids & set(other_table.index)
    l = list(shared_ids)

    # Same subjects, same row order, in every table.
    CSA_data = data11.loc[l]
    CT_data = data22.loc[l]
    FC_data = temp_FC_data2.loc[l]
    global_data = cov11.loc[l]
    age_and_sex = cov22.loc[l]
    Y = pheno2.loc[l]

    # Rename the global brain measures; TCSA and ACT are each the sum of two
    # source columns.
    global_data2 = pd.DataFrame({"eid": global_data.index}).assign(
        motion=global_data['25741-2.0'].values,
        TCSA=global_data['26721-2.0'].values + global_data['26822-2.0'].values,
        ACT=global_data['26755-2.0'].values + global_data['26856-2.0'].values,
        site1=global_data['site1'].values,
        site2=global_data['site2'].values,
        site3=global_data['site3'].values,
    ).set_index('eid')

    # Covariate tables for each kind of brain measure and for the phenotype.
    CSA_co = get_covariates(global_data2, age_and_sex, 'CSA')
    CT_co = get_covariates(global_data2, age_and_sex, 'CT')
    FC_co = get_covariates(global_data2, age_and_sex, 'FC')
    pheno_co = get_covariates(global_data2, age_and_sex, 'pheno')

    # Regress the covariates out of the brain measures and the variable.
    reg_CSA_data = regress_data(CSA_data, CSA_co)
    reg_CT_data = regress_data(CT_data, CT_co)
    reg_FC_data = regress_data(FC_data, FC_co)
    reg_var_data = regress_data(Y, pheno_co)

    # One bootstrap run per imaging modality, in the order CSA, CT, FC.
    for img_type, reg_img_data in (('CSA', reg_CSA_data),
                                   ('CT', reg_CT_data),
                                   ('FC', reg_FC_data)):
        bootstrap_pls_regression(reg_img_data, reg_var_data, trait, img_type, 'pls_regression/')
    

Age:CSA Bootstrapping...: 100%|██████████| 187/187 [1:02:24<00:00, 20.02s/it]
Age:CT Bootstrapping...: 100%|██████████| 187/187 [1:02:35<00:00, 20.08s/it]
Age:FC Bootstrapping...: 100%|██████████| 187/187 [2:41:08<00:00, 51.70s/it]   
BMI:CSA Bootstrapping...: 100%|██████████| 180/180 [55:45<00:00, 18.58s/it]
BMI:CT Bootstrapping...: 100%|██████████| 180/180 [57:07<00:00, 19.04s/it]
BMI:FC Bootstrapping...: 100%|██████████| 180/180 [2:25:40<00:00, 48.56s/it]  
IQ:CSA Bootstrapping...: 100%|██████████| 172/172 [53:40<00:00, 18.72s/it]
IQ:CT Bootstrapping...: 100%|██████████| 172/172 [56:35<00:00, 19.74s/it]
IQ:FC Bootstrapping...: 100%|██████████| 172/172 [2:17:33<00:00, 47.99s/it]  
NM:CSA Bootstrapping...: 100%|██████████| 126/126 [17:03<00:00,  8.13s/it]
NM:CT Bootstrapping...: 100%|██████████| 126/126 [15:53<00:00,  7.57s/it]
NM:FC Bootstrapping...: 100%|██████████| 126/126 [57:56<00:00, 27.59s/it]
alcohol:CSA Bootstrapping...: 100%|██████████| 171/171 [51:15<00:00, 17.99s/it]
alcoh