In [1]:
import pandas as pd
import numpy as np
import os
import random
from scipy.stats import ttest_ind
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# load CSA data
# CSA phenotype columns: 'eid' followed by every entry of the first column of
# Area_items.csv with the suffix '-2.0' appended
area_items = pd.read_csv('raw_data/Area_items.csv')
s1 = ['eid'] + [str(field_id)+'-2.0' for field_id in area_items.iloc[:,0].values]
# read only those columns and keep the rows that have a value in every one of them
data1 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',
    usecols=s1,
).dropna(axis=0,how='any')

In [3]:
# load CT data
# CT phenotype columns: 'eid' followed by every entry of the first column of
# Thickness_items.csv with the suffix '-2.0' appended
thickness_items = pd.read_csv('raw_data/Thickness_items.csv')
s2 = ['eid'] + [str(field_id)+'-2.0' for field_id in thickness_items.iloc[:,0].values]
# read only those columns and keep the rows that have a value in every one of them
data2 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',
    usecols=s2,
).dropna(axis=0,how='any')

In [4]:
# load functional connectivity between 21 networks
fc_csv_path = 'raw_data/ukbb_FC2.csv'
temp_FC_data = pd.read_csv(fc_csv_path)

In [5]:
# load global brain measures (rsfMRI head motion,total gray matter volume,ICV, total CT, average CT, scanning sites)
global_brain_cols = ['eid','25741-2.0','25005-2.0','26521-2.0','26721-2.0','26822-2.0','26755-2.0','26856-2.0','54-2.0']
cov1 = pd.read_csv('/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',usecols=global_brain_cols)
cov1.dropna(how='any',inplace=True)

# coding three scanning sites as 0/1 indicator columns.
# The comparison is done on the whole column at once, so every row always gets a
# value in all three columns: a row whose '54-2.0' code is none of the three
# codes below is coded 0/0/0.  (Appending to per-site lists row by row left the
# lists shorter than cov1 for such rows, and the column assignment then failed.)
site_codes = {'site1': 11025, 'site2': 11026, 'site3': 11027}
for site_col, site_code in site_codes.items():
    cov1[site_col] = (cov1['54-2.0'] == site_code).astype(int)

In [6]:
# function to show the distribution of phenotypes
def plot_distribution(variable,trait,output_path,quartiles=None):
    """Plot the histogram of one phenotype with its quartiles and save it as a PNG.

    Parameters
    ----------
    variable : array-like
        The phenotype values to plot.
    trait : str
        Label of the x axis; also used as the prefix of the output file name.
    output_path : str
        Prefix the file name is appended to (plain string concatenation, so a
        directory must end with a path separator).
    quartiles : sequence of three numbers, optional
        (Q1, Q2, Q3) positions of the vertical lines.  When omitted they are
        computed from ``variable`` as its 0.25, 0.5 and 0.75 quantiles, so the
        function no longer depends on notebook-level ``Q1``/``Q2``/``Q3``
        variables being defined by a previously executed cell.

    The figure is written to ``output_path + trait + '_distribution.png'``.
    """
    if quartiles is None:
        quartiles = np.quantile(variable,[0.25,0.5,0.75])
    q1, q2, q3 = quartiles

    plot_data = pd.DataFrame(data=variable,columns=['value'])
    # displot is a figure-level function that opens its own figure, so no
    # plt.figure() call is made beforehand (it would only leave an empty figure open)
    sns.displot(plot_data, x="value", discrete=True)
    fig = plt.gcf()
    plt.tick_params(direction='out', length=6, width=2,labelsize=15)
    plt.ylabel('Count',fontsize=18)
    plt.xlabel(trait,fontsize=18)
    # quartile markers span from 0 to the highest y tick
    plt.vlines((q1, q2, q3,), 0, plt.yticks()[0][-1], colors = ("r", "black", "b"),
         linestyles = ("solid", "solid", "solid"))
    final_output_path = output_path+trait+'_distribution.png'
    plt.savefig(final_output_path,dpi=300,bbox_inches = 'tight')
    # close (not just clear) the figure so repeated calls do not accumulate open figures
    plt.close(fig)

In [2]:
# visualize the quartiles and show the distribution histograms
pheno_path = 'raw_data/ukbb_phenos/'

# Axis label / output-file prefix for each phenotype file (keyed by file name
# without extension).  Labels are looked up by name because os.listdir returns
# entries in arbitrary order, so a label chosen by list position can end up on
# the wrong phenotype.
# NOTE(review): the stems are assumed to match the ones used for the bootstrap
# analysis ('IQ', 'BMI', 'NS', 'alcohol', 'NM', 'Age') - confirm.  The earlier
# positional label list also contained 'sex' and 'Birth month'; the file stem of
# the birth-month phenotype is not known here, so any stem missing from this
# mapping is labelled with the stem itself - add it once confirmed.
trait_labels = {
    'IQ': 'Fluid intelligence',
    'BMI': 'Body mass index',
    'NS': 'Neuroticism',
    'alcohol': 'Alcohol consumption',
    'NM': 'Numeric memory',
    'Age': 'Age',
}

# make sure the figure folder exists before anything is saved into it
os.makedirs('figures2/', exist_ok=True)

# sorted() gives a stable processing order across machines and runs
tmp_files = sorted(os.listdir(pheno_path))
for file in tmp_files:
    f, extension = os.path.splitext(file)
    # ignore anything that is not a phenotype table (hidden files, sub-folders, ...)
    if extension != '.csv':
        continue
    if f == 'Sex':
        continue
    file_path = os.path.join(pheno_path,file)
    subjs = pd.read_csv('raw_data/ukbb_subjs/'+f+'_subjs.txt',header=None).iloc[:,0].values.tolist()
    data = pd.read_csv(file_path).set_index('eid')
    # first column after 'eid', restricted to the listed subjects
    variable = data.loc[subjs].values[:,0]

    Q1 = np.quantile(variable,0.25)
    Q2 = np.quantile(variable,0.5)
    Q3 = np.quantile(variable,0.75)
    # median, and number / fraction of subjects at or below it
    print(f,": ",Q2,np.sum(variable <= Q2),np.sum(variable <= Q2)/variable.shape[0])
    # quartiles, and number / fraction of subjects in the bottom and top groups
    print(f,": ",Q1,Q2,Q3,np.sum(variable <= Q1),np.sum(variable <= Q1)/variable.shape[0],\
          np.sum(variable >= Q3),np.sum(variable >= Q3)/variable.shape[0])
    plot_distribution(variable,trait_labels.get(f,f),'figures2/')

In [8]:
# index every table by subject ID ('eid') so that rows can be selected per subject with .loc
data11, data22, temp_FC_data2, cov11 = (
    table.set_index('eid') for table in (data1, data2, temp_FC_data, cov1)
)

In [9]:
# build a function to get the covariates
# info1 is the matrix including global brain measures
# info2 is the matrix including age and sex
def get_covariates(info1,info2,cotype):
    """Assemble the covariate table for one type of measure.

    Parameters
    ----------
    info1 : pandas.DataFrame
        Global brain measures; must contain the columns selected for ``cotype``.
    info2 : pandas.DataFrame
        Additional covariates (age and sex), appended as columns.
    cotype : str
        One of 'CSA', 'CT', 'FC' or 'pheno'; selects which columns of ``info1``
        are used.

    Returns
    -------
    pandas.DataFrame
        The selected ``info1`` columns followed by the ``info2`` columns,
        restricted to the index labels present in both (inner join).

    Raises
    ------
    ValueError
        If ``cotype`` is not one of the supported values.
    """
    covariate_columns = {
        'CSA': ['TCSA','site1','site2','site3'],
        'CT': ['ACT','site1','site2','site3'],
        'FC': ['motion','site1','site2','site3'],
        'pheno': ['TCSA','ACT','motion','site1','site2','site3'],
    }
    # fail with a clear message instead of an unbound-variable error further down
    if cotype not in covariate_columns:
        raise ValueError("unknown cotype %r, expected one of %s"
                         % (cotype, sorted(covariate_columns)))

    info11 = info1[covariate_columns[cotype]]
    info = pd.concat([info11,info2], axis=1,join="inner")
    return info

In [10]:
# regressing out the effects of covariates
from sklearn.preprocessing import StandardScaler
def regression_covariant(covariant_matrix, y, standard_scale=True):
    """Remove the linear effect of the covariates from ``y``.

    A least-squares model of ``y`` on the covariate columns plus a constant
    column is fitted.  Only the covariate terms are subtracted from ``y``; the
    fitted constant term is not, so it stays in the residual.

    Parameters
    ----------
    covariant_matrix : 2-D array, one row per observation
    y : 1-D array of the values to adjust
    standard_scale : bool
        When true, the residual is passed through ``StandardScaler`` before
        being returned.

    Returns
    -------
    (residual, w) where ``w`` holds the fitted coefficients, the constant
    term being the last entry.
    """
    n_rows = covariant_matrix.shape[0]
    constant_column = np.ones((n_rows, 1))
    design = np.hstack((covariant_matrix, constant_column))
    w, *_ = np.linalg.lstsq(design, y, rcond=None)

    # w[-1] belongs to the constant column and is left out on purpose here
    covariate_part = covariant_matrix.dot(w[:-1])
    residual = (y - covariate_part).astype('float64')

    if not standard_scale:
        return residual, w

    scaled = StandardScaler().fit_transform(residual.reshape(-1,1)).flatten()
    return scaled, w

# data is the neuroimaging measures or phenotypes
# co is the covariates
def regress_data(data,co):
    """Regress the covariates out of every column of ``data``.

    Each column is adjusted separately with ``regression_covariant`` (without
    standard scaling).  Rows of ``data`` and ``co`` are matched by position.

    Returns a numpy array with the same shape as ``data`` holding the residuals.
    """
    covariate_values = co.values
    table_shape = data.shape
    n_columns = table_shape[1]
    reg_data = np.zeros(table_shape)
    for col in range(n_columns):
        column_values = data.iloc[:,col].values
        # the fitted coefficients are not needed here
        residual, _ = regression_covariant(covariate_values,column_values,standard_scale=False)
        reg_data[:,col] = residual
    return reg_data

In [11]:
# a function to select two groups from the sample
def split_sample(sample,split_type,Q_value):
    """Return the subject IDs of the low and the high group.

    The grouping variable is the second column of ``sample``; the IDs are read
    from its 'eid' column.

    * ``split_type == '0.5'``: ``Q_value[0]`` is the only threshold; values at
      or below it form the first group, values above it the second.
    * any other ``split_type``: values at or below ``Q_value[0]`` form the
      first group, values at or above ``Q_value[1]`` the second.

    Returns ``(subjs1, subjs2)`` as two lists.
    """
    scores = sample.iloc[:,1]

    if split_type == '0.5':
        threshold = Q_value[0]
        low_mask = scores <= threshold
        high_mask = scores > threshold
    else:
        lower_threshold = Q_value[0]
        upper_threshold = Q_value[1]
        low_mask = scores <= lower_threshold
        high_mask = scores >= upper_threshold

    subjs1 = sample.loc[low_mask,'eid'].values.tolist()
    subjs2 = sample.loc[high_mask,'eid'].values.tolist()
    return subjs1,subjs2

In [12]:
# calculate the t statistics between high and low groups
def _draw_disjoint_pair(rng, population, size):
    """Draw two non-overlapping index lists of length ``size`` from ``population``."""
    first = rng.sample(population,size)
    remaining = list(set(population) - set(first))
    second = rng.sample(remaining,size)
    return first,second


def bootstrap_ttest(img_data, len_sample1, len_sample2, seed=None):
    """Repeated two-sample t-tests (high vs. low group) on random subsamples.

    Parameters
    ----------
    img_data : 2-D numpy array
        One row per subject, one column per measure.  The first ``len_sample1``
        rows are the low group, all remaining rows the high group.
    len_sample1, len_sample2 : int
        Sizes of the low and the high group.
    seed : int, optional
        When given, the subsampling uses its own ``random.Random(seed)``
        generator and is reproducible.  When omitted, the module-level
        ``random`` state is used.

    Returns
    -------
    (random_data1, random_data2) : two arrays of shape
        ``(subsampling_times, 100, n_measures, 2)``.  ``subsampling_times`` is
        the smaller group size divided by 100 (rounded down); step ``i`` uses
        ``50*(i+1)`` subjects per group, repeated 100 times.  The last axis
        holds the t statistic (index 0) and the p value (index 1).  Within one
        repetition the two arrays come from two non-overlapping draws of each
        group.
    """
    rng = random.Random(seed) if seed is not None else random

    img_data1 = img_data[:len_sample1,:]
    img_data2 = img_data[len_sample1:,:]

    N = img_data.shape[1]
    subsampling_times = int(min(len_sample1,len_sample2)/100)
    random_num = 100

    random_data1 = np.zeros((subsampling_times,random_num,N,2))
    random_data2 = np.zeros((subsampling_times,random_num,N,2))

    # candidate row positions of each group; built once instead of per repetition
    low_population = list(range(len_sample1))
    high_population = list(range(len_sample2))

    for i in range(subsampling_times):
        print(i)
        random_sample = 50*(i+1)
        for j in range(random_num):
            low_inds1,low_inds2 = _draw_disjoint_pair(rng,low_population,random_sample)
            high_inds1,high_inds2 = _draw_disjoint_pair(rng,high_population,random_sample)

            # axis=0 tests every column in a single call (high minus low)
            t1,p1 = ttest_ind(img_data2[high_inds1,:],img_data1[low_inds1,:],axis=0)
            t2,p2 = ttest_ind(img_data2[high_inds2,:],img_data1[low_inds2,:],axis=0)

            random_data1[i,j,:,0] = t1
            random_data1[i,j,:,1] = p1
            random_data2[i,j,:,0] = t2
            random_data2[i,j,:,1] = p2

    return random_data1,random_data2

In [13]:
# The quartiles that can be used to split the sample
# one row per phenotype file: (file name, Q1, Q2, Q3)
quartile_table = [
    ('IQ',      5,     6,     8),
    ('BMI',     23.46, 25.84, 28.75),
    ('NS',      0,     2,     6),
    ('alcohol', 2,     5,     11),
    ('NM',      6,     6,     8),
    ('Age',     57,    64,    70),
]
files = [row[0] for row in quartile_table]
Q1_values = [row[1] for row in quartile_table]
Q2_values = [row[2] for row in quartile_table]
Q3_values = [row[3] for row in quartile_table]

In [1]:
# split_type = '0.25' (Quartiles) or '0.5' (median values)
split_type = "0.25"

# folder that receives the results of this split type.
# os.makedirs also creates the 'bootstrap_ttest/' parent when it is missing and
# does not fail when the folder already exists.
empty_path = 'bootstrap_ttest/'+"split_"+split_type
os.makedirs(empty_path, exist_ok=True)

pheno_path = 'raw_data/ukbb_phenos/'

# the sex and age tables are the same for every phenotype, so they are read once
Sex = pd.read_csv('raw_data/Sex.csv').set_index('eid')
Age = pd.read_csv('raw_data/ukbb_phenos/Age.csv').set_index('eid')

for i in range(len(files)):
    f = files[i]
    print(f)
    pheno = pd.read_csv(os.path.join(pheno_path,f+'.csv'))

    # get information about age ,sex
    # (sex only when the phenotype itself is age, otherwise age and sex)
    if f == 'Age':
        cov22 = Sex
    else:
        cov22 = pd.concat([Age,Sex], axis=1,join="inner")

    pheno2 = pheno.set_index('eid')
    subjs = pd.read_csv('raw_data/ukbb_subjs/'+f+'_subjs.txt',header=None).iloc[:,0].values.tolist()
    pheno3 = pheno2.loc[subjs].reset_index()

    # use function "split_sample" to get the subjects of two subsamples
    if split_type == "0.5":
        Q = [Q2_values[i]]
    else:
        Q = [Q1_values[i],Q3_values[i]]

    l1,l2 = split_sample(pheno3,split_type,Q)
    # low group first, high group second: bootstrap_ttest relies on this row order
    l = l1 + l2
    print("sample1 size: ",len(l1)/len(subjs),"sample2 size: ", len(l2)/len(subjs))

    global_data = cov11.loc[l]
    age_and_sex = cov22.loc[l]

    # transfer the column names of global brain measures
    global_data2 = pd.DataFrame({"eid" : global_data.index})
    global_data2['motion'] = global_data['25741-2.0'].values
    global_data2['TCSA'] = global_data['26721-2.0'].values + global_data['26822-2.0'].values
    global_data2['ACT'] = global_data['26755-2.0'].values + global_data['26856-2.0'].values
    global_data2['site1'] = global_data['site1'].values
    global_data2['site2'] = global_data['site2'].values
    global_data2['site3'] = global_data['site3'].values
    global_data2.set_index('eid',inplace=True)

    save_path = empty_path+'/'+f
    os.makedirs(save_path, exist_ok=True)

    # the same three steps for every brain measure: regress out the covariates,
    # run the bootstrap t-tests, save both result arrays
    brain_measures = (
        ('CSA', data11.loc[l]),
        ('CT', data22.loc[l]),
        ('FC', temp_FC_data2.loc[l]),
    )
    for measure, measure_data in brain_measures:
        measure_co = get_covariates(global_data2,age_and_sex,measure)
        reg_measure_data = regress_data(measure_data,measure_co)

        random_data1,random_data2 = bootstrap_ttest(reg_measure_data,len(l1),len(l2))

        np.save(save_path+'/random_data_'+measure+'1.npy',random_data1)
        np.save(save_path+'/random_data_'+measure+'2.npy',random_data2)
