In [14]:
import pandas as pd
import numpy as np
import os
import random
from scipy.stats import pearsonr,spearmanr
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the CSA phenotypes.
# The first column of Area_items.csv supplies the field identifiers; each one
# gets the '-2.0' suffix to form the column name requested from the main CSV.
area_items = pd.read_csv('raw_data/Area_items.csv')
s1 = ['eid'] + [str(field_id) + '-2.0' for field_id in area_items.iloc[:, 0]]
data1 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',
    usecols=s1,
)
# Keep only the rows in which every requested column is present.
data1 = data1.dropna(axis=0, how='any')

In [3]:
# Load the CT phenotypes.
# The first column of Thickness_items.csv supplies the field identifiers; each
# one gets the '-2.0' suffix to form the column name requested from the main CSV.
thickness_items = pd.read_csv('raw_data/Thickness_items.csv')
s2 = ['eid'] + [str(field_id) + '-2.0' for field_id in thickness_items.iloc[:, 0]]
data2 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',
    usecols=s2,
)
# Keep only the rows in which every requested column is present.
data2 = data2.dropna(axis=0, how='any')

In [4]:
# Functional connectivity between 21 networks, read from a local CSV table.
temp_FC_data = pd.read_csv(filepath_or_buffer='raw_data/ukbb_FC2.csv')

In [5]:
# Load global brain measures (rsfMRI head motion, total gray matter volume, ICV,
# total CT, average CT, scanning sites) from the main CSV and drop every row
# that is missing any of them.
global_brain_cols = ['eid','25741-2.0','25005-2.0','26521-2.0','26721-2.0','26822-2.0','26755-2.0','26856-2.0','54-2.0']
cov1 = pd.read_csv('/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv',usecols=global_brain_cols)
cov1 = cov1.dropna(how='any')

# One-hot code the three scanning sites stored in column '54-2.0'.
site_codes = cov1['54-2.0']

# Fail early, with the offending values, if a row carries a code other than the
# three handled below. Such a row would otherwise leave the indicator columns
# inconsistent with the table.
unexpected_codes = sorted(set(site_codes.unique()) - {11025, 11026, 11027})
if unexpected_codes:
    raise ValueError(
        "Unexpected scanning-site code(s) in column '54-2.0': %s" % unexpected_codes
    )

# Vectorized comparison instead of a per-row Python loop; the lists hold 0/1 ints.
sites1 = (site_codes == 11025).astype('int64').tolist()
sites2 = (site_codes == 11026).astype('int64').tolist()
sites3 = (site_codes == 11027).astype('int64').tolist()
cov1['site1'] = sites1
cov1['site2'] = sites2
cov1['site3'] = sites3

In [6]:
# Gather the global brain covariates under readable column names, indexed by 'eid'.
# 'TCSA' and 'ACT' are each the sum of two source columns of cov1.
global_data = pd.DataFrame(
    {
        'motion': cov1['25741-2.0'].values,
        'TCSA': cov1['26721-2.0'].values + cov1['26822-2.0'].values,
        'ACT': cov1['26755-2.0'].values + cov1['26856-2.0'].values,
        'site1': cov1['site1'].values,
        'site2': cov1['site2'].values,
        'site3': cov1['site3'].values,
    },
    index=pd.Index(cov1['eid'].values, name='eid'),
)

In [7]:
# Re-index each imaging table by its 'eid' column so that rows can later be
# selected with a list of subject ids.
data11, data22, temp_FC_data2 = (
    frame.set_index('eid') for frame in (data1, data2, temp_FC_data)
)

In [8]:
# build a function to get the covariates
# info1 is the matrix including global brain measures
# info2 is the matrix including age and sex
def get_covariates(info1,info2,cotype):
    """Assemble the covariate table for one kind of measure.

    Parameters
    ----------
    info1 : pandas.DataFrame
        Table of global brain measures; must contain the columns selected
        for ``cotype`` (see below).
    info2 : pandas.DataFrame
        Table of additional covariates (age and sex in this notebook),
        indexed like ``info1``.
    cotype : str
        One of ``'CSA'``, ``'CT'``, ``'FC'`` or ``'pheno'``. It decides which
        columns of ``info1`` are used.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``info1`` followed by all columns of
        ``info2``, restricted to the index labels present in both.

    Raises
    ------
    ValueError
        If ``cotype`` is not one of the supported values.
    """
    site_items = ['site1','site2','site3']
    items_by_type = {
        'CSA': ['TCSA'] + site_items,
        'CT': ['ACT'] + site_items,
        'FC': ['motion'] + site_items,
        'pheno': ['TCSA','ACT','motion'] + site_items,
    }

    # An unrecognised type used to surface as an UnboundLocalError further
    # down; report it explicitly together with the accepted values.
    if cotype not in items_by_type:
        raise ValueError(
            "Unknown cotype %r; expected one of %s" % (cotype, sorted(items_by_type))
        )

    selected = info1[items_by_type[cotype]]
    return pd.concat([selected,info2], axis=1,join="inner")

In [9]:
# regressing out the effects of covariates
from sklearn.preprocessing import StandardScaler
def regression_covariant(covariant_matrix, y, standard_scale=True):
    """Remove the linear effect of covariates from ``y``.

    A least-squares model with an intercept column is fitted, but only the
    covariate terms are subtracted from ``y``; the fitted intercept is not
    subtracted.

    Parameters
    ----------
    covariant_matrix : numpy.ndarray
        2-D array with one row per observation and one column per covariate.
    y : numpy.ndarray
        Values to be adjusted, one per row of ``covariant_matrix``.
    standard_scale : bool, default True
        When true, the residual is passed through ``StandardScaler`` and
        returned flattened.

    Returns
    -------
    residual : numpy.ndarray
        ``y`` minus the fitted covariate contribution, as float64.
    w : numpy.ndarray
        Fitted coefficients; the last entry belongs to the intercept column.
    """
    n_rows = covariant_matrix.shape[0]
    intercept_column = np.ones((n_rows, 1))
    design = np.concatenate((covariant_matrix, intercept_column), axis=1)

    w, *_ = np.linalg.lstsq(design, y, rcond=None)

    # w[:-1] leaves out the intercept coefficient on purpose.
    covariate_effect = covariant_matrix.dot(w[:-1])
    residual = (y - covariate_effect).astype('float64')

    if not standard_scale:
        return residual, w

    scaled = StandardScaler().fit_transform(residual.reshape(-1,1)).flatten()
    return scaled, w

# data is the neuroimaging measures or phenotypes
# co is the covariates
def regress_data(data,co):
    """Adjust every column of ``data`` for the covariates in ``co``.

    Parameters
    ----------
    data : pandas.DataFrame
        Neuroimaging measures or phenotypes, one column per variable.
    co : pandas.DataFrame
        Covariates, with rows in the same order as ``data``.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``data`` that holds, column by column, the
        unscaled residual returned by ``regression_covariant``.
    """
    covariates = co.values
    n_rows, n_cols = data.shape
    adjusted = np.zeros((n_rows, n_cols))
    for col in range(n_cols):
        column_values = data.iloc[:, col].values
        # The fitted weights are not needed here, only the residual.
        adjusted[:, col], _ = regression_covariant(
            covariates, column_values, standard_scale=False
        )
    return adjusted

In [10]:
# bootstrapping feature selection using random forest regression
def compute_feature_importances(imaging_data,variable,random_num=50,
                                n_estimators=20,random_state=None):
    """Estimate random-forest feature importances on paired disjoint subsamples.

    For every subsample size in a fixed ladder (see ``_subsample_sizes``) and
    for ``random_num`` repeats, two non-overlapping sets of rows of that size
    are drawn, a ``RandomForestRegressor`` is fitted on each, and the feature
    importances of both forests are stored. Each subsample size is printed
    when its repeats start.

    Parameters
    ----------
    imaging_data : array-like, shape (n_subjects, n_features)
        Predictor matrix.
    variable : array-like, shape (n_subjects,) or (n_subjects, 1)
        Target values, one row per row of ``imaging_data``.
    random_num : int, default 50
        Number of repeated draws per subsample size.
    n_estimators : int, default 20
        Number of trees in each forest.
    random_state : int or None, default None
        ``None`` draws from the global ``random`` module and lets the forests
        use their default randomness (the previous behaviour). An integer
        seeds a private generator for both the row draws and the forests, so
        the result is repeatable.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_sizes, random_num, n_features, 2)``; the last axis
        indexes the two disjoint subsamples of each draw. ``n_sizes`` is 0
        when there are fewer than 200 rows.

    Raises
    ------
    ValueError
        If ``imaging_data`` and ``variable`` have different numbers of rows.
    """
    features = np.asarray(imaging_data)
    target = np.asarray(variable)
    n_subjects, n_features = features.shape[0], features.shape[1]

    if target.shape[0] != n_subjects:
        raise ValueError(
            "imaging_data has %d rows but variable has %d"
            % (n_subjects, target.shape[0])
        )
    # A single-column 2-D target is flattened so the forest receives the 1-D
    # target it expects for single-output regression.
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    sizes = _subsample_sizes(n_subjects)
    stability = np.zeros((len(sizes),random_num,n_features,2))

    # `random` (the module) and `random.Random` both expose sample()/randrange().
    rng = random if random_state is None else random.Random(random_state)

    # Loop-invariant: the full index list and its set form are built once.
    all_indices = list(range(n_subjects))
    all_index_set = set(all_indices)

    for i, size in enumerate(sizes):
        print(size)
        for j in range(random_num):
            first_half = rng.sample(all_indices,size)
            # The second draw comes only from rows not used by the first one.
            remaining = list(all_index_set - set(first_half))
            second_half = rng.sample(remaining,size)

            for half, row_inds in enumerate((first_half, second_half)):
                forest_seed = None if random_state is None else rng.randrange(2**32)
                forest = RandomForestRegressor(n_estimators=n_estimators,
                                               random_state=forest_seed)
                forest.fit(features[row_inds,:],target[row_inds])
                stability[i,j,:,half] = forest.feature_importances_

    return stability


def _subsample_sizes(n_subjects):
    """Return the subsample sizes: 100..1000 in steps of 100, then 2000, 3000, ...
    with the final entry capped at half of ``n_subjects``."""
    sizes = []
    for step in range(int(n_subjects/200)):
        if step < 10:
            sizes.append(int(100*(step+1)))
            continue
        candidate = 1000 + 1000*(step-9)
        if candidate < n_subjects/2:
            sizes.append(candidate)
        else:
            # Two disjoint subsamples cannot exceed half of the rows each.
            sizes.append(int(n_subjects/2))
            break
    return sizes

In [1]:
# Run the bootstrap feature selection for every phenotype and save, per
# phenotype, one importance array for each imaging modality (CSA, CT, FC).
files = ['IQ','BMI','NS','alcohol','NM','Age','BM']
empty_path = 'bootstrap_feature_selection/'
pheno_path = 'raw_data/ukbb_phenos/'

# Fixed seed: the subsampling uses the global `random` generator and the
# forests fall back to numpy's global generator when given no random_state,
# so seeding both makes a re-run reproduce the saved arrays.
bootstrap_seed = 0

# The age and sex tables do not depend on the phenotype; read them once.
Sex = pd.read_csv('raw_data/Sex.csv').set_index('eid')
Age = pd.read_csv('raw_data/ukbb_phenos/Age.csv').set_index('eid')

for f in files:
    print(f)
    # Re-seed per phenotype so each one can be re-run on its own.
    random.seed(bootstrap_seed)
    np.random.seed(bootstrap_seed)

    pheno = pd.read_csv(os.path.join(pheno_path,f+'.csv'))

    # get information about age, sex
    # (when the phenotype is Age itself, only sex is used as a covariate)
    if f == 'Age':
        cov22 = Sex
    else:
        cov22 = pd.concat([Age,Sex], axis=1,join="inner")

    pheno2 = pheno.set_index('eid')
    subjs = pd.read_csv('raw_data/ukbb_subjs/'+f+'_subjs.txt',header=None).iloc[:,0].values.tolist()

    # Select the same subjects, in the same order, from every table.
    CSA_data = data11.loc[subjs]
    CT_data = data22.loc[subjs]
    FC_data = temp_FC_data2.loc[subjs]
    global_data2 = global_data.loc[subjs]
    age_and_sex = cov22.loc[subjs]
    Y = pheno2.loc[subjs]

    # regress out the covariates for brain measures and the variable
    CSA_co = get_covariates(global_data2,age_and_sex,'CSA')
    reg_CSA_data = regress_data(CSA_data,CSA_co)

    CT_co = get_covariates(global_data2,age_and_sex,'CT')
    reg_CT_data = regress_data(CT_data,CT_co)

    FC_co = get_covariates(global_data2,age_and_sex,'FC')
    reg_FC_data = regress_data(FC_data,FC_co)

    pheno_co = get_covariates(global_data2,age_and_sex,'pheno')
    # 'BM' is used as-is, without covariate regression.
    if f == 'BM':
        reg_var_data = Y.values
    else:
        reg_var_data = regress_data(Y,pheno_co)

    CT_selection_stability = compute_feature_importances(reg_CT_data,reg_var_data)
    CSA_selection_stability = compute_feature_importances(reg_CSA_data,reg_var_data)
    FC_selection_stability = compute_feature_importances(reg_FC_data,reg_var_data)

    # makedirs also creates the parent output directory when it is missing and
    # does not fail if the phenotype directory already exists.
    save_path = os.path.join(empty_path,f)
    os.makedirs(save_path,exist_ok=True)

    CSA_file_name = save_path +'/feature_importances_CSA.npy'
    CT_file_name = save_path+'/feature_importances_CT.npy'
    FC_file_name = save_path+'/feature_importances_FC.npy'

    np.save(CSA_file_name,CSA_selection_stability)
    np.save(CT_file_name,CT_selection_stability)
    np.save(FC_file_name,FC_selection_stability)