In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import os

In [2]:
# Load risky behaviours, IQ, and education (all "-2.0" columns of the
# phenotype CSV).
# Only the columns listed in s1 are parsed (usecols), matching how the other
# CSVs in this notebook are loaded, instead of reading every column of the
# file just to discard most of them. Note that temp1 therefore holds only the
# s1 columns.
s1 = ['eid','1100-2.0','2040-2.0','2149-2.0','20160-2.0','20016-2.0','6138-2.0']
temp1 = pd.read_csv('/dbstore/ukbb_cognition/ukb669679.csv', usecols=s1)
# usecols returns columns in file order, so re-select to get the s1 order.
# Keep complete cases only and one row per participant; building a new frame
# avoids in-place mutation of a selection taken from temp1.
pheno1 = temp1[s1].dropna(how='any').drop_duplicates(subset=['eid'])

In [3]:
# Filtering procedure:
# - remove subjects with values of -1 (Do not know) or -3 (Prefer not to
#   answer) in fields 1100, 2040, 2149 and 20160 (all negative codes fail
#   the "> -0.5" test)
# - 1100 (driving): additionally remove 5 (do not drive)
# Fields 20016 and 6138 are not filtered in this cell.
answered = (pheno1[['1100-2.0', '2040-2.0', '2149-2.0', '20160-2.0']] > -0.5).all(axis=1)
drives = pheno1['1100-2.0'] < 5
pheno1 = pheno1[answered & drives]

In [8]:
# Load alcoholic drinks per week; the generic "value" column is renamed to "Alc".
pheno2 = pd.read_csv('raw_data/alcohol.csv').rename(columns={"value": "Alc"})

In [9]:
# Transformation of educational attainment (field 6138) into the numeric
# values used downstream:
#  1  College or University degree                            --> 20
#  2  A levels/AS levels or equivalent                        --> 13
#  3  O levels/GCSEs or equivalent                            --> 10
#  4  CSEs or equivalent                                      --> 10
#  5  NVQ or HND or HNC or equivalent                         --> 19
#  6  Other professional qualifications eg: nursing, teaching --> 15
# -7  None of the above                                       --> 7
# -3  Prefer not to answer                                    --> row removed
#
# Done with vectorised operations rather than a per-row loop writing through
# chained indexing (pheno1[col].iloc[i] = ...): the loop is slow and the
# chained write is not guaranteed to reach pheno1 (it does nothing under
# pandas copy-on-write).
edu_years = {1: 20, 2: 13, 3: 10, 4: 10, 5: 19, 6: 15, -7: 7}

# Index labels of the "Prefer not to answer" rows (kept as `dl`, as before).
dl = pheno1.index[pheno1['6138-2.0'] == -3].tolist()
pheno1 = pheno1.drop(dl)

# replace() leaves any value that is not a key of the mapping unchanged, as
# the original if-chain did. No mapped value is itself a key, so re-running
# this cell does not recode twice.
pheno1['6138-2.0'] = pheno1['6138-2.0'].replace(edu_years)

In [6]:
# Merge pheno1 and pheno2 on participant id, keeping only participants that
# appear in both tables.
# (A pd.concat(...) on the eid index used to run first, but its result was
# immediately overwritten by this merge, so it was dead work and is removed.)
mdata = pd.merge(pheno1,pheno2,on='eid',how='inner')

# Fix the column order of the behavioural table.
s3 = ['eid','1100-2.0','2040-2.0','2149-2.0','20160-2.0','Alc','20016-2.0','6138-2.0']
mdata = mdata[s3]
print("Number of individuals that report risky behaviors at the imaging visit:",mdata.shape[0])

Number of individuals that report risky behaviors at the imaging visit: 36439


In [8]:
def _visit2_columns(items):
    """Return ['eid'] followed by '<id>-2.0' for every entry in the first column of `items`."""
    return ['eid'] + [str(field_id) + '-2.0' for field_id in items.iloc[:, 0]]


# load CSA data: the field list comes from DK_CSA_items.csv; rows with any
# missing value are dropped
area_items1 = pd.read_csv('raw_data/DK_CSA_items.csv')
s1 = _visit2_columns(area_items1)
data1 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv', usecols=s1
).dropna(axis=0, how='any')


# load CT data: same procedure with the field list from DK_CT_items.csv
thickness_items1 = pd.read_csv('raw_data/DK_CT_items.csv')
s2 = _visit2_columns(thickness_items1)
data2 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv', usecols=s2
).dropna(axis=0, how='any')

In [11]:
# Select participants with both structural and functional imaging data.
# (`os` is imported in the first cell of the notebook.)
fMRI_path = '/dbstore/UKBiobank/RSFMRI'
files = os.listdir(fMRI_path)

# Keep archives whose name ends in '2_0.zip' and whose size exceeds the
# threshold below.
# NOTE(review): the size cut-off presumably excludes incomplete or truncated
# downloads; confirm where 400000654 comes from.
# The first 7 characters of the file name are parsed as the participant id
# (assumes 7-digit eids; TODO confirm).
rsfmri_list = []
for f in files:
    if not f.endswith('2_0.zip'):
        continue
    file_path = os.path.join(fMRI_path,f)
    fMRI_size = os.path.getsize(file_path)
    if fMRI_size > 400000654:
        rsfmri_list.append(int(f[:7]))

smri_list = list(data1['eid'].values)
# data1 (CSA) and data2 (CT) each dropped their own incomplete rows, so a
# participant can be in one table and not the other. Intersect with both;
# otherwise fdata2.loc[subjs] raises KeyError for ids missing from data2.
# Sorting makes the row order independent of set and directory-listing order.
subjs = sorted(set(rsfmri_list) & set(smri_list) & set(data2['eid'].values))

# Restrict both structural tables to the selected participants, with 'eid'
# restored as an ordinary column.
fdata1 = data1.set_index('eid')
fdata11 = fdata1.loc[subjs].reset_index()
fdata2 = data2.set_index('eid')
fdata22 = fdata2.loc[subjs].reset_index()

In [13]:
# Load global brain measures: head motion (25741), total brain volume (25005),
# plus the fields listed in global_brain_items.csv (total CSA, average CT, ICV).
cols = ['eid','25741-2.0','25005-2.0']

total_items = pd.read_csv('raw_data/global_brain_items.csv')
cols.extend(str(field_id) + '-2.0' for field_id in total_items.iloc[:, 0])

# Restrict to the participants retained in fdata11, in the same row order.
l = fdata11['eid'].values
temp3 = pd.read_csv(
    '/dbstore/UKBiobank/Data_Download_02JULY2020/ukb42608.csv', usecols=cols
).set_index('eid')
pheno3 = temp3.loc[l].reset_index()

In [14]:
# Load age and sex (columns 31-0.0 and 21003-2.0), keeping only participants
# with both values present.
cols2 = ['eid','31-0.0','21003-2.0']
temp4 = pd.read_csv('/dbstore/ukbb_cognition/ukb669603.csv', usecols=cols2).dropna(how='any')

In [16]:
# Assemble and save the merged data. Every join is an inner join on 'eid', so
# only participants present in all tables remain.
imaging_data = fdata11.merge(fdata22, on='eid', how='inner')   # CSA + CT
co_data = pheno3.merge(temp4, on='eid', how='inner')           # global brain measures + age/sex
tdata = (
    mdata
    .merge(imaging_data, on='eid', how='inner')
    .merge(co_data, on='eid', how='inner')
)

tdata.to_csv('raw_data/merged_data.csv', index=False)