### Data Wrangling

In [95]:
### Loading python libraries for data wrangling
import pandas as pd
import numpy as np
import math
import functools

In [96]:
### Load data paths and data files
data_path ="/data/biobank/biobank_data_January2023_withdrawals_May2023/by_udi/"
# clinical_codes = pd.read_excel("/workspace/home/gandriamiadana/mres_dissertation/clinical_codes.xlsx", sheet_name="Sheet1")
ethnicity_codes = pd.read_csv("/workspace/home/gandriamiadana/mres_dissertation/coding1001.tsv", sep= "\t")
alc_intake_codes = pd.read_csv("/workspace/home/gandriamiadana/mres_dissertation/coding_alc_intake.tsv", sep= "\t")
print(data_path)

/data/biobank/biobank_data_January2023_withdrawals_May2023/by_udi/


In [97]:
## Recruitment
recruitment_datapath = data_path + "f.53.tab"
df_recruitment = pd.read_csv(recruitment_datapath, sep = "\t")
df_recruitment.rename(columns={'f.53.0.0': 'recruitment_date'}, inplace=True) 
df_recruitment['recruitment_date'] = pd.to_datetime(df_recruitment['recruitment_date'])

cols = [2,3,4]
df_recruitment.drop(df_recruitment.columns[cols],axis=1,inplace=True)

df_recruitment.head()

Unnamed: 0,f.eid,recruitment_date
0,1000018,2009-05-29
1,1000020,2008-10-10
2,1000034,2007-08-17
3,1000041,2006-06-12
4,1000056,2008-06-19


In [98]:
ethnicity_codes.head()
dict_ethnicity = ethnicity_codes.set_index('coding').meaning.to_dict()

In [99]:
## Sex 
sex_datapath = data_path + "f.31.tab"
df_sex = pd.read_csv(sex_datapath, sep = "\t")
df_sex.rename(columns={'f.31.0.0': 'sex'}, inplace=True) 
df_sex.head(5)

Unnamed: 0,f.eid,sex
0,1000018,1
1,1000020,1
2,1000034,0
3,1000041,1
4,1000056,0


In [100]:
## Ethnicity
ethnicity_datapath = data_path + "f.21000.tab"
df_ethnicity = pd.read_csv(ethnicity_datapath, sep = "\t")
dict_ethnicity = {
    # -3: 'Prefer not to answer', -1: 'Do not know',  
                  1001: 'White', 1002: 'White', 1003: 'White', 1: 'White',
                  math.nan: "N/A"
                  }
df_ethnicity.rename(columns={'f.21000.0.0': 'eth_at_recruitment', 
                             'f.21000.1.0': 'eth_at_calib_visit',
                             'f.21000.2.0': 'eth_at_imaging_visit1',}, inplace=True) 
df_ethnicity.replace({"eth_at_recruitment": dict_ethnicity}, inplace=True)
df_ethnicity.replace({"eth_at_calib_visit": dict_ethnicity}, inplace=True)
df_ethnicity.replace({"eth_at_imaging_visit1": dict_ethnicity}, inplace=True)

df_ethnicity['ethnicity_at_recruitment'] = np.where(df_ethnicity['eth_at_recruitment'] == "White", 1, 
    np.where(df_ethnicity['eth_at_recruitment'] == "N/A", -1000,
             np.where(df_ethnicity['eth_at_recruitment'] == -1, -1, 
                      np.where(df_ethnicity['eth_at_recruitment'] == -3, -3,
                               0))))
df_ethnicity['ethnicity_at_calib_visit'] = np.where(df_ethnicity['eth_at_calib_visit'] == "White", 1, 
    np.where(df_ethnicity['eth_at_calib_visit'] == "N/A", -1000,
             np.where(df_ethnicity['eth_at_calib_visit'] == -1, -1, 
                      np.where(df_ethnicity['eth_at_calib_visit'] == -3, -3,
                               0))))

df_ethnicity['ethnicity_at_imaging_visit1'] = np.where(df_ethnicity['eth_at_imaging_visit1'] == "White", 1, 
    np.where(df_ethnicity['eth_at_imaging_visit1'] == "N/A", -1000,
             np.where(df_ethnicity['eth_at_imaging_visit1'] == -1, -1, 
                      np.where(df_ethnicity['eth_at_imaging_visit1'] == -3, -3,
                               0))))

cols = [1,2,3]
df_ethnicity.drop(df_ethnicity.columns[cols],axis=1,inplace=True)
df_ethnicity.tail(20)

Unnamed: 0,f.eid,ethnicity_at_recruitment,ethnicity_at_calib_visit,ethnicity_at_imaging_visit1
502348,6026399,1,-1000,-1000
502349,6026403,0,-1000,-1000
502350,6026417,1,-1000,-1000
502351,6026421,1,-1000,-1000
502352,6026430,1,-1000,-1000
502353,6026446,1,-1000,-1000
502354,6026455,1,-1000,-1000
502355,6026469,0,-1000,-1000
502356,6026474,1,-1000,-1000
502357,6026488,1,-1000,-1000


In [101]:
## Deprivation
deprivation_datapath = data_path + "f.189.tab"
df_deprivation = pd.read_csv(deprivation_datapath, sep = "\t")
df_deprivation.rename(columns={'f.189.0.0': 'deprivation_index'}, inplace=True) 
df_deprivation.head()

Unnamed: 0,f.eid,deprivation_index
0,1000018,-3.87966
1,1000020,-4.08378
2,1000034,-2.75832
3,1000041,-0.264075
4,1000056,0.378004


In [102]:
## Education 
Education_datapath = data_path + "f.26414.tab"
df_Education = pd.read_csv(Education_datapath, sep = "\t")
df_Education.rename(columns={'f.26414.0.0': 'education_score'}, inplace=True) 
df_Education.head()

Unnamed: 0,f.eid,education_score
0,1000018,15.06
1,1000020,20.08
2,1000034,1.16
3,1000041,27.82
4,1000056,41.43


In [103]:
## Age
age_path = data_path + "f.21022.tab"
df_age = pd.read_csv(age_path, sep = "\t")
df_age.rename(columns={'f.21022.0.0': 'age_at_recruitment'}, inplace=True) 
df_age.head()

Unnamed: 0,f.eid,age_at_recruitment
0,1000018,53.0
1,1000020,48.0
2,1000034,59.0
3,1000041,70.0
4,1000056,57.0


In [104]:
# 1558
alcohol_intake_datapath = data_path + "f.1558.tab"
# dict_alc_intake = alc_intake_codes.set_index('coding').meaning.to_dict()
df_alcohol_intake = pd.read_csv(alcohol_intake_datapath, sep = "\t")
df_alcohol_intake.rename(columns={'f.1558.0.0': 'alc_intake_at_recruitment', 
                             'f.1558.1.0': 'alc_intake_at_calib_visit',
                             'f.1558.2.0': 'alc_intake_at_imaging_visit1',
                             'f.1558.3.0': 'alc_intake_at_imaging_visit2'}, inplace=True) 
df_alcohol_intake.head()

Unnamed: 0,f.eid,alc_intake_at_recruitment,alc_intake_at_calib_visit,alc_intake_at_imaging_visit1,alc_intake_at_imaging_visit2
0,1000018,3.0,,,
1,1000020,2.0,,,
2,1000034,4.0,,,
3,1000041,3.0,,,
4,1000056,6.0,,,


In [105]:
# 20116, Smoking status
dic = {-3:	"Prefer not to answer", 0:	"Never", 1:	"Previous", 2:	"Current"}
 
smoking_status_datapath = data_path + "f.20116.tab"
df_smoking_status = pd.read_csv(smoking_status_datapath, sep = "\t")
df_smoking_status.rename(columns={'f.20116.0.0': 'smoking_status_at_recruitment', 
                             'f.20116.1.0': 'smoking_status_at_calib_visit',
                             'f.20116.2.0': 'smoking_status_at_imaging_visit1',
                             'f.20116.3.0': 'smoking_status_at_imaging_visit2'}, inplace=True) 
df_smoking_status.head()

Unnamed: 0,f.eid,smoking_status_at_recruitment,smoking_status_at_calib_visit,smoking_status_at_imaging_visit1,smoking_status_at_imaging_visit2
0,1000018,0.0,,,
1,1000020,0.0,,,
2,1000034,2.0,,,
3,1000041,1.0,,,
4,1000056,1.0,,,


In [106]:
# 22040, IPAQ activity group 
## from previous years -- thanks to Cel

physical_activity_datapath = "previous_METS.tsv"
df_physical_activity = pd.read_csv(physical_activity_datapath, sep = "\t") # baseline and imaging
df_physical_activity.drop(df_physical_activity.columns[1],axis=1,inplace=True)
df_physical_activity.rename(columns={
    'mets_0': 'mets_imaging_visit1',
    'mets_2': 'mets_imaging_visit2'}, inplace=True) 
df_physical_activity.head()

Unnamed: 0,f.eid,mets_imaging_visit1,mets_imaging_visit2
1,1000018,1360.0,
2,1000020,6798.0,
3,1000034,2724.0,
4,1000041,0.0,
5,1000056,3066.0,


In [107]:
dfList = [df_recruitment, df_age, df_sex, df_ethnicity, df_Education, df_deprivation, 
          df_alcohol_intake, df_smoking_status, df_physical_activity,]
res = functools.reduce(lambda left, right: pd.merge(left,right,on=['f.eid']), dfList)
res.head(10)

Unnamed: 0,f.eid,recruitment_date,age_at_recruitment,sex,ethnicity_at_recruitment,ethnicity_at_calib_visit,ethnicity_at_imaging_visit1,education_score,deprivation_index,alc_intake_at_recruitment,alc_intake_at_calib_visit,alc_intake_at_imaging_visit1,alc_intake_at_imaging_visit2,smoking_status_at_recruitment,smoking_status_at_calib_visit,smoking_status_at_imaging_visit1,smoking_status_at_imaging_visit2,mets_imaging_visit1,mets_imaging_visit2
0,1000018,2009-05-29,53.0,1,1,-1000,-1000,15.06,-3.87966,3.0,,,,0.0,,,,1360.0,
1,1000020,2008-10-10,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,,,0.0,,,,6798.0,
2,1000034,2007-08-17,59.0,0,1,-1000,-1000,1.16,-2.75832,4.0,,,,2.0,,,,2724.0,
3,1000041,2006-06-12,70.0,1,1,-1000,-1000,27.82,-0.264075,3.0,,,,1.0,,,,0.0,
4,1000056,2008-06-19,57.0,0,1,-1000,-1000,41.43,0.378004,6.0,,,,1.0,,,,3066.0,
5,1000062,2008-12-17,67.0,1,1,-1000,-1000,2.45,-4.04901,1.0,,,,1.0,,,,586.5,
6,1000075,2009-10-01,67.0,1,1,-1000,-1000,3.89,-4.11735,1.0,,,,1.0,,,,5910.0,
7,1000089,2008-07-02,42.0,1,1,-1000,-1000,56.06,3.40003,1.0,,,,2.0,,,,0.0,
8,1000093,2010-03-03,53.0,1,1,-1000,-1000,6.94,-3.95522,4.0,,,,1.0,,,,1026.0,
9,1000104,2008-11-27,59.0,0,1,-1000,-1000,6.18,1.78185,3.0,,,,0.0,,,,1982.0,


In [108]:
# 4056
## Age stroke diagnosed
age_stroke_path = data_path + "f.4056.tab"
df_age_stroke = pd.read_csv(age_stroke_path, sep = "\t")
df_age_stroke.rename(columns={'f.4056.0.0': 'age_stroke_diag_at_recruitment', 
                             'f.4056.1.0': 'age_stroke_diag_at_calib_visit',
                             'f.4056.2.0': 'age_stroke_diag_at_imaging_visit1',
                             'f.4056.3.0': 'age_stroke_diag_at_imaging_visit2'}, inplace=True) 

dict = {math.nan: "N/A"}
df_age_stroke.replace({"age_stroke_diag_at_recruitment": dict}, inplace=True)
df_age_stroke.replace({"age_stroke_diag_at_calib_visit": dict}, inplace=True)
df_age_stroke.replace({"age_stroke_diag_at_imaging_visit1": dict}, inplace=True)
df_age_stroke.replace({"age_stroke_diag_at_imaging_visit2": dict}, inplace=True)

df_age_stroke['age_stroke_yes_no'] = np.where(df_age_stroke['age_stroke_diag_at_recruitment'] != "N/A", 1, 
             np.where(df_age_stroke['age_stroke_diag_at_calib_visit'] != "N/A", 1,
                      np.where(df_age_stroke['age_stroke_diag_at_imaging_visit1'] != "N/A", 1, 
                               np.where(df_age_stroke['age_stroke_diag_at_imaging_visit2'] != "N/A", 1, 0))))

cols = [1,2,3,4]
df_age_stroke.drop(df_age_stroke.columns[cols],axis=1,inplace=True)
df_age_stroke.head() 

Unnamed: 0,f.eid,age_stroke_yes_no
0,1000018,0
1,1000020,0
2,1000034,0
3,1000041,0
4,1000056,0


In [109]:
# 2966
## Age high blood pressure diagnosed
age_HBP_diag_path = data_path + "f.2966.tab"
df_age_HBP_diag = pd.read_csv(age_HBP_diag_path, sep = "\t")
df_age_HBP_diag.rename(columns={'f.2966.0.0': 'age_HBP_diag_at_recruitment', 
                             'f.2966.1.0': 'age_HBP_diag_at_calib_visit',
                             'f.2966.2.0': 'age_HBP_diag_at_imaging_visit1',
                             'f.2966.3.0': 'age_HBP_diag_at_imaging_visit2'}, inplace=True) 
dict = {math.nan: "N/A"}
df_age_HBP_diag.replace({"age_HBP_diag_at_recruitment": dict}, inplace=True)
df_age_HBP_diag.replace({"age_HBP_diag_at_calib_visit": dict}, inplace=True)
df_age_HBP_diag.replace({"age_HBP_diag_at_imaging_visit1": dict}, inplace=True)
df_age_HBP_diag.replace({"age_HBP_diag_at_imaging_visit2": dict}, inplace=True)

df_age_HBP_diag['age_HBP_yes_no'] = np.where(df_age_HBP_diag['age_HBP_diag_at_recruitment'] != "N/A", 1, 
             np.where(df_age_HBP_diag['age_HBP_diag_at_calib_visit'] != "N/A", 1,
                      np.where(df_age_HBP_diag['age_HBP_diag_at_imaging_visit1'] != "N/A", 1, 
                               np.where(df_age_HBP_diag['age_HBP_diag_at_imaging_visit2'] != "N/A", 1, 0))))

cols = [1,2,3,4]
df_age_HBP_diag.drop(df_age_HBP_diag.columns[cols],axis=1,inplace=True)

df_age_HBP_diag.head() 

Unnamed: 0,f.eid,age_HBP_yes_no
0,1000018,0
1,1000020,1
2,1000034,0
3,1000041,1
4,1000056,0


In [110]:
# 2443
## Doctor diabetes diagnosis
dic = {-3:	"Prefer not to answer", 0:	"No", 1:"Yes", -1: "Do not know", math.nan: "N/A"}

diabetes_diag_by_GP_path = data_path + "f.2443.tab"
df_diabetes_diag_by_GP = pd.read_csv(diabetes_diag_by_GP_path, sep = "\t")
df_diabetes_diag_by_GP.rename(columns={'f.2443.0.0': 'diabetes_diag_at_recruitment', 
                             'f.2443.1.0': 'diabetes_diag_at_calib_visit',
                             'f.2443.2.0': 'diabetes_diag_at_imaging_visit1',
                             'f.2443.3.0': 'diabetes_diag_at_imaging_visit2'}, inplace=True) 

df_diabetes_diag_by_GP.replace({"diabetes_diag_at_recruitment": dic}, inplace=True)
df_diabetes_diag_by_GP.replace({"diabetes_diag_at_calib_visit": dic}, inplace=True)
df_diabetes_diag_by_GP.replace({"diabetes_diag_at_imaging_visit1": dic}, inplace=True)
df_diabetes_diag_by_GP.replace({"diabetes_diag_at_imaging_visit2": dic}, inplace=True)

df_diabetes_diag_by_GP['age_diabetes_yes_no'] = np.where(df_diabetes_diag_by_GP['diabetes_diag_at_recruitment'] == "Yes", 1, 
             np.where(df_diabetes_diag_by_GP['diabetes_diag_at_calib_visit'] == "Yes", 1,
                      np.where(df_diabetes_diag_by_GP['diabetes_diag_at_imaging_visit1'] == "Yes", 1, 
                               np.where(df_diabetes_diag_by_GP['diabetes_diag_at_imaging_visit2'] == "Yes", 1, 0))))

cols = [1,2,3,4]
df_diabetes_diag_by_GP.drop(df_diabetes_diag_by_GP.columns[cols],axis=1,inplace=True)
df_diabetes_diag_by_GP.head() 

Unnamed: 0,f.eid,age_diabetes_yes_no
0,1000018,0
1,1000020,0
2,1000034,0
3,1000041,1
4,1000056,0


In [111]:
# 2976
## Age diabetes diagnosed by doctor
age_diabetes_diag_path = data_path + "f.2976.tab"
df_age_diabetes_diag = pd.read_csv(age_diabetes_diag_path, sep = "\t")
df_age_diabetes_diag.rename(columns={
                             'f.2976.0.0': 'age_diabetes_diag_at_recruitment', 
                             'f.2976.1.0': 'age_diabetes_diag_at_calib_visit',
                             'f.2976.2.0': 'age_diabetes_diag_at_imaging_visit1',
                             'f.2976.3.0': 'age_diabetes_diag_at_imaging_visit2'}, inplace=True) 

dict = {math.nan: "N/A"}
df_age_diabetes_diag.replace({"age_diabetes_diag_at_recruitment": dict}, inplace=True)
df_age_diabetes_diag.replace({"age_diabetes_diag_at_calib_visit": dict}, inplace=True)
df_age_diabetes_diag.replace({"age_diabetes_diag_at_imaging_visit1": dict}, inplace=True)
df_age_diabetes_diag.replace({"age_diabetes_diag_at_imaging_visit2": dict}, inplace=True)

df_age_diabetes_diag['age_diabetes_yes_no'] = np.where(df_age_diabetes_diag['age_diabetes_diag_at_recruitment'] != "N/A", 1, 
             np.where(df_age_diabetes_diag['age_diabetes_diag_at_calib_visit'] != "N/A", 1,
                      np.where(df_age_diabetes_diag['age_diabetes_diag_at_imaging_visit1'] != "N/A", 1, 
                               np.where(df_age_diabetes_diag['age_diabetes_diag_at_imaging_visit2'] != "N/A", 1, 0))))

cols = [1,2,3,4]
df_age_diabetes_diag.drop(df_age_diabetes_diag.columns[cols],axis=1,inplace=True)
df_age_diabetes_diag.head() 

Unnamed: 0,f.eid,age_diabetes_yes_no
0,1000018,0
1,1000020,0
2,1000034,0
3,1000041,1
4,1000056,0


In [112]:
# 30750
# Glycated haemoglobin (HbA1c)
HbA1c_path = data_path + "f.30750.tab"
df_HbA1c = pd.read_csv(HbA1c_path, sep = "\t")
df_HbA1c.rename(columns={
                             'f.30750.0.0': 'HbA1c_levels_at_recruitment', 
                             'f.30750.1.0': 'HbA1c_levels_at_calib_visit'}, inplace=True) 

df_HbA1c['has_diabetes_HbA1c'] = np.where(df_HbA1c['HbA1c_levels_at_recruitment'] >48, 1, 
             np.where(df_HbA1c['HbA1c_levels_at_calib_visit'] >48, 1, 0))

cols = [1,2]
df_HbA1c.drop(df_HbA1c.columns[cols],axis=1,inplace=True)
df_HbA1c.head() 

Unnamed: 0,f.eid,has_diabetes_HbA1c
0,1000018,0
1,1000020,0
2,1000034,0
3,1000041,0
4,1000056,0


In [113]:
# 30740
# Glucose
Glucose_path = data_path + "f.30740.tab"
df_Glucose = pd.read_csv(Glucose_path, sep = "\t")
df_Glucose.rename(columns={
                             'f.30740.0.0': 'Glucose_levels_at_recruitment', 
                             'f.30740.1.0': 'Glucose_levels_at_calib_visit'}, inplace=True) 

df_Glucose['has_diabetes_Glucose'] = np.where(df_Glucose['Glucose_levels_at_recruitment'] >11.1, 1, 
             np.where(df_Glucose['Glucose_levels_at_calib_visit'] >11.1, 1, 0))

cols = [1,2]
df_Glucose.drop(df_Glucose.columns[cols],axis=1,inplace=True)
df_Glucose.head() 

Unnamed: 0,f.eid,has_diabetes_Glucose
0,1000018,0
1,1000020,0
2,1000034,0
3,1000041,0
4,1000056,0


In [114]:
# 30690
# Cholesterol
cholesterol_path = data_path + "f.30690.tab"
df_cholesterol = pd.read_csv(cholesterol_path, sep = "\t")
df_cholesterol.rename(columns={
                             'f.30690.0.0': 'cholesterol_levels_at_recruitment', 
                             'f.30690.1.0': 'cholesterol_levels_at_calib_visit'}, inplace=True) 

df_cholesterol['has_high_cholesterol'] = np.where(df_cholesterol['cholesterol_levels_at_recruitment'] >7, 1, 
             np.where(df_cholesterol['cholesterol_levels_at_calib_visit'] >7, 1, 0))

cols = [1,2]
df_cholesterol.drop(df_cholesterol.columns[cols],axis=1,inplace=True)
df_cholesterol.head() 

Unnamed: 0,f.eid,has_high_cholesterol
0,1000018,0
1,1000020,0
2,1000034,1
3,1000041,0
4,1000056,0


In [115]:
# 130814
# Date E78 first reported (disorders of lipoprotein metabolism and other lipidaemias)
date_lipidaemia_path = data_path + "f.130814.tab"
df_date_lipidaemia = pd.read_csv(date_lipidaemia_path, sep = "\t")
df_date_lipidaemia.rename(columns={'f.130814.0.0': 'date_lipidaemia'}, inplace=True) 

df_date_lipidaemia['date_lipidaemia'] = pd.to_datetime(df_date_lipidaemia['date_lipidaemia'])
df_date_lipidaemia.loc[df_date_lipidaemia["date_lipidaemia"] < df_recruitment["recruitment_date"], 
                       "date_lipidaemia"] = pd.NaT

dict = {pd.NaT: "N/A"}
df_date_lipidaemia.replace({"date_lipidaemia": dict}, inplace=True)
df_date_lipidaemia['lipidaemia_yes_no'] = np.where(df_date_lipidaemia['date_lipidaemia'] != "N/A", 1, 0)

df_date_lipidaemia.head(20) 

# df_date_lipidaemia["date_lipidaemia"] > df_recruitment["recruitment_date"]

Unnamed: 0,f.eid,date_lipidaemia,lipidaemia_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,2013-06-24 00:00:00,1
4,1000056,,0
5,1000062,,0
6,1000075,,0
7,1000089,,0
8,1000093,,0
9,1000104,,0


In [116]:
# 131354
# Date I50 first reported (heart failure)
date_HF_path = data_path + "f.131354.tab"
df_date_HF = pd.read_csv(date_HF_path, sep = "\t")
df_date_HF.rename(columns={'f.131354.0.0': 'date_HF'}, inplace=True)

df_date_HF['date_HF'] = pd.to_datetime(df_date_HF['date_HF'])
df_date_HF.loc[df_date_HF["date_HF"] < df_recruitment["recruitment_date"], 
                       "date_HF"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_HF.replace({"date_HF": dict}, inplace=True)
df_date_HF['HF_yes_no'] = np.where(df_date_HF['date_HF'] != "N/A", 1, 0)
# df_date_HF.drop(df_date_HF.columns[1],axis=1,inplace=True)

df_date_HF.head() 

Unnamed: 0,f.eid,date_HF,HF_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,2013-06-24 00:00:00,1
4,1000056,,0


In [117]:
# 131296
# Date I20 first reported (angina pectoris)
date_angina_pec_path = data_path + "f.131296.tab"
df_date_angina_pec = pd.read_csv(date_angina_pec_path, sep = "\t")
df_date_angina_pec.rename(columns={'f.131296.0.0': 'date_angina_pec'}, inplace=True) 


df_date_angina_pec['date_angina_pec'] = pd.to_datetime(df_date_angina_pec['date_angina_pec'])
df_date_angina_pec.loc[df_date_angina_pec["date_angina_pec"] < df_recruitment["recruitment_date"], 
                       "date_angina_pec"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_angina_pec.replace({"date_angina_pec": dict}, inplace=True)
df_date_angina_pec['angina_yes_no'] = np.where(df_date_angina_pec['date_angina_pec'] != "N/A", 1, 0)
# df_date_angina_pec.drop(df_date_angina_pec.columns[1],axis=1,inplace=True)

df_date_angina_pec.head() 

Unnamed: 0,f.eid,date_angina_pec,angina_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [118]:
# 131304
# Date I24 first reported (other acute ischaemic heart diseases)
date_other_iscHD_path = data_path + "f.131304.tab"
df_date_other_iscHD = pd.read_csv(date_other_iscHD_path, sep = "\t")
df_date_other_iscHD.rename(columns={'f.131304.0.0': 'date_other_iscHD'}, inplace=True) 

df_date_other_iscHD['date_other_iscHD'] = pd.to_datetime(df_date_other_iscHD['date_other_iscHD'])
df_date_other_iscHD.loc[df_date_other_iscHD["date_other_iscHD"] < df_recruitment["recruitment_date"], 
                       "date_other_iscHD"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_other_iscHD.replace({"date_other_iscHD": dict}, inplace=True)
df_date_other_iscHD['other_iscHD_yes_no'] = np.where(df_date_other_iscHD['date_other_iscHD'] != "N/A", 1, 0)
# df_date_other_iscHD.drop(df_date_other_iscHD.columns[1],axis=1,inplace=True)

df_date_other_iscHD.head() 
### hier - 20:21

Unnamed: 0,f.eid,date_other_iscHD,other_iscHD_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [119]:
# 131306
# Date I25 first reported (chronic ischaemic heart disease)
date_chronic_iscHD_path = data_path + "f.131306.tab"
df_date_chronic_iscHD = pd.read_csv(date_chronic_iscHD_path, sep = "\t")
df_date_chronic_iscHD.rename(columns={'f.131306.0.0': 'date_chronic_iscHD'}, inplace=True) 

df_date_chronic_iscHD['date_chronic_iscHD'] = pd.to_datetime(df_date_chronic_iscHD['date_chronic_iscHD'])
df_date_chronic_iscHD.loc[df_date_chronic_iscHD["date_chronic_iscHD"] < df_recruitment["recruitment_date"], 
                       "date_chronic_iscHD"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_chronic_iscHD.replace({"date_chronic_iscHD": dict}, inplace=True)
df_date_chronic_iscHD['chronic_iscHD_yes_no'] = np.where(df_date_chronic_iscHD['date_chronic_iscHD'] != "N/A", 1, 0)
# df_date_chronic_iscHD.drop(df_date_chronic_iscHD.columns[1],axis=1,inplace=True)

df_date_chronic_iscHD.head() 

Unnamed: 0,f.eid,date_chronic_iscHD,chronic_iscHD_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,2013-06-24 00:00:00,1
4,1000056,,0


In [120]:
# 131056
# Date G45 first reported (transient cerebral ischaemic attacks and related syndromes)
date_transient_iscHD_path = data_path + "f.131056.tab"
df_date_transient_iscHD = pd.read_csv(date_transient_iscHD_path, sep = "\t")
df_date_transient_iscHD.rename(columns={'f.131056.0.0': 'date_transient_iscHD'}, inplace=True) 

df_date_transient_iscHD['date_transient_iscHD'] = pd.to_datetime(df_date_transient_iscHD['date_transient_iscHD'])
df_date_transient_iscHD.loc[df_date_transient_iscHD["date_transient_iscHD"] < df_recruitment["recruitment_date"], 
                       "date_transient_iscHD"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_transient_iscHD.replace({"date_transient_iscHD": dict}, inplace=True)
df_date_transient_iscHD['transient_iscHD_yes_no'] = np.where(df_date_transient_iscHD['date_transient_iscHD'] != "N/A", 1, 0)
# df_date_transient_iscHD.drop(df_date_transient_iscHD.columns[1],axis=1,inplace=True)

df_date_transient_iscHD.head() 

Unnamed: 0,f.eid,date_transient_iscHD,transient_iscHD_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [121]:
# 3627
## Age angina diagnosed
age_angina_diag_path = data_path + "f.3627.tab"
df_age_angina_diag = pd.read_csv(age_angina_diag_path, sep = "\t")
df_age_angina_diag.rename(columns={'f.3627.0.0': 'age_angina_diag_at_recruitment', 
                             'f.3627.1.0': 'age_angina_diag_at_calib_visit',
                             'f.3627.2.0': 'age_angina_diag_at_imaging_visit1',
                             'f.3627.3.0': 'age_angina_diag_at_imaging_visit2'}, inplace=True) 

dict = {math.nan: "N/A"}
df_age_angina_diag.replace({"age_angina_diag_at_recruitment": dict}, inplace=True)
df_age_angina_diag.replace({"age_angina_diag_at_calib_visit": dict}, inplace=True)
df_age_angina_diag.replace({"age_angina_diag_at_imaging_visit1": dict}, inplace=True)
df_age_angina_diag.replace({"age_angina_diag_at_imaging_visit2": dict}, inplace=True)

df_age_angina_diag['age_angina_yes_no'] = np.where(df_age_angina_diag['age_angina_diag_at_recruitment'] != "N/A", 1, 
             np.where(df_age_angina_diag['age_angina_diag_at_calib_visit'] != "N/A", 1,
                      np.where(df_age_angina_diag['age_angina_diag_at_imaging_visit1'] != "N/A", 1, 
                               np.where(df_age_angina_diag['age_angina_diag_at_imaging_visit2'] != "N/A", 1, 0))))

cols = [1,2,3,4]
df_age_angina_diag.drop(df_age_angina_diag.columns[cols],axis=1,inplace=True)

df_age_angina_diag.head() 

Unnamed: 0,f.eid,age_angina_yes_no
0,1000018,0
1,1000020,0
2,1000034,0
3,1000041,0
4,1000056,0


In [122]:
# 131368
# Date I64 first reported (stroke, not specified as haemorrhage or infarction)
date_stroke_path = data_path + "f.131368.tab"
df_date_stroke_131368 = pd.read_csv(date_stroke_path, sep = "\t")
df_date_stroke_131368.rename(columns={'f.131368.0.0': 'date_stroke'}, inplace=True) 

df_date_stroke_131368['date_stroke'] = pd.to_datetime(df_date_stroke_131368['date_stroke'])
df_date_stroke_131368.loc[df_date_stroke_131368["date_stroke"] < df_recruitment["recruitment_date"], 
                       "date_stroke"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_stroke_131368.replace({"date_stroke": dict}, inplace=True)
df_date_stroke_131368['date_stroke_yes_no'] = np.where(df_date_stroke_131368['date_stroke'] != "N/A", 1, 0)
# df_date_stroke_131368.drop(df_date_stroke_131368.columns[1],axis=1,inplace=True)

df_date_stroke_131368.head() 
# hier - 23:12

Unnamed: 0,f.eid,date_stroke,date_stroke_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [123]:
# 131366
# Date I63 first reported (cerebral infarction)
date_cerebral_infarction_path = data_path + "f.131366.tab"
df_date_cerebral_infarction = pd.read_csv(date_cerebral_infarction_path, sep = "\t")
df_date_cerebral_infarction.rename(columns={'f.131366.0.0': 'date_cerebral_infarction'}, inplace=True) 

df_date_cerebral_infarction['date_cerebral_infarction'] = pd.to_datetime(df_date_cerebral_infarction['date_cerebral_infarction'])
df_date_cerebral_infarction.loc[df_date_cerebral_infarction["date_cerebral_infarction"] < df_recruitment["recruitment_date"], 
                       "date_cerebral_infarction"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_cerebral_infarction.replace({"date_cerebral_infarction": dict}, inplace=True)
df_date_cerebral_infarction['date_cerebral_infarction_yes_no'] = np.where(df_date_cerebral_infarction['date_cerebral_infarction'] != "N/A", 1, 0)
# df_date_cerebral_infarction.drop(df_date_cerebral_infarction.columns[1],axis=1,inplace=True)

df_date_cerebral_infarction.head() 

Unnamed: 0,f.eid,date_cerebral_infarction,date_cerebral_infarction_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [124]:
# 131362
# Date I61 first reported (intracerebral haemorrhage)
date_intracerebral_haemorrhage_path = data_path + "f.131362.tab"
df_date_intracerebral_haemorrhage = pd.read_csv(date_intracerebral_haemorrhage_path, sep = "\t")
df_date_intracerebral_haemorrhage.rename(columns={'f.131362.0.0': 'date_intracerebral_haemorrhage'}, inplace=True) 

df_date_intracerebral_haemorrhage['date_intracerebral_haemorrhage'] = pd.to_datetime(df_date_intracerebral_haemorrhage['date_intracerebral_haemorrhage'])
df_date_intracerebral_haemorrhage.loc[df_date_intracerebral_haemorrhage["date_intracerebral_haemorrhage"] < df_recruitment["recruitment_date"], 
                       "date_intracerebral_haemorrhage"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_intracerebral_haemorrhage.replace({"date_intracerebral_haemorrhage": dict}, inplace=True)
df_date_intracerebral_haemorrhage['date_intracerebral_haemorrhage_yes_no'] = np.where(
    df_date_intracerebral_haemorrhage['date_intracerebral_haemorrhage'] != "N/A", 1, 0)
# df_date_intracerebral_haemorrhage.drop(df_date_intracerebral_haemorrhage.columns[1],axis=1,inplace=True)

df_date_intracerebral_haemorrhage.head() 
# df_date_intracerebral_haemorrhage[df_date_intracerebral_haemorrhage["date_intracerebral_haemorrhage"] != "N/A"]

Unnamed: 0,f.eid,date_intracerebral_haemorrhage,date_intracerebral_haemorrhage_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [125]:
# 131364
# Date I62 first reported (other nontraumatic intracranial haemorrhage)
date_other_intracerebral_haemorrhage_path = data_path + "f.131364.tab"
df_date_other_intracerebral_haemorrhage = pd.read_csv(date_other_intracerebral_haemorrhage_path, sep = "\t")
df_date_other_intracerebral_haemorrhage.rename(columns={'f.131364.0.0': 'date_other_intracerebral_haemorrhage'}, inplace=True) 

df_date_other_intracerebral_haemorrhage['date_other_intracerebral_haemorrhage'] = pd.to_datetime(df_date_other_intracerebral_haemorrhage['date_other_intracerebral_haemorrhage'])
df_date_other_intracerebral_haemorrhage.loc[df_date_other_intracerebral_haemorrhage["date_other_intracerebral_haemorrhage"] < df_recruitment["recruitment_date"], 
                       "date_other_intracerebral_haemorrhage"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_other_intracerebral_haemorrhage.replace({"date_other_intracerebral_haemorrhage": dict}, inplace=True)
df_date_other_intracerebral_haemorrhage['date_other_intracerebral_haemorrhage_yes_no'] = np.where(
    df_date_other_intracerebral_haemorrhage['date_other_intracerebral_haemorrhage'] != "N/A", 1, 0)

df_date_other_intracerebral_haemorrhage.head() 
# df_date_other_intracerebral_haemorrhage[df_date_other_intracerebral_haemorrhage["date_other_intracerebral_haemorrhage"] != "N/A"]

Unnamed: 0,f.eid,date_other_intracerebral_haemorrhage,date_other_intracerebral_haemorrhage_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [126]:
# 131286
# Date I10 first reported (essential (primary) hypertension)
date_hypertension_path = data_path + "f.131286.tab"
df_date_hypertension = pd.read_csv(date_hypertension_path, sep = "\t")
df_date_hypertension.rename(columns={'f.131286.0.0': 'date_hypertension'}, inplace=True) 

df_date_hypertension['date_hypertension'] = pd.to_datetime(df_date_hypertension['date_hypertension'])
df_date_hypertension.loc[df_date_hypertension["date_hypertension"] < df_recruitment["recruitment_date"], 
                       "date_hypertension"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_hypertension.replace({"date_hypertension": dict}, inplace=True)
df_date_hypertension['date_hypertension_yes_no'] = np.where(
    df_date_hypertension['date_hypertension'] != "N/A", 1, 0)
# df_date_hypertension.drop(df_date_hypertension.columns[1],axis=1,inplace=True)

df_date_hypertension.head(6) 

Unnamed: 0,f.eid,date_hypertension,date_hypertension_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0
5,1000062,2019-10-29 00:00:00,1


In [127]:
# 131338
# Date I42 first reported (cardiomyopathy)

date_cardiomyopathy_path = data_path + "f.131338.tab"
df_date_cardiomyopathy = pd.read_csv(date_cardiomyopathy_path, sep = "\t")
df_date_cardiomyopathy.rename(columns={'f.131338.0.0': 'date_cardiomyopathy'}, inplace=True) 

df_date_cardiomyopathy['date_cardiomyopathy'] = pd.to_datetime(df_date_cardiomyopathy['date_cardiomyopathy'])
df_date_cardiomyopathy.loc[df_date_cardiomyopathy["date_cardiomyopathy"] < df_recruitment["recruitment_date"], 
                       "date_cardiomyopathy"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_cardiomyopathy.replace({"date_cardiomyopathy": dict}, inplace=True)
df_date_cardiomyopathy['date_cardiomyopathy_yes_no'] = np.where(
    df_date_cardiomyopathy['date_cardiomyopathy'] != "N/A", 1, 0)
# df_date_cardiomyopathy.drop(df_date_cardiomyopathy.columns[1],axis=1,inplace=True)

df_date_cardiomyopathy.head()

Unnamed: 0,f.eid,date_cardiomyopathy,date_cardiomyopathy_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [128]:
# 131340
# Date I43 first reported (cardiomyopathy in diseases classified elsewhere)

date_cardiomyopathy_elsewhere_path = data_path + "f.131340.tab"
df_date_cardiomyopathy_elsewhere = pd.read_csv(date_cardiomyopathy_elsewhere_path, sep = "\t")
df_date_cardiomyopathy_elsewhere.rename(columns={'f.131340.0.0': 'date_cardiomyopathy_elsewhere'}, inplace=True) 

df_date_cardiomyopathy_elsewhere['date_cardiomyopathy_elsewhere'] = pd.to_datetime(df_date_cardiomyopathy_elsewhere['date_cardiomyopathy_elsewhere'])
df_date_cardiomyopathy_elsewhere.loc[df_date_cardiomyopathy_elsewhere["date_cardiomyopathy_elsewhere"] < df_recruitment["recruitment_date"], 
                       "date_cardiomyopathy_elsewhere"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_cardiomyopathy_elsewhere.replace({"date_cardiomyopathy_elsewhere": dict}, inplace=True)
df_date_cardiomyopathy_elsewhere['date_cardiomyopathy_elsewhere_yes_no'] = np.where(
    df_date_cardiomyopathy_elsewhere['date_cardiomyopathy_elsewhere'] != "N/A", 1, 0)
# df_date_cardiomyopathy_elsewhere.drop(df_date_cardiomyopathy_elsewhere.columns[1],axis=1,inplace=True)

df_date_cardiomyopathy_elsewhere.head() 

Unnamed: 0,f.eid,date_cardiomyopathy_elsewhere,date_cardiomyopathy_elsewhere_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [129]:
# 131288
# Date I11 first reported (hypertensive heart disease)

date_hypertensive_HD_path = data_path + "f.131288.tab"
df_date_hypertensive_HD = pd.read_csv(date_hypertensive_HD_path, sep = "\t")
df_date_hypertensive_HD.rename(columns={'f.131288.0.0': 'date_hypertensive_HD'}, inplace=True) 

df_date_hypertensive_HD['date_hypertensive_HD'] = pd.to_datetime(df_date_hypertensive_HD['date_hypertensive_HD'])
df_date_hypertensive_HD.loc[df_date_hypertensive_HD["date_hypertensive_HD"] < df_recruitment["recruitment_date"], 
                       "date_hypertensive_HD"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_hypertensive_HD.replace({"date_hypertensive_HD": dict}, inplace=True)
df_date_hypertensive_HD['date_hypertensive_HD_yes_no'] = np.where(
    df_date_hypertensive_HD['date_hypertensive_HD'] != "N/A", 1, 0)
# df_date_hypertensive_HD.drop(df_date_hypertensive_HD.columns[1],axis=1,inplace=True)

df_date_hypertensive_HD.head() 

Unnamed: 0,f.eid,date_hypertensive_HD,date_hypertensive_HD_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [130]:
# 131292
# Date I13 first reported (hypertensive heart and renal disease)

date_hypertensive_HD_and_RD_path = data_path + "f.131288.tab"
df_date_hypertensive_HD_and_RD = pd.read_csv(date_hypertensive_HD_and_RD_path, sep = "\t")
df_date_hypertensive_HD_and_RD.rename(columns={'f.131288.0.0': 'date_hypertensive_HD_and_RD'}, inplace=True) 

df_date_hypertensive_HD_and_RD['date_hypertensive_HD_and_RD'] = pd.to_datetime(df_date_hypertensive_HD_and_RD['date_hypertensive_HD_and_RD'])
df_date_hypertensive_HD_and_RD.loc[df_date_hypertensive_HD_and_RD["date_hypertensive_HD_and_RD"] < df_recruitment["recruitment_date"], 
                       "date_hypertensive_HD_and_RD"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_hypertensive_HD_and_RD.replace({"date_hypertensive_HD_and_RD": dict}, inplace=True)
df_date_hypertensive_HD_and_RD['date_hypertensive_HD_and_RD_yes_no'] = np.where(
    df_date_hypertensive_HD_and_RD['date_hypertensive_HD_and_RD'] != "N/A", 1, 0)
# df_date_hypertensive_HD_and_RD.drop(df_date_hypertensive_HD_and_RD.columns[1],axis=1,inplace=True)

df_date_hypertensive_HD_and_RD.head() 
# df_date_hypertensive_HD_and_RD[df_date_hypertensive_HD_and_RD["date_hypertensive_HD_and_RD"] != "N/A"] 

Unnamed: 0,f.eid,date_hypertensive_HD_and_RD,date_hypertensive_HD_and_RD_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [131]:
# 42006
# Date of stroke

date_stroke_path_42006 = data_path + "f.42006.tab"
df_date_stroke_42006 = pd.read_csv(date_stroke_path_42006, sep = "\t")
df_date_stroke_42006.rename(columns={'f.42006.0.0': 'date_stroke_42006'}, inplace=True) 

df_date_stroke_42006['date_stroke_42006'] = pd.to_datetime(df_date_stroke_42006['date_stroke_42006'])
df_date_stroke_42006.loc[df_date_stroke_42006["date_stroke_42006"] < df_recruitment["recruitment_date"], 
                       "date_stroke_42006"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_stroke_42006.replace({"date_stroke_42006": dict}, inplace=True)
df_date_stroke_42006['date_stroke_42006_yes_no'] = np.where(
    df_date_stroke_42006['date_stroke_42006'] != "N/A", 1, 0)
# df_date_stroke_42006.drop(df_date_stroke_42006.columns[1],axis=1,inplace=True)

df_date_stroke_42006.head() 

Unnamed: 0,f.eid,date_stroke_42006,date_stroke_42006_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [132]:
# 42008
# Date of ischaemic stroke

date_isch_stroke_path_42008 = data_path + "f.42008.tab"
df_date_isch_stroke_42008 = pd.read_csv(date_isch_stroke_path_42008, sep = "\t")
df_date_isch_stroke_42008.rename(columns={'f.42008.0.0': 'date_isch_stroke_42008'}, inplace=True) 

df_date_isch_stroke_42008['date_isch_stroke_42008'] = pd.to_datetime(df_date_isch_stroke_42008['date_isch_stroke_42008'])
df_date_isch_stroke_42008.loc[df_date_isch_stroke_42008["date_isch_stroke_42008"] < df_recruitment["recruitment_date"], 
                       "date_isch_stroke_42008"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_isch_stroke_42008.replace({"date_isch_stroke_42008": dict}, inplace=True)
df_date_isch_stroke_42008['date_isch_stroke_42008_yes_no'] = np.where(
    df_date_isch_stroke_42008['date_isch_stroke_42008'] != "N/A", 1, 0)
# df_date_isch_stroke_42008.drop(df_date_isch_stroke_42008.columns[1],axis=1,inplace=True)

df_date_isch_stroke_42008.head() 

Unnamed: 0,f.eid,date_isch_stroke_42008,date_isch_stroke_42008_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [133]:
# 42010
# Date of intracerebral haemorrhage
 
date_intracerebral_haemorrhage_path_42010 = data_path + "f.42010.tab"
df_date_intracerebral_haemorrhage_42010 = pd.read_csv(date_intracerebral_haemorrhage_path_42010, sep = "\t")
df_date_intracerebral_haemorrhage_42010.rename(columns={'f.42010.0.0': 'date_intracerebral_haemorrhage_42010'}, inplace=True) 

df_date_intracerebral_haemorrhage_42010['date_intracerebral_haemorrhage_42010'] = pd.to_datetime(df_date_intracerebral_haemorrhage_42010['date_intracerebral_haemorrhage_42010'])
df_date_intracerebral_haemorrhage_42010.loc[df_date_intracerebral_haemorrhage_42010["date_intracerebral_haemorrhage_42010"] < df_recruitment["recruitment_date"], 
                       "date_intracerebral_haemorrhage_42010"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_intracerebral_haemorrhage_42010.replace({"date_intracerebral_haemorrhage_42010": dict}, inplace=True)
df_date_intracerebral_haemorrhage_42010['date_intracerebral_haemorrhage_42010_yes_no'] = np.where(
    df_date_intracerebral_haemorrhage_42010['date_intracerebral_haemorrhage_42010'] != "N/A", 1, 0)
# df_date_intracerebral_haemorrhage_42010.drop(df_date_intracerebral_haemorrhage_42010.columns[1],axis=1,inplace=True)

df_date_intracerebral_haemorrhage_42010.head() 

Unnamed: 0,f.eid,date_intracerebral_haemorrhage_42010,date_intracerebral_haemorrhage_42010_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [134]:
# 42000
# Date of myocardial infarction 
date_myo_infarction_path = data_path + "f.42000.tab"
df_date_myo_infarction = pd.read_csv(date_myo_infarction_path, sep = "\t")
df_date_myo_infarction.rename(columns={'f.42000.0.0': 'date_myo_infarction_HA'}, inplace=True) 

df_date_myo_infarction['date_myo_infarction_HA'] = pd.to_datetime(df_date_myo_infarction['date_myo_infarction_HA'])
df_date_myo_infarction.loc[df_date_myo_infarction["date_myo_infarction_HA"] < df_recruitment["recruitment_date"], 
                       "date_myo_infarction_HA"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_myo_infarction.replace({"date_myo_infarction_HA": dict}, inplace=True)
df_date_myo_infarction['date_myo_infarction_iscHD_yes_no'] = np.where(df_date_myo_infarction['date_myo_infarction_HA'] != "N/A", 1, 0)
# df_date_myo_infarction.drop(df_date_myo_infarction.columns[1],axis=1,inplace=True)

df_date_myo_infarction.head() 

Unnamed: 0,f.eid,date_myo_infarction_HA,date_myo_infarction_iscHD_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,2013-06-24 00:00:00,1
4,1000056,,0


In [135]:
# 131298
# Date I21 first reported (acute myocardial infarction/acute_HA)
date_acute_HA_path = data_path + "f.131298.tab"
df_date_acute_HA = pd.read_csv(date_acute_HA_path, sep = "\t")
df_date_acute_HA.rename(columns={'f.131298.0.0': 'date_acute_HA'}, inplace=True) 

df_date_acute_HA['date_acute_HA'] = pd.to_datetime(df_date_acute_HA['date_acute_HA'])
df_date_acute_HA.loc[df_date_acute_HA["date_acute_HA"] < df_recruitment["recruitment_date"], 
                       "date_acute_HA"] = pd.NaT

dict = {math.nan: "N/A"}
df_date_acute_HA.replace({"date_acute_HA": dict}, inplace=True)
df_date_acute_HA['date_acute_HA_yes_no'] = np.where(df_date_acute_HA['date_acute_HA'] != "N/A", 1, 0)
# df_date_acute_HA.drop(df_date_acute_HA.columns[1],axis=1,inplace=True)

df_date_acute_HA.head() 

Unnamed: 0,f.eid,date_acute_HA,date_acute_HA_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,2013-06-24 00:00:00,1
4,1000056,,0


In [136]:
# 131300
# Date I22 first reported (subsequent myocardial infarction/subseq_HA)
date_subseq_HA = data_path + "f.131300.tab"
df_date_subseq_HA = pd.read_csv(date_subseq_HA, sep = "\t")
df_date_subseq_HA.rename(columns={'f.131300.0.0': 'date_subseq_HA'}, inplace=True) 

df_date_subseq_HA['date_subseq_HA'] = pd.to_datetime(df_date_subseq_HA['date_subseq_HA'])
df_date_subseq_HA.loc[df_date_subseq_HA["date_subseq_HA"] < df_recruitment["recruitment_date"], 
                       "date_subseq_HA"] = pd.NaT

df_date_subseq_HA.replace(np.nan, "N/A", inplace = True)
df_date_subseq_HA['date_subseq_HA_yes_no'] = np.where(
    df_date_subseq_HA['date_subseq_HA'] != "N/A", 1, 0)
# df_date_subseq_HA.drop(df_date_subseq_HA.columns[1],axis=1,inplace=True)

df_date_subseq_HA.head() 

Unnamed: 0,f.eid,date_subseq_HA,date_subseq_HA_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [137]:
# 131302
# Date I23 first reported (certain current complications following acute myocardial infarction)
date_complications_after_HA = data_path + "f.131302.tab"
df_date_complications_after_HA = pd.read_csv(date_complications_after_HA, sep = "\t")
df_date_complications_after_HA.rename(columns={'f.131302.0.0': 'date_complications_after_HA'}, inplace=True) 

df_date_complications_after_HA['date_complications_after_HA'] = pd.to_datetime(df_date_complications_after_HA['date_complications_after_HA'])
df_date_complications_after_HA.loc[df_date_complications_after_HA["date_complications_after_HA"] < df_recruitment["recruitment_date"], 
                       "date_complications_after_HA"] = pd.NaT

df_date_complications_after_HA.replace(np.nan, "N/A", inplace = True)
df_date_complications_after_HA['date_complications_after_HA_yes_no'] = np.where(
    df_date_complications_after_HA['date_complications_after_HA'] != "N/A", 1, 0)
# df_date_complications_after_HA.drop(df_date_complications_after_HA.columns[1],axis=1,inplace=True)

df_date_complications_after_HA.head() 

Unnamed: 0,f.eid,date_complications_after_HA,date_complications_after_HA_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [138]:
# 6153:3, Medication for diabetes

meds_diabetes_path_6153 = "long_df_medications_6153.tsv" # data_path + "f.6153.tab"
df_meds_diabetes_6153 = pd.read_csv(meds_diabetes_path_6153, sep = "\t")
df_meds_diabetes_6153.replace(np.nan, "N/A", inplace = True)

df_meds_diabetes_6153['6153_diabetes_meds_yes_no'] = np.where(
    df_meds_diabetes_6153['meaning'].str.contains('Insulin', regex=True), 1, 0)

cols = range(1,4)
df_meds_diabetes_6153.drop(df_meds_diabetes_6153.columns[cols],axis=1,inplace=True)
df_meds_diabetes_6153.head() 
# df_meds_diabetes_6153[df_meds_diabetes_6153['meaning'].str.contains('Insulin', regex=True)]

Unnamed: 0,f.eid,6153_diabetes_meds_yes_no
1,1000018,0
2,1000020,0
3,1000034,0
4,1000041,0
5,1000056,0


In [139]:
# 6177:3, Medication for diabetes

meds_diabetes_path_6177 = "long_df_medications_6177.tsv" # data_path + "f.6177.tab"
df_meds_diabetes_6177 = pd.read_csv(meds_diabetes_path_6177, sep = "\t")
df_meds_diabetes_6177.replace(np.nan, "N/A", inplace = True)

df_meds_diabetes_6177['6177_diabetes_meds_yes_no'] = np.where(
    df_meds_diabetes_6177['meaning'].str.contains('Insulin', regex=True), 1, 0)

cols = range(1,4)
df_meds_diabetes_6177.drop(df_meds_diabetes_6177.columns[cols],axis=1,inplace=True)
df_meds_diabetes_6177.head() 
# df_meds_diabetes_6177[df_meds_diabetes_6177['meaning'].str.contains('Insulin', regex=True)]

Unnamed: 0,f.eid,6177_diabetes_meds_yes_no
1,1000018,0
2,1000020,0
3,1000020,0
4,1000034,0
5,1000041,0


In [140]:
# 6153:2, Medication for BP

meds_BP_path_6153 = "long_df_medications_6153.tsv" # data_path + "f.6153.tab"
df_BP_meds_6153 = pd.read_csv(meds_BP_path_6153, sep = "\t")
df_BP_meds_6153.replace(np.nan, "N/A", inplace = True)

df_BP_meds_6153['6153_diabetes_meds_yes_no'] = np.where(
    df_BP_meds_6153['meaning'].str.contains('Blood pressure medication', regex=True), 1, 0)

cols = range(1,4)
df_BP_meds_6153.drop(df_BP_meds_6153.columns[cols],axis=1,inplace=True)
df_BP_meds_6153.head() 
# df_BP_meds_6153[df_BP_meds_6153['meaning'].str.contains('Blood pressure medication', regex=True)]

Unnamed: 0,f.eid,6153_diabetes_meds_yes_no
1,1000018,0
2,1000020,0
3,1000034,0
4,1000041,0
5,1000056,0


In [141]:
# 6177:2, Medication for BP

meds_cholesterol_path_6177 = "long_df_medications_6177.tsv" # data_path + "f.6177.tab"
df_BP_meds_6177 = pd.read_csv(meds_cholesterol_path_6177, sep = "\t")
df_BP_meds_6177.replace(np.nan, "N/A", inplace = True)

df_BP_meds_6177['6177_BP_meds_yes_no'] = np.where(
    df_BP_meds_6177['meaning'].str.contains('Blood pressure', regex=True), 1, 0)

cols = range(1,4)
df_BP_meds_6177.drop(df_BP_meds_6177.columns[cols],axis=1,inplace=True)
df_BP_meds_6177.head() 

Unnamed: 0,f.eid,6177_BP_meds_yes_no
1,1000018,0
2,1000020,0
3,1000020,1
4,1000034,0
5,1000041,0


In [142]:
# 6153:2, Medication for High cholesterol

meds_cholesterol_path_6153 = "long_df_medications_6153.tsv" # data_path + "f.6153.tab"
df_meds_cholesterol_6153 = pd.read_csv(meds_cholesterol_path_6153, sep = "\t")
df_meds_cholesterol_6153.replace(np.nan, "N/A", inplace = True)

df_meds_cholesterol_6153['6153_high_cholesterol_meds_yes_no'] = np.where(
    df_meds_cholesterol_6153['meaning'].str.contains('Cholesterol', regex=True), 1, 0)

cols = range(1,4)
df_meds_cholesterol_6153.drop(df_meds_cholesterol_6153.columns[cols],axis=1,inplace=True)
df_meds_cholesterol_6153.head() 
# df_meds_cholesterol_6153[df_meds_cholesterol_6153['meaning'].str.contains('Cholesterol', regex=True)]

Unnamed: 0,f.eid,6153_high_cholesterol_meds_yes_no
1,1000018,0
2,1000020,0
3,1000034,0
4,1000041,0
5,1000056,0


In [143]:
# 6177:1, Medication for High cholesterol

meds_cholesterol_path_6177 = "long_df_medications_6177.tsv" # data_path + "f.6177.tab"
df_meds_cholesterol_6177 = pd.read_csv(meds_cholesterol_path_6177, sep = "\t")
df_meds_cholesterol_6177.replace(np.nan, "N/A", inplace = True)

df_meds_cholesterol_6177['6177_cholesterol_meds_yes_no'] = np.where(
    df_meds_cholesterol_6177['meaning'].str.contains('Cholesterol', regex=True), 1, 0)

cols = range(1,4)
df_meds_cholesterol_6177.drop(df_meds_cholesterol_6177.columns[cols],axis=1,inplace=True)
df_meds_cholesterol_6177.head() 
# df_meds_cholesterol_6177[df_meds_cholesterol_6177['meaning'].str.contains('Cholesterol', regex=True)]

Unnamed: 0,f.eid,6177_cholesterol_meds_yes_no
1,1000018,0
2,1000020,1
3,1000020,0
4,1000034,0
5,1000041,0


In [144]:
# 6150:4, Vascular/heart problems diagnosed by doctor -- High blood pressure

HBP_path_6150 = "long_df_diagnoses_6150.tsv" # data_path + "f.6177.tab"
df_HBP_6150 = pd.read_csv(HBP_path_6150, sep = "\t")
df_HBP_6150.replace(np.nan, "N/A", inplace = True)

df_HBP_6150['6150_BP_diag_yes_no'] = np.where(
    df_HBP_6150['meaning'].str.contains('blood pressure', regex=True), 1, 0)

cols = range(1,4)
df_HBP_6150.drop(df_HBP_6150.columns[cols],axis=1,inplace=True)
df_HBP_6150.head() 

Unnamed: 0,f.eid,6150_BP_diag_yes_no
1,1000018,0
2,1000020,1
3,1000034,0
4,1000041,1
5,1000056,0


In [145]:
# 6150:2, Vascular/heart problems diagnosed by doctor -- Angina

angina_path_6150 = "long_df_diagnoses_6150.tsv" # data_path + "f.6177.tab"
df_angina_6150 = pd.read_csv(angina_path_6150, sep = "\t")
df_angina_6150.replace(np.nan, "N/A", inplace = True)

df_angina_6150['6150_angina_diag_yes_no'] = np.where(
    df_angina_6150['meaning'].str.contains('Angina', regex=True), 1, 0)

cols = range(1,4)
df_angina_6150.drop(df_angina_6150.columns[cols],axis=1,inplace=True)
df_angina_6150.head() 
# df_angina_6150[df_angina_6150['meaning'].str.contains('Angina', regex=True)]

Unnamed: 0,f.eid,6150_angina_diag_yes_no
1,1000018,0
2,1000020,0
3,1000034,0
4,1000041,0
5,1000056,0


In [146]:
# 6150:3, Vascular/heart problems diagnosed by doctor -- Stroke

stroke_path_6150 = "long_df_diagnoses_6150.tsv" # data_path + "f.6177.tab"
df_stroke_6150 = pd.read_csv(stroke_path_6150, sep = "\t")
df_stroke_6150.replace(np.nan, "N/A", inplace = True)

df_stroke_6150['6150_stroke_diag_yes_no'] = np.where(
    df_stroke_6150['meaning'].str.contains('Stroke', regex=True), 1, 0)

cols = range(1,4)
df_stroke_6150.drop(df_stroke_6150.columns[cols],axis=1,inplace=True)
df_stroke_6150.head() 
# df_stroke_6150[df_stroke_6150['meaning'].str.contains('Stroke', regex=True)]

Unnamed: 0,f.eid,6150_stroke_diag_yes_no
1,1000018,0
2,1000020,0
3,1000034,0
4,1000041,0
5,1000056,0


In [147]:
# 6150:1, Vascular/heart problems diagnosed by doctor -- HA

HA_path_6150 = "long_df_diagnoses_6150.tsv" # data_path + "f.6177.tab"
df_HA_6150 = pd.read_csv(HA_path_6150, sep = "\t")
df_HA_6150.replace(np.nan, "N/A", inplace = True)

df_HA_6150['6150_HA_yes_no'] = np.where(
    df_HA_6150['meaning'].str.contains('Heart attack', regex=True), 1, 0)

cols = range(1,4)
df_HA_6150.drop(df_HA_6150.columns[cols],axis=1,inplace=True)
df_HA_6150.head() 
# df_HA_6150[df_HA_6150['meaning'].str.contains('Heart attack', regex=True)]

Unnamed: 0,f.eid,6150_HA_yes_no
1,1000018,0
2,1000020,0
3,1000034,0
4,1000041,0
5,1000056,0


In [148]:
# 130706
# Date E10 first reported (insulin-dependent diabetes mellitus)
date_insulin_diabetes_path = data_path + "f.130706.tab"
df_date_insulin_diabetes = pd.read_csv(date_insulin_diabetes_path, sep = "\t")
df_date_insulin_diabetes.rename(columns={'f.130706.0.0': 'date_insulin_diabetes'}, inplace=True) 

df_date_insulin_diabetes['date_insulin_diabetes'] = pd.to_datetime(df_date_insulin_diabetes['date_insulin_diabetes'])
df_date_insulin_diabetes.loc[df_date_insulin_diabetes["date_insulin_diabetes"] < df_recruitment["recruitment_date"], 
                       "date_insulin_diabetes"] = pd.NaT

df_date_insulin_diabetes.replace(np.nan, "N/A", inplace = True)
df_date_insulin_diabetes['date_insulin_diabetes_yes_no'] = np.where(
    df_date_insulin_diabetes['date_insulin_diabetes'] != "N/A", 1, 0)
df_date_insulin_diabetes.head() 
# df_date_insulin_diabetes[df_date_insulin_diabetes['date_insulin_diabetes'] !='N/A']

Unnamed: 0,f.eid,date_insulin_diabetes,date_insulin_diabetes_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [149]:
# 130710
# Date E12 first reported (malnutrition-related diabetes mellitus)
date_malnutrition_diabetes_path = data_path + "f.130710.tab"
df_date_malnutrition_diabetes = pd.read_csv(date_malnutrition_diabetes_path, sep = "\t")
df_date_malnutrition_diabetes.rename(columns={'f.130710.0.0': 'date_malnutrition_diabetes'}, inplace=True) 

df_date_malnutrition_diabetes['date_malnutrition_diabetes'] = pd.to_datetime(df_date_malnutrition_diabetes['date_malnutrition_diabetes'])
df_date_malnutrition_diabetes.loc[df_date_malnutrition_diabetes["date_malnutrition_diabetes"] < df_recruitment["recruitment_date"], 
                       "date_malnutrition_diabetes"] = pd.NaT

df_date_malnutrition_diabetes.replace(np.nan, "N/A", inplace = True)
df_date_malnutrition_diabetes['date_malnutrition_diabetes_yes_no'] = np.where(
    df_date_malnutrition_diabetes['date_malnutrition_diabetes'] != "N/A", 1, 0)
# df_date_malnutrition_diabetes.drop(df_date_malnutrition_diabetes.columns[1],axis=1,inplace=True)

df_date_malnutrition_diabetes.head() 

Unnamed: 0,f.eid,date_malnutrition_diabetes,date_malnutrition_diabetes_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [150]:
# 130708
# Date E11 first reported (non-insulin-dependent diabetes mellitus)
date_non_insulin_diabetes_path = data_path + "f.130708.tab"
df_date_non_insulin_diabetes = pd.read_csv(date_non_insulin_diabetes_path, sep = "\t")
df_date_non_insulin_diabetes.rename(columns={'f.130708.0.0': 'date_non_insulin_diabetes'}, inplace=True) 

df_date_non_insulin_diabetes['date_non_insulin_diabetes'] = pd.to_datetime(df_date_non_insulin_diabetes['date_non_insulin_diabetes'])
df_date_non_insulin_diabetes.loc[df_date_non_insulin_diabetes["date_non_insulin_diabetes"] < df_recruitment["recruitment_date"], 
                       "date_non_insulin_diabetes"] = pd.NaT

df_date_non_insulin_diabetes.replace(np.nan, "N/A", inplace = True)
df_date_non_insulin_diabetes['date_non_insulin_diabetes_yes_no'] = np.where(
    df_date_non_insulin_diabetes['date_non_insulin_diabetes'] != "N/A", 1, 0)
# df_date_non_insulin_diabetes.drop(df_date_non_insulin_diabetes.columns[1],axis=1,inplace=True)

df_date_non_insulin_diabetes.head() 

Unnamed: 0,f.eid,date_non_insulin_diabetes,date_non_insulin_diabetes_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,2015-02-26 00:00:00,1


In [151]:
# 130712
# Date E13 first reported (other specified diabetes mellitus)
date_other_spec_diabetes_path = data_path + "f.130712.tab"
df_date_other_spec_diabetes = pd.read_csv(date_other_spec_diabetes_path, sep = "\t")
df_date_other_spec_diabetes.rename(columns={'f.130712.0.0': 'date_other_spec_diabetes'}, inplace=True) 

df_date_other_spec_diabetes['date_other_spec_diabetes'] = pd.to_datetime(df_date_other_spec_diabetes['date_other_spec_diabetes'])
df_date_other_spec_diabetes.loc[df_date_other_spec_diabetes["date_other_spec_diabetes"] < df_recruitment["recruitment_date"], 
                       "date_other_spec_diabetes"] = pd.NaT

df_date_other_spec_diabetes.replace(np.nan, "N/A", inplace = True)
df_date_other_spec_diabetes['date_other_spec_diabetes_yes_no'] = np.where(
    df_date_other_spec_diabetes['date_other_spec_diabetes'] != "N/A", 1, 0)
# df_date_other_spec_diabetes.drop(df_date_other_spec_diabetes.columns[1],axis=1,inplace=True)

df_date_other_spec_diabetes.head() 

Unnamed: 0,f.eid,date_other_spec_diabetes,date_other_spec_diabetes_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [152]:
# 130714
# Date E14 first reported (unspecified diabetes mellitus)
date_other_unspec_diabetes_path = data_path + "f.130714.tab"
df_date_other_unspec_diabetes = pd.read_csv(date_other_unspec_diabetes_path, sep = "\t")
df_date_other_unspec_diabetes.rename(columns={'f.130714.0.0': 'date_other_unspec_diabetes'}, inplace=True) 

df_date_other_unspec_diabetes['date_other_unspec_diabetes'] = pd.to_datetime(df_date_other_unspec_diabetes['date_other_unspec_diabetes'])
df_date_other_unspec_diabetes.loc[df_date_other_unspec_diabetes["date_other_unspec_diabetes"] < df_recruitment["recruitment_date"], 
                       "date_other_unspec_diabetes"] = pd.NaT

df_date_other_unspec_diabetes.replace(np.nan, "N/A", inplace = True)
df_date_other_unspec_diabetes['date_other_unspec_diabetes_yes_no'] = np.where(
    df_date_other_unspec_diabetes['date_other_unspec_diabetes'] != "N/A", 1, 0)
# df_date_other_unspec_diabetes.drop(df_date_other_unspec_diabetes.columns[1],axis=1,inplace=True)

df_date_other_unspec_diabetes.head() 

Unnamed: 0,f.eid,date_other_unspec_diabetes,date_other_unspec_diabetes_yes_no
0,1000018,,0
1,1000020,,0
2,1000034,,0
3,1000041,,0
4,1000056,,0


In [153]:
# self report 20002
selfreport_path = "long_df_selfreport_20002.tsv"
df_selfreport = pd.read_csv(selfreport_path, sep = "\t")

df_selfreport.replace(np.nan, "N/A", inplace = True)
df_selfreport['self_reported_hypertension_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('hypertension', regex=True), 1, 0)

df_selfreport['self_reported_cholesterol_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('high cholesterol', regex=True), 1, 0)

df_selfreport['self_reported_angina_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('angina', regex=True), 1, 0)

df_selfreport['self_reported_diabetes_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('diabetes', regex=True), 1, 0)

df_selfreport['self_reported_hypertrophic_cardiomyopathy_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('hypertrophic cardiomyopathy', regex=True), 1, 0)

df_selfreport['self_reported_cardiomyopathy_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('cardiomyopathy', regex=True), 1, 0)

df_selfreport['self_reported_HF_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('heart failure/pulmonary odema', regex=True), 1, 0)

df_selfreport['self_reported_stroke_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('stroke', regex=True), 1, 0)

# df_selfreport['self_reported_ischaemic_stroke_yes_no'] = np.where(
#     df_selfreport['meaning'].str.contains('ischaemic stroke', regex=True), 1, 0)

df_selfreport['self_reported_brain_haem_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('brain haemorrhage', regex=True), 1, 0)

df_selfreport['self_reported_heart_attack_yes_no'] = np.where(
    df_selfreport['meaning'].str.contains('heart attack/myocardial infarction', regex=True), 1, 0)

cols = [1,2,3]
df_selfreport.drop(df_selfreport.columns[cols],axis=1,inplace=True)

df_selfreport.head() 

Unnamed: 0,f.eid,self_reported_hypertension_yes_no,self_reported_cholesterol_yes_no,self_reported_angina_yes_no,self_reported_diabetes_yes_no,self_reported_hypertrophic_cardiomyopathy_yes_no,self_reported_cardiomyopathy_yes_no,self_reported_HF_yes_no,self_reported_stroke_yes_no,self_reported_brain_haem_yes_no,self_reported_heart_attack_yes_no
1,1000018,0,0,0,0,0,0,0,0,0,0
2,1000020,1,0,0,0,0,0,0,0,0,0
3,1000020,0,1,0,0,0,0,0,0,0,0
4,1000034,0,0,0,0,0,0,0,0,0,0
5,1000034,0,0,0,0,0,0,0,0,0,0


### results for searching for diabetes in df selfreport
terms = ((df_selfreport['meaning']).unique())
search = "diabetes"
search = "hypertension"
search = "high cholesterol"
search = "angina"
search = "cardiomyopathy"
search = "heart failure/p"
search = "stroke"
search = "brain haemorrhage"
search = "heart attack/myocardial"
result = [t for t in terms if search in t]  


print(result) 

In [154]:
# Body Surface Area

bsa_path = data_path + "f.22427.tab"
df_bsa = pd.read_csv(bsa_path, sep = "\t")
df_bsa.rename(columns={'f.22427.2.0': 'bsa_imaging_visit1'}, inplace=True) 
df_bsa.drop(df_bsa.columns[2],axis=1,inplace=True)
df_bsa.dropna(inplace = True)
df_bsa.head()

Unnamed: 0,f.eid,bsa_imaging_visit1
18,1000195,1.51
22,1000231,1.67
37,1000384,2.07
49,1000500,1.69
50,1000511,1.79


In [155]:
df_cmr_measures = pd.read_csv("/workspace/home/gandriamiadana/mres_dissertation/cmr_measures_updated.csv", sep= ";")
df_cmr_measures.rename(columns={'eid': 'f.eid'}, inplace=True) 

df_cmrs = df_cmr_measures[['f.eid', 'LVM (g)', 'LVSV (mL)', 'LVESV (mL)', 'LVEDV (mL)', 'Ell_Global (%)', 'LVEF (%)']] 
df_cmrs.head(10)
print(df_cmrs.shape)

# df_cmrs_updated = df_cmrs.copy()
df_cmrs_updated = pd.merge(df_bsa, df_cmrs)

# print(df_cmrs_updated.shape)
df_cmrs_updated['LV_mass2'] = df_cmrs_updated["LVM (g)"]/df_cmrs_updated["bsa_imaging_visit1"]
df_cmrs_updated['LVEDVI'] = df_cmrs_updated["LVEDV (mL)"]/df_cmrs_updated["bsa_imaging_visit1"]
lv_mean_cav_vol = (df_cmrs_updated["LVEDV (mL)"]+df_cmrs_updated["LVESV (mL)"])/2
df_cmrs_updated['LVGFI']= lv_mean_cav_vol + (df_cmrs_updated['LV_mass2']/1.05)
df_cmrs_updated.rename(columns={'Ell_Global (%)': 'GLS',
                                'LVEF (%)': 'LVEF'}, inplace=True) 
cols = range(1,6) 
df_cmrs_updated.drop(df_cmrs_updated.columns[cols],axis=1,inplace=True)

df_cmrs_updated.head()

(26892, 7)


Unnamed: 0,f.eid,GLS,LVEF,LV_mass2,LVEDVI,LVGFI
0,1000195,-19.34086,64.465409,50.61841,84.347439,134.519577
1,1000231,-17.838181,59.775877,43.565295,67.772234,120.84333
2,1000384,-17.237037,63.243243,46.30382,83.52182,162.318368
3,1000511,-16.787152,52.413793,35.358554,54.073621,105.100473
4,1000771,-21.193899,58.405838,41.395938,72.072969,123.616874


In [156]:
# dfList = [df_age, df_sex, df_ethnicity, df_Education, df_deprivation, df_alcohol_intake, df_smoking_status, df_physical_activity,
#           df_diabetes_diag_by_GP, df_age_diabetes_diag, df_HbA1c, df_Glucose,
#           df_cholesterol, df_date_lipidaemia, df_date_HF, df_date_angina_pec, df_date_other_iscHD, df_date_chronic_iscHD, 
#           df_date_transient_iscHD, df_age_angina_diag,
#           df_date_hypertension, df_date_cardiomyopathy, df_date_cardiomyopathy_elsewhere, 
#           df_date_hypertensive_HD, df_date_hypertensive_HD_and_RD, 
#           , df_date_myo_infarction, df_date_acute_HA, df_date_subseq_HA, df_date_complications_after_HA,
#           df_meds_diabetes_6153, df_meds_diabetes_6177, df_BP_meds_6153, df_BP_meds_6177, df_meds_cholesterol_6153, df_meds_cholesterol_6177,
#           df_HBP_6150, df_HA_6150, 
#           df_HBP_6150, df_stroke_6150, df_HA_6150, 
#     df_date_insulin_diabetes, df_date_malnutrition_diabetes, df_date_non_insulin_diabetes,
#     df_date_other_spec_diabetes, df_date_other_unspec_diabetes, df_selfreport, df_bsa, df_cmrs_updated
#         df_date_insulin_diabetes, df_date_malnutrition_diabetes, df_date_non_insulin_diabetes,
#           df_date_other_spec_diabetes, df_date_other_unspec_diabetes, df_selfreport, df_bsa, df_cmrs_updated
        #   ]
# print(dfList)

In [157]:
### Self report columns needed for next processing step
print(list(df_selfreport.columns), "\n")
stroke_cols = ["f.eid"] + [col for col in df_selfreport.columns 
                           if 'stroke' in col] + [col for col in df_selfreport.columns if 'brain' in col]
diabetes_cols = ["f.eid"] + [col for col in df_selfreport.columns if 'diabetes' in col] 
hypertension_cols = ["f.eid"] + [col for col in df_selfreport.columns if 'hypertension' in col] 
cholesterol_cols = ["f.eid"] + [col for col in df_selfreport.columns if 'cholesterol' in col] 
angina_cols = ["f.eid"] + [col for col in df_selfreport.columns if 'angina' in col] 
cardiomyopathy_cols = ["f.eid"] + [col for col in df_selfreport.columns if 'cardiomyopathy' in col] 
HF_cols = ["f.eid"] + [col for col in df_selfreport.columns if 'HF' in col] 
HA_cols = ["f.eid"] + [col for col in df_selfreport.columns if 'heart_attack' in col] 

['f.eid', 'self_reported_hypertension_yes_no', 'self_reported_cholesterol_yes_no', 'self_reported_angina_yes_no', 'self_reported_diabetes_yes_no', 'self_reported_hypertrophic_cardiomyopathy_yes_no', 'self_reported_cardiomyopathy_yes_no', 'self_reported_HF_yes_no', 'self_reported_stroke_yes_no', 'self_reported_brain_haem_yes_no', 'self_reported_heart_attack_yes_no'] 



In [158]:
# STROKE CASES 

dfList_stroke = [
    df_age_stroke, df_date_stroke_131368, df_date_stroke_42006, df_stroke_6150, df_date_isch_stroke_42008, 
    df_date_intracerebral_haemorrhage_42010, df_date_intracerebral_haemorrhage, df_date_cerebral_infarction,
    df_date_other_intracerebral_haemorrhage, df_selfreport[stroke_cols]
] 
df = functools.reduce(lambda left, right: pd.merge(left,right,on=['f.eid']), dfList_stroke)

cols_for_df = ["f.eid"] + [col for col in df.columns if 'yes' in col]
temp_df = df[cols_for_df]
stroke_status = temp_df.iloc[:, 1:].apply(np.sum, axis=1)
df2 = temp_df.drop(temp_df.columns[1:],axis=1,inplace=False)
df2['Stroke_Status'] = stroke_status
df2.drop_duplicates(inplace=True)
df2.loc[df2["Stroke_Status"] >= 1, "Stroke_Status"] = 1
df2[df2['Stroke_Status'] >= 1]

cols_for_new_df = [col for col in df.columns if 'yes' in col]
new_df = df.drop(list(cols_for_new_df), axis=1, inplace=False)
stroke_df = pd.merge(new_df, df2)

stroke_df.head()

Unnamed: 0,f.eid,date_stroke,date_stroke_42006,date_isch_stroke_42008,date_intracerebral_haemorrhage_42010,date_intracerebral_haemorrhage,date_cerebral_infarction,date_other_intracerebral_haemorrhage,Stroke_Status
0,1000018,,,,,,,,0
1,1000020,,,,,,,,0
2,1000020,,,,,,,,0
3,1000034,,,,,,,,0
4,1000034,,,,,,,,0


In [159]:
# DIABETES CASES 

dfList_diabetes = [
    df_diabetes_diag_by_GP, df_meds_diabetes_6153, df_age_diabetes_diag, df_meds_diabetes_6177, 
    df_date_insulin_diabetes, df_date_non_insulin_diabetes, df_date_other_spec_diabetes, df_date_other_unspec_diabetes, df_date_malnutrition_diabetes,
    df_Glucose, df_HbA1c,
    df_selfreport[diabetes_cols]
] 
df_diab = functools.reduce(lambda left, right: pd.merge(left,right,on=['f.eid']), dfList_diabetes)

cols_for_df_diab = ["f.eid"] + [col for col in df_diab.columns if 'yes' in col]
temp_diab_df = df_diab[cols_for_df_diab]
temp_diab_df.drop_duplicates(inplace=True)
diabetes_status = temp_diab_df.iloc[:, 1:].apply(np.sum, axis=1)
df_diab2 = temp_diab_df.drop(temp_diab_df.columns[1:],axis=1,inplace=False)
df_diab2['diabetes_status'] = diabetes_status
df_diab2.loc[df_diab2["diabetes_status"] >= 1, "diabetes_status"] = 1

cols_for_new_df_diab = [col for col in df_diab.columns if 'yes' in col]
new_df_diab = df_diab.drop(list(cols_for_new_df_diab), axis=1, inplace=False)
diabetes_df = pd.merge(new_df_diab, df_diab2)

diabetes_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_diab_df.drop_duplicates(inplace=True)


Unnamed: 0,f.eid,date_insulin_diabetes,date_non_insulin_diabetes,date_other_spec_diabetes,date_other_unspec_diabetes,date_malnutrition_diabetes,has_diabetes_Glucose,has_diabetes_HbA1c,diabetes_status
0,1000018,,,,,,0,0,0
1,1000020,,,,,,0,0,0
2,1000020,,,,,,0,0,0
3,1000020,,,,,,0,0,0
4,1000020,,,,,,0,0,0


In [163]:
# HYPERTENSION CASES 

dfList_hypertension = [
    df_BP_meds_6153, df_BP_meds_6177, df_HBP_6150,df_date_hypertension,  
    df_age_HBP_diag,
    df_selfreport[hypertension_cols]
] 
df_hyperten = functools.reduce(lambda left, right: pd.merge(left,right,on=['f.eid']), dfList_hypertension)

cols_for_df_hyperten = ["f.eid"] + [col for col in df_hyperten.columns if 'yes' in col]
temp_hyperten_df = df_hyperten[cols_for_df_hyperten]
temp_hyperten_df.drop_duplicates(inplace=True)

hypertension_status = temp_hyperten_df.iloc[:, 1:].apply(np.sum, axis=1)
df_hyperten2 = temp_hyperten_df.drop(temp_hyperten_df.columns[1:],axis=1,inplace=False)
df_hyperten2['hypertension_status'] = hypertension_status
df_hyperten2.loc[df_hyperten2["hypertension_status"] >= 1, "hypertension_status"] = 1

cols_for_new_df_hyperten = [col for col in df_hyperten.columns if 'yes' in col]
new_df_hyperten = df_hyperten.drop(list(cols_for_new_df_hyperten), axis=1, inplace=False)
new_df_hyperten.head()
hypertension_df = pd.merge(new_df_hyperten, df_hyperten2)

hypertension_df.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_hyperten_df.drop_duplicates(inplace=True)


Unnamed: 0,f.eid,date_hypertension,hypertension_status
0,1000018,,0
1,1000020,,1
2,1000020,,1
3,1000020,,1
4,1000020,,1


In [164]:
# HIGH CHOLESTEROL CASES 

dfList_cholesterol = [
    df_cholesterol, df_date_lipidaemia, df_meds_cholesterol_6153, df_meds_cholesterol_6177,
    df_selfreport[cholesterol_cols]
] 
df_chol = functools.reduce(lambda left, right: pd.merge(left,right,on=['f.eid']), dfList_cholesterol)

cols_for_df_cholesterol = ["f.eid"] + [col for col in df_chol.columns if 'yes' in col]
temp_chol_df = df_chol[cols_for_df_cholesterol]
temp_chol_df.drop_duplicates(inplace=True)

cholesterol_status = temp_chol_df.iloc[:, 1:].apply(np.sum, axis=1)
df_chol2 = temp_chol_df.drop(temp_chol_df.columns[1:],axis=1,inplace=False)
df_chol2['cholesterol_status'] = cholesterol_status
df_chol2.loc[df_chol2["cholesterol_status"] >= 1, "cholesterol_status"] = 1

cols_for_new_df_chol = [col for col in df_chol.columns if 'yes' in col]
new_df_chol = df_chol.drop(list(cols_for_new_df_chol), axis=1, inplace=False)
new_df_chol.head()
cholesterol_df = pd.merge(new_df_chol, df_chol2)

cholesterol_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_chol_df.drop_duplicates(inplace=True)


Unnamed: 0,f.eid,has_high_cholesterol,date_lipidaemia,cholesterol_status
0,1000018,0,,0
1,1000020,0,,1
2,1000020,0,,1
3,1000020,0,,0
4,1000020,0,,1


In [165]:
# ISCHAEMIC HEART DISEASE IHD OUTCOME

dfList_IHD = [
    df_age_angina_diag, df_angina_6150, df_date_angina_pec, df_date_other_iscHD, df_date_chronic_iscHD,
    df_selfreport[angina_cols]
] 
df_IHD = functools.reduce(lambda left, right: pd.merge(left,right,on=['f.eid']), dfList_IHD)

cols_for_df_IHD = ["f.eid"] + [col for col in df_IHD.columns if 'yes' in col]
temp_IHD_df = df_IHD[cols_for_df_IHD]
temp_IHD_df.drop_duplicates(inplace=True)

IHD_status = temp_IHD_df.iloc[:, 1:].apply(np.sum, axis=1)
df_IHD2 = temp_IHD_df.drop(temp_IHD_df.columns[1:],axis=1,inplace=False)
df_IHD2['IHD_status'] = IHD_status
df_IHD2.loc[df_IHD2["IHD_status"] >= 1, "IHD_status"] = 1

cols_for_new_df_IHD = [col for col in df_IHD.columns if 'yes' in col]
new_df_IHD = df_IHD.drop(list(cols_for_new_df_IHD), axis=1, inplace=False)
new_df_IHD.head()
IHD_df = pd.merge(new_df_IHD, df_IHD2)

IHD_df.head()
### hier -- 01:55

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_IHD_df.drop_duplicates(inplace=True)


Unnamed: 0,f.eid,date_angina_pec,date_other_iscHD,date_chronic_iscHD,IHD_status
0,1000018,,,,0
1,1000020,,,,0
2,1000020,,,,0
3,1000034,,,,0
4,1000034,,,,0


In [178]:
# NON-ISCHAEMIC HEART DISEASE IHD OUTCOME

dfList_non_IHD = [
    df_date_cardiomyopathy, df_date_cardiomyopathy_elsewhere, df_date_hypertensive_HD, df_date_hypertensive_HD_and_RD,
    df_selfreport[cardiomyopathy_cols]
] 
df_non_IHD = functools.reduce(lambda left, right: pd.merge(left,right,on=['f.eid']), dfList_non_IHD)

cols_for_df_non_IHD = ["f.eid"] + [col for col in df_non_IHD.columns if 'yes' in col]
temp_non_IHD_df = df_non_IHD[cols_for_df_non_IHD]
temp_non_IHD_df.drop_duplicates(inplace=True)

non_IHD_status = temp_non_IHD_df.iloc[:, 1:].apply(np.sum, axis=1)
df_non_IHD2 = temp_non_IHD_df.drop(temp_non_IHD_df.columns[1:],axis=1,inplace=False)
df_non_IHD2['non_IHD_status'] = non_IHD_status
df_non_IHD2.loc[df_non_IHD2["non_IHD_status"] >= 1, "non_IHD_status"] = 1

cols_for_new_df_non_IHD = [col for col in df_non_IHD.columns if 'yes' in col]
new_df_non_IHD = df_non_IHD.drop(list(cols_for_new_df_non_IHD), axis=1, inplace=False)
new_df_non_IHD.head()
non_IHD_df = pd.merge(new_df_non_IHD, df_non_IHD2)

non_IHD_df.head()
### hier -- 02:05

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_non_IHD_df.drop_duplicates(inplace=True)


Unnamed: 0,f.eid,date_cardiomyopathy,date_cardiomyopathy_elsewhere,date_hypertensive_HD,date_hypertensive_HD_and_RD,non_IHD_status
0,1000018,,,,,0
1,1000020,,,,,0
2,1000020,,,,,0
3,1000034,,,,,0
4,1000034,,,,,0


In [None]:
# HEART FAILURE OUTCOME

In [None]:
# PERICARDIAL DF

In [None]:
# HEART ATTACK DF

In [64]:
# res = functools.reduce(lambda left, right: pd.merge(left,right,on=['f.eid']), dfList)
# res.head(10)

Unnamed: 0,f.eid,age_at_recruitment,sex,ethnicity_at_recruitment,ethnicity_at_calib_visit,ethnicity_at_imaging_visit1,education_score,deprivation_index,alc_intake_at_recruitment,alc_intake_at_calib_visit,...,date_myo_infarction_iscHD_yes_no,date_acute_HA_yes_no,date_subseq_HA_yes_no,date_complications_after_HA_yes_no,6153_diabetes_meds_yes_no_x,6177_diabetes_meds_yes_no,6153_diabetes_meds_yes_no_y,6177_BP_meds_yes_no,6153_high_cholesterol_meds_yes_no,6177_cholesterol_meds_yes_no
0,1000018,53.0,1,1,-1000,-1000,15.06,-3.87966,3.0,,...,0,0,0,0,0,0,0,0,0,0
1,1000020,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,...,0,0,0,0,0,0,0,0,0,1
2,1000020,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,...,0,0,0,0,0,0,0,0,0,0
3,1000020,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,...,0,0,0,0,0,0,0,1,0,1
4,1000020,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,...,0,0,0,0,0,0,0,1,0,0
5,1000020,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,...,0,0,0,0,0,0,0,0,0,1
6,1000020,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,...,0,0,0,0,0,0,0,0,0,0
7,1000020,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,...,0,0,0,0,0,0,0,1,0,1
8,1000020,48.0,1,1,-1000,-1000,20.08,-4.08378,2.0,,...,0,0,0,0,0,0,0,1,0,0
9,1000034,59.0,0,1,-1000,-1000,1.16,-2.75832,4.0,,...,0,0,0,0,0,0,0,0,0,0


In [None]:
final_df = pd.merge(res, res2, on=['f.eid'])
final_df.head()

In [None]:
bc_status_icd9 = pd.read_csv("BC_status_icd9.tsv", sep = "\t") 
bc_status_icd10 = pd.read_csv("BC_status_icd10.tsv", sep = "\t") 
bc_status_opcs4 = pd.read_csv("BC_status_opcs4.tsv", sep = "\t") 
df_meds_diabetes_6153 

In [None]:
res.to_csv("study_sample_UKB_data.tsv", sep = "\t") #, index=False)