In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input locations for the UK Biobank extracts; each is read with sep='\t'
# by the cells below.
ukb_dobs_path = 'ukb_dobs.tsv'
hes_icd10_path = 'hes_icd10.tsv'
hes_icd9_path = 'hes_icd9.tsv'
hes_opcs4_path = 'hes_opcs4.tsv'
death_icd10_path = 'ukb_death.tsv'
isr_path = 'ukb_isr.tsv'
osr_path = 'ukb_osr.tsv'
gp_clin_path = 'ukb_ctv3.tsv'

# Codelist files keyed by the flag name they produce. The keys become the
# column names of the per-participant flag tables, so their order is kept.
codelist_paths = dict(
    chd_unconditional='chd_codes_with_subphenotypes.tsv',
    chd_conditional='chd_conditional_with_subphenotypes.tsv',
    confounding='confounding_conditions.tsv',
    excl_from_controls='exclude_heart_related_from_controls.tsv',
    excl_from_chd_cond='exclusionary_codes_for_chd_conditionals.tsv',
    potential_asd='potential_asd.tsv',
    pfo='pfo_codes.tsv',
)

In [3]:
#assign each patient a flag for every codelist that their records match
def read_flag_codelists(filepath_dict):
    """Load every codelist file into a DataFrame.

    Parameters
    ----------
    filepath_dict : dict
        Maps a flag name to the path of a tab-separated codelist file.

    Returns
    -------
    dict
        Maps each flag name to the DataFrame read from its file, in the same
        order as ``filepath_dict``.
    """
    return {flag: pd.read_csv(codelist_path, sep='\t')
            for flag, codelist_path in filepath_dict.items()}

def get_pat_flags(data, codeset, code_col, current_flags, curr_subtype_flags, date_col='date'):
    """Flag participants whose records match each codelist, honouring age limits.

    Reads the module-level ``dob`` table and ``flag_codes`` dict. Both
    ``current_flags`` and ``curr_subtype_flags`` are updated in place; nothing
    is returned.

    Parameters
    ----------
    data : DataFrame
        Coded records indexed by ``eid``, with a code column and a date column.
    codeset : str
        Value of the codelists' ``codeset`` column to select for this source.
    code_col : str
        Column of ``data`` holding the codes.
    current_flags : DataFrame
        One row per participant, one 0/1 column per flag. A flag already set
        to 1 is never reset to 0.
    curr_subtype_flags : dict
        Flag name -> Series of ``chd_subpheno`` values for matching records;
        new matches are appended to any existing Series.
    date_col : str, default 'date'
        Column of ``data`` holding the event date used to compute age.
    """
    # Merging on the shared 'eid' index level keeps eid as the result's index
    # and drops records of participants without a date of birth.
    data = pd.merge(data, dob, on='eid')
    # Age in whole years at the event. Left as float so a missing date gives
    # NaN (which fails every age-limit comparison below) instead of raising
    # on an integer cast.
    data['age'] = np.floor((data[date_col] - data['dob']).dt.days / 365.25)
    for flag, flagdf in flag_codes.items():
        codelist = flagdf[flagdf['codeset'] == codeset].set_index('code')
        codes = codelist.index.to_list()
        cdata = data.join(codelist, on=code_col)
        # A code with no age_lim applies at any age; otherwise the record must
        # be from an age at or below the limit.
        within_age_lim = (cdata['age_lim'] >= cdata['age']) | cdata['age_lim'].isna()
        filt = cdata[cdata[code_col].isin(codes) & within_age_lim]
        flagged_i = set(filt.index)
        current_flags[flag] = np.maximum(current_flags[flag], np.where(current_flags.index.isin(flagged_i), 1, 0))
        if 'chd_subpheno' in codelist.columns:
            subphenos = filt['chd_subpheno']
            if flag in curr_subtype_flags:
                curr_subtype_flags[flag] = pd.concat([curr_subtype_flags[flag], subphenos])
            else:
                curr_subtype_flags[flag] = subphenos
                
def get_flags_no_age_lims(data, codeset, code_col, current_flags, date_col='date'):
    """Flag participants whose records match each codelist, ignoring age limits.

    Reads the module-level ``flag_codes`` dict. ``current_flags`` is updated
    in place; nothing is returned. Only participants present in
    ``current_flags.index`` can be flagged.

    Parameters
    ----------
    data : DataFrame
        Coded records indexed by participant id, with a code column.
    codeset : str
        Value of the codelists' ``codeset`` column to select for this source.
    code_col : str
        Column of ``data`` holding the codes.
    current_flags : DataFrame
        One row per participant, one 0/1 column per flag. A flag already set
        to 1 is never reset to 0.
    date_col : str, default 'date'
        Accepted for signature compatibility with ``get_pat_flags``; dates are
        not needed when age limits are ignored, so the column is not read and
        missing dates cannot cause a failure.
    """
    record_codes = data[code_col]
    for flag, flagdf in flag_codes.items():
        codes = flagdf.loc[flagdf['codeset'] == codeset, 'code'].to_list()
        flagged_i = set(record_codes[record_codes.isin(codes)].index)
        current_flags[flag] = np.maximum(current_flags[flag], np.where(current_flags.index.isin(flagged_i), 1, 0))

# Load the codelists and note which of them carry CHD sub-phenotype labels.
flag_codes = read_flag_codelists(codelist_paths)
flags_with_subtypes = [flag for flag, df in flag_codes.items() if 'chd_subpheno' in df.columns]

# Dates of birth; rows with any missing value are dropped, so the flag tables
# below only cover participants with a complete row.
dob = pd.read_csv(ukb_dobs_path, sep='\t', index_col='eid')
dob['dob'] = pd.to_datetime(dob['dob'])
dob = dob.dropna()

# Two all-zero flag tables (participants x flags): one filled with age limits
# applied, one with age limits ignored.
shape = (len(dob.index), len(flag_codes.keys()))
pat_flags = pd.DataFrame(data=np.zeros(shape), index=dob.index, columns=list(flag_codes))
pat_flags_no_age_lims = pd.DataFrame(data=np.zeros(shape), index=dob.index, columns=list(flag_codes))
pat_subtype_flags = {}

# Sources that already have a 'date' column: (path, codeset, code column).
# Processed one at a time and released straight away to limit peak memory.
dated_sources = [
    (hes_icd10_path, 'icd10', 'diag_icd10'),
    (death_icd10_path, 'icd10', 'icd10_death'),
    (hes_icd9_path, 'icd9', 'diag_icd9'),
    (hes_opcs4_path, 'opcs4', 'oper4'),
]
for source_path, codeset, code_col in dated_sources:
    records = pd.read_csv(source_path, sep='\t', index_col='eid')
    records['date'] = pd.to_datetime(records['date'])
    get_pat_flags(records, codeset, code_col, pat_flags, pat_subtype_flags)
    get_flags_no_age_lims(records, codeset, code_col, pat_flags_no_age_lims)
    del records

# Self-report sources: the code column shares its name with the codeset and the
# date column is called 'date_approx'. Codes are cast to str as before. Dates
# go through pd.to_datetime because pandas >= 2.0 rejects astype('datetime64')
# without a unit, and the column is renamed by name rather than by position.
for source_path, code_col in [(isr_path, 'isr'), (osr_path, 'osr')]:
    records = pd.read_csv(source_path, sep='\t', index_col='eid').astype({code_col: 'str'})
    records = records.rename(columns={'date_approx': 'date'})
    records['date'] = pd.to_datetime(records['date'])
    get_pat_flags(records, code_col, code_col, pat_flags, pat_subtype_flags)
    get_flags_no_age_lims(records, code_col, code_col, pat_flags_no_age_lims)
    del records

# The GP clinical file is read in chunks rather than loaded whole.
chunksize = 10 ** 6
ctv3 = pd.read_csv(gp_clin_path, sep='\t', index_col='eid', chunksize=chunksize)
for chunk in ctv3:
    chunk['date'] = pd.to_datetime(chunk['date'])
    get_pat_flags(chunk, 'ctv3', 'ctv3', pat_flags, pat_subtype_flags)
    get_flags_no_age_lims(chunk, 'ctv3', 'ctv3', pat_flags_no_age_lims)

# Participants listed in the GP registrations file count as having GP data.
# NOTE(review): absolute, user-specific path; consider moving it next to the
# other input paths in the configuration cell.
gp_reg_f = '/home/v45331db/phd/biobank_work/raw_data/primary_care/gp_registrations.txt'
gp_reg = pd.read_csv(gp_reg_f, sep='\t', index_col=0)
has_gp_data = set(gp_reg.index)

pat_flags['has_gp_data'] = np.where(pat_flags.index.isin(has_gp_data), 1, 0)
pat_flags = pat_flags.astype(int)
pat_flags_no_age_lims = pat_flags_no_age_lims.astype(int)

# Filled in by the cohort-derivation cell.
chd_pats = set()
controls = set()

pat_flags.to_csv('pat_flags.tsv', sep='\t')

  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)


In [4]:
# Cohort derivation. Works through pat_flags in priority order: matched
# participants are added to chd_pats and dropped from pat_flags, and whoever
# is left at the end without an exclusion flag becomes a control. Every step
# operates on the rows left by the previous one, so statement order matters.
# NOTE: pat_flags and pat_flags_no_age_lims are overwritten here, so this cell
# should be run once after the flag-building cell.

#remove those with confounding conditions
conf = pat_flags['confounding'] == 1
pat_flags = pat_flags[~conf]

#include chd_unconditional matches
chd_u = pat_flags[pat_flags['chd_unconditional'] == 1].index
chd_pats.update(chd_u)
# keep only the subtype rows of the participants just included; the key exists
# only if the chd_unconditional codelist has a chd_subpheno column
pat_subtypes = pat_subtype_flags['chd_unconditional'][pat_subtype_flags['chd_unconditional'].index.isin(chd_u)]
pat_flags = pat_flags[~pat_flags.index.isin(chd_u)]

#include chd_conditional matches if age_lims/code exclusions met
# (chd_conditional was flagged with age limits applied; excl_from_chd_cond == 0
# means no exclusionary code was recorded)
chd_c = pat_flags[(pat_flags['chd_conditional'] == 1) & (pat_flags['excl_from_chd_cond'] == 0)].index
chd_pats.update(chd_c)
chd_c_subtypes = pat_subtype_flags['chd_conditional'][pat_subtype_flags['chd_conditional'].index.isin(chd_c)]
pat_subtypes = pd.concat([pat_subtypes, chd_c_subtypes])
pat_flags = pat_flags[~pat_flags.index.isin(chd_c)]

#exclude age-insensitive chd_conditional matches from controls
# i.e. anyone still in pat_flags who has a chd_conditional code at any age
pat_flags_no_age_lims = pat_flags_no_age_lims.loc[pat_flags.index] #removes participants previously removed from pat_flags
pat_flags['excl_from_controls'] = np.where(pat_flags_no_age_lims['chd_conditional'] == 1, 1, pat_flags['excl_from_controls'])

#include asds
# asd: potential ASD code, no PFO code, and GP data available
# pfo_excl: potential ASD code but no GP data available
asd = pat_flags[(pat_flags['potential_asd'] == 1) & (pat_flags['pfo'] == 0) & (pat_flags['has_gp_data'] == 1)].index
pfo_excl = pat_flags[(pat_flags['potential_asd'] == 1) & (pat_flags['has_gp_data'] == 0)].index
chd_pats.update(asd)
asd_subtypes = pat_subtype_flags['potential_asd'][pat_subtype_flags['potential_asd'].index.isin(asd)]
pat_subtypes = pd.concat([pat_subtypes, asd_subtypes])
pat_flags = pat_flags[~pat_flags.index.isin(asd)]
pat_flags = pat_flags[~pat_flags.index.isin(pfo_excl)] #Remove asd/pfos where we don't have GP data to distinguish them

#exclude any potential asds which do not meet inclusion criteria
# i.e. anyone still in pat_flags who has a potential_asd code at any age
pat_flags_no_age_lims = pat_flags_no_age_lims.loc[pat_flags.index] #removes participants previously removed from pat_flags
pat_flags['excl_from_controls'] = np.where(pat_flags_no_age_lims['potential_asd'] == 1, 1, pat_flags['excl_from_controls'])

# everyone left without an exclusion flag is a control
control_excl = pat_flags['excl_from_controls'] == 1
pat_flags = pat_flags[~control_excl]
controls.update(pat_flags.index)

# trim whitespace from the subtype labels, then keep one row per distinct
# (eid, subtype) pair, indexed by eid
pat_subtypes = pat_subtypes.str.strip()
pat_subtypes = pat_subtypes.reset_index().drop_duplicates().set_index('eid')

In [5]:
#define CHD subgroups
complex_defects = [
    'Unsp. cyanotic congenital heart disease',
    'TOF',
    'AVSD',
    'Atrial isomerism',
    'TAPVR',
    'TGA',
    'DORV',
    'DOLV',
    'HLHS',
    'IAA',
    'Truncus arteriosus',
    'Unsp. congenital malformations of cardiac chambers',
    'Double inlet ventricle',
    'Single ventricle',
    'Discordant atrioventricular connection',
    'Cor biloculare',
    'Other cyanotic congenital heart disease'
]

aov_defects = [
    'Aortic stenosis',
    'Aortic insufficiency',
    'Aortic valve replacement',
    'Congenital aortic insufficiency',
    'Other aortic valve anomaly',
    'Congenital aortic stenosis'
]

# Hierarchical assignment: complex first, then non-complex, then isolated AoV.
# The work is done on a separate name so that pat_subtypes is left intact and
# re-running this cell gives the same groups.
complex_group_subs = pat_subtypes[pat_subtypes['chd_subpheno'].isin(complex_defects)]
complex_group = set(complex_group_subs.index)
remaining_subtypes = pat_subtypes[~pat_subtypes.index.isin(complex_group)]

noncomplex_group_subs = remaining_subtypes[~remaining_subtypes['chd_subpheno'].isin(aov_defects)] #anyone with at least one non-aov defect
noncomplex_group = set(noncomplex_group_subs.index)
remaining_subtypes = remaining_subtypes[~remaining_subtypes.index.isin(noncomplex_group)]

# whoever is left has only aortic-valve subtypes
aov_group = set(remaining_subtypes.index)

complex_out = '/home/v45331db/phd/biobank_work/cohorts/mar2021_chd_subgroups_fixed/complex_chd.tsv'
noncomplex_out = '/home/v45331db/phd/biobank_work/cohorts/mar2021_chd_subgroups_fixed/noncomplex_chd.tsv'
aov_out = '/home/v45331db/phd/biobank_work/cohorts/mar2021_chd_subgroups_fixed/isolated_aov.tsv'
# `outf` was never defined in this notebook, so the final write raised
# NameError on a fresh kernel.
# NOTE(review): the file name below is an assumption; confirm the intended
# target. The three per-subgroup paths above are still unused.
outf = '/home/v45331db/phd/biobank_work/cohorts/mar2021_chd_subgroups_fixed/chd_subgroups.tsv'

group_membership = pd.DataFrame({'complex_chd': np.where(dob.index.isin(complex_group), 1, 0),
                                 'noncomplex_chd': np.where(dob.index.isin(noncomplex_group), 1, 0),
                                 'isolated_aov': np.where(dob.index.isin(aov_group), 1, 0),
                                 'controls': np.where(dob.index.isin(controls), 1, 0)},
                                index=dob.index).astype(int)

# Keep only participants who are in a CHD subgroup or are controls; the
# 'controls' column is then redundant because controls are the all-zero rows.
# drop() returns a new frame, so the column assignment below does not write
# into a slice.
in_cohort = (group_membership == 1).any(axis=1)
df = group_membership.loc[in_cohort].drop(columns='controls')

df['chd'] = np.where((df == 1).any(axis=1), 1, 0)

df.to_csv(outf, sep='\t')