In [1]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from scipy.stats import fisher_exact

In [2]:
# Load the main cohort table (tab-separated), indexed by participant `eid`.
# NOTE(review): the file name suggests filtering was applied upstream -- confirm.
data_f = 'copd_table_post_filtering.tsv'
df = pd.read_table(data_f, index_col='eid')

# Load pulmonary TB data; only the `pulmonary_tb` column is read, indexed by `eid`.
tb_f = 'pulmonary_tb_dates.tsv'
tb = pd.read_table(tb_f, index_col='eid', usecols=['eid', 'pulmonary_tb'])

# Left join on the `eid` index: every row of the main table is kept and the
# `pulmonary_tb` column is appended.
df = df.join(tb, how='left')
df

Unnamed: 0_level_0,sex,height,age_at_ac1,ethnicity,Townsend_DI,bmi,complex_chd,noncomplex_chd,isolated_aov,chd,...,death_date,event,hypertension_comb,smoking,smoking_status_0,smoking_status_1,smoking_status_2,TDI_binned,yrs_duration,pulmonary_tb
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3869328,1.0,176.0,64.0,1,0.611146,23.7926,0.0,0.0,0.0,0.0,...,,0.0,1.0,1.0,0,1,0,"(-2.806, 0.646]",20.000000,0
1713188,0.0,176.0,63.0,1,-0.474842,35.8019,0.0,0.0,0.0,0.0,...,,0.0,1.0,1.0,0,1,0,"(-2.806, 0.646]",20.000000,0
2429802,1.0,163.0,66.0,1,3.899080,31.3523,0.0,0.0,0.0,0.0,...,2013-10-04 00:00:00,1.0,1.0,0.0,1,0,0,"(0.646, 4.097]",13.763176,0
2223778,0.0,166.0,55.0,1,0.099974,22.1730,0.0,0.0,0.0,0.0,...,,0.0,1.0,0.0,1,0,0,"(-2.806, 0.646]",20.000000,0
4797749,1.0,165.0,57.0,1,-5.860310,32.3600,0.0,0.0,1.0,1.0,...,,0.0,1.0,1.0,0,1,0,"(-6.258, -2.806]",20.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2748576,0.0,160.0,68.0,1,-2.223210,24.0234,0.0,0.0,0.0,0.0,...,,0.0,1.0,0.0,1,0,0,"(-2.806, 0.646]",20.000000,0
3689066,0.0,164.0,54.0,1,3.044200,22.6056,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,1,0,0,"(0.646, 4.097]",20.000000,0
1871700,1.0,166.0,53.0,0,3.753720,23.8424,0.0,0.0,0.0,0.0,...,,0.0,1.0,0.0,1,0,0,"(0.646, 4.097]",20.000000,0
3949329,1.0,166.0,50.0,1,-1.669470,31.7535,0.0,0.0,0.0,0.0,...,,0.0,1.0,0.0,1,0,0,"(-2.806, 0.646]",20.000000,0


In [10]:
# Stratified Cox regression.
#
# For every adjustment scheme in `formtypes`, two models are fitted on the
# same base covariates and strata:
#   * cph1: base covariates + `chd`
#   * cph2: base covariates + `complex_chd` + `noncomplex_chd` + `isolated_aov`
# The hazard ratios of the reported groups are then combined with crude
# event counts into one summary table per scheme, stored in `results`.

def _fit_stratified_cox(data, formula, strata):
    """Fit a stratified Cox proportional hazards model and return the fitter.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `yrs_duration`, `event`, every term named in `formula`
        and every column listed in `strata`.
    formula : str
        Right-hand-side formula of the covariates entering the model.
    strata : list of str
        Columns passed to lifelines as `strata`.

    Returns
    -------
    lifelines.CoxPHFitter
        The fitted model.
    """
    fitter = CoxPHFitter()
    fitter.fit(
        data,
        duration_col='yrs_duration',
        event_col='event',
        formula=formula,
        strata=strata)
    return fitter


full_form_cat = (
    'sex + age_at_ac1 + bmi + ethnicity + pulmonary_tb'
)

# Columns passed as `strata` to the fit rather than as formula terms.
cat_strata = [
    'smoking',
    'hypertension_comb',
    'asthma',
    'TDI_binned'
]

# scheme name -> (data frame, base formula, strata)
formtypes = {
    'Fully Adjusted': (df, full_form_cat, cat_strata)
}

results = {}
groups = ['All CHD', 'Isolated AoV', 'Noncomplex', 'Control']
res_cols = ['COPD Diagnoses, n (%)', 'HR', '95% CI', 'P-Value']

for regtype, (data, form, strata) in formtypes.items():
    # Declare 'No. at Risk' up front (kept last, as before) instead of
    # relying on implicit column creation during assignment.
    res_df = pd.DataFrame(index=groups, columns=res_cols + ['No. at Risk'])

    all_chd_form = form + ' + chd'
    subgroup_form = form + ' + complex_chd + noncomplex_chd + isolated_aov'

    cph1 = _fit_stratified_cox(data, all_chd_form, strata)
    cph2 = _fit_stratified_cox(data, subgroup_form, strata)

    # The base covariates appear twice in this index (once per model); only
    # the CHD terms looked up below are unique to one of the two models.
    reg_res = pd.concat((cph1.summary, cph2.summary))

    # group name -> (coefficient label in `reg_res` or None, row mask)
    group_masks = {
        'All CHD': ('chd', data['chd'] == 1),
        'Noncomplex': ('noncomplex_chd', data['noncomplex_chd'] == 1),
        'Isolated AoV': ('isolated_aov', data['isolated_aov'] == 1),
        'Control': (None, data['chd'] == 0)
    }

    # Loop-invariant; boolean sums avoid copying the frame for every group.
    is_event = data['event'] == 1

    for group, (label, mask) in group_masks.items():
        n = int(mask.sum())
        events = int((mask & is_event).sum())
        res_df.loc[group, 'No. at Risk'] = f'{n:,}'
        # An empty group yields NaN instead of raising ZeroDivisionError.
        event_pc = 100 * events / n if n else float('nan')
        res_df.loc[group, 'COPD Diagnoses, n (%)'] = f'{events:,} ({round(event_pc, 1)})'
        if label:
            hr = round(reg_res.loc[label, 'exp(coef)'], 2)
            hr_llim = reg_res.loc[label, 'exp(coef) lower 95%']
            hr_ulim = reg_res.loc[label, 'exp(coef) upper 95%']
            # Fixed two-decimal formatting keeps trailing zeros,
            # e.g. "(1.80, 2.42)" rather than "(1.8, 2.42)".
            ci = f'({hr_llim:.2f}, {hr_ulim:.2f})'
            p = np.format_float_scientific(reg_res.loc[label, 'p'], precision=2)

            res_df.loc[group, 'HR'] = hr
            res_df.loc[group, '95% CI'] = ci
            res_df.loc[group, 'P-Value'] = p

    results[regtype] = res_df

In [11]:
# Transpose the 'Fully Adjusted' table so groups become columns and the
# reported statistics become rows, then display it.
fa_res = results['Fully Adjusted'].transpose()
fa_res

Unnamed: 0,All CHD,Isolated AoV,Noncomplex,Control
"COPD Diagnoses, n (%)",294 (8.7),176 (9.0),106 (8.2),"14,854 (3.1)"
HR,2.19,2.08,2.39,
95% CI,"(1.95, 2.46)","(1.8, 2.42)","(1.97, 2.89)",
P-Value,3.96e-40,4.45e-22,4.31e-19,
No. at Risk,3385,1960,1294,479765
