In [1]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt

In [None]:
#load pre-processed tables
# Both inputs are tab-separated; read_table uses '\t' as its default separator.
# df  -> table used by the age/sex- and fully-adjusted models
# pdf -> table used by the pack-year-adjusted model
df = pd.read_table('../results/copd_table_post_filtering.tsv')
pdf = pd.read_table('../results/copd_py_table_post_filtering.tsv')

In [3]:
# Sum of per-row follow-up durations in `df`, reported as person-years.
total_follow_up = df.loc[:, 'yrs_duration'].sum()
print(
    f'Total follow up time: {total_follow_up:,.2f} person-years.'
)

Total follow up time: 9,506,907.39 person-years.


In [6]:
#stratified cox regression
# R-style covariate formulas handed to lifelines via `formula=`. The exposure
# terms (chd / CHD subgroup indicators) are appended per model in the loop below.
age_sex_form = (
    'sex + age_at_ac1'
)

full_form_cat = (
    'sex + age_at_ac1 + bmi + ethnicity'
)

# Same covariates as full_form_cat; the pack-year model differs in its strata
# (pack_yr_binned instead of smoking) and in its input table (pdf).
full_form_py = (
    'sex + age_at_ac1 + bmi + ethnicity'
)

# None -> the age/sex-adjusted model is fitted without stratification.
age_sex_strata = None

cat_strata = [
    'smoking',
    'hypertension_comb',
    'asthma',
    'TDI_binned'
]

py_strata = [
    'pack_yr_binned',
    'hypertension_comb',
    'asthma',
    'TDI_binned'
]

# regression label -> (input table, covariate formula, strata columns or None)
formtypes = {
    'Age/Sex Adjusted': (df, age_sex_form, age_sex_strata),
    'Fully Adjusted': (df, full_form_cat, cat_strata),
    'Pack Year Adjusted': (pdf, full_form_py, py_strata)
}

results = {}
groups = ['All CHD', 'Isolated AoV', 'Noncomplex', 'Control']
res_cols = ['COPD Diagnoses, n (%)', 'HR', '95% CI', 'P-Value']


def _fit_cox(data, formula, strata=None):
    """Fit a Cox proportional-hazards model and return the fitted CoxPHFitter.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'yrs_duration', 'event' and every column named in
        `formula` / `strata`.
    formula : str
        Right-hand-side formula passed to lifelines.
    strata : list of str or None
        Columns to stratify on. None or an empty list fits an unstratified
        model, so a single call path serves both cases.
    """
    cph = CoxPHFitter()
    cph.fit(
        data,
        duration_col='yrs_duration',
        event_col='event',
        formula=formula,
        strata=strata or None)
    return cph


def _format_ci(lower, upper):
    """Render a confidence interval as '(lo, hi)' with exactly two decimals.

    Fixed-point formatting keeps trailing zeros, e.g. '(2.25, 3.30)' rather
    than '(2.25, 3.3)'.
    """
    return f'({lower:.2f}, {upper:.2f})'


for regtype, (data, form, strata) in formtypes.items():
    # 'No. at Risk' is declared explicitly and placed last; the table-building
    # cells move it to the front for display.
    res_df = pd.DataFrame(index=groups, columns=res_cols + ['No. at Risk'])

    all_chd_form = form + ' + chd'
    subgroup_form = form + ' + complex_chd + noncomplex_chd + isolated_aov'

    # cph1: single `chd` exposure term; cph2: separate CHD subgroup terms.
    cph1 = _fit_cox(data, all_chd_form, strata)
    cph2 = _fit_cox(data, subgroup_form, strata)

    # Stack both summaries so each exposure term can be looked up by its label.
    # Shared covariates appear twice in the index; only the exposure labels
    # (unique across the two models) are read below.
    reg_res = pd.concat((cph1.summary, cph2.summary))

    # group -> (coefficient label in reg_res or None, boolean row mask).
    # 'Control' has no coefficient of its own, so no HR is reported for it.
    group_masks = {
        'All CHD': ('chd', data['chd'] == 1),
        'Noncomplex': ('noncomplex_chd', data['noncomplex_chd'] == 1),
        'Isolated AoV': ('isolated_aov', data['isolated_aov'] == 1),
        'Control': (None, data['chd'] == 0)
    }

    for group, (label, mask) in group_masks.items():
        n = int(mask.sum())
        events = int((mask & (data['event'] == 1)).sum())
        res_df.loc[group, 'No. at Risk'] = f'{n:,}'
        # Guard against an empty group instead of raising ZeroDivisionError.
        event_pc = 100 * events / n if n else float('nan')
        res_df.loc[group, 'COPD Diagnoses, n (%)'] = f'{events:,} ({event_pc:.1f})'
        if label:
            # HR stays numeric (rounded float); CI and p-value are display strings.
            hr = round(reg_res.loc[label, 'exp(coef)'], 2)
            ci = _format_ci(
                reg_res.loc[label, 'exp(coef) lower 95%'],
                reg_res.loc[label, 'exp(coef) upper 95%'])
            p = np.format_float_scientific(reg_res.loc[label, 'p'], 2)

            res_df.loc[group, 'HR'] = hr
            res_df.loc[group, '95% CI'] = ci
            res_df.loc[group, 'P-Value'] = p

    results[regtype] = res_df


In [7]:
#build main coxreg table
# The age/sex-adjusted frame supplies the count columns plus the first HR block.
t1 = results['Age/Sex Adjusted'].copy()
cols = t1.columns.to_list()
# Move 'No. at Risk' to the front by name rather than by position, so this
# cell does not depend on where that column sits in the results frame.
new_col_order = ['No. at Risk'] + [c for c in cols if c != 'No. at Risk']
t1 = t1[new_col_order]
# The fully adjusted frame contributes only HR / CI / P-value; the count
# columns are already taken from t1.
t2 = results['Fully Adjusted'].drop(['No. at Risk', 'COPD Diagnoses, n (%)'], axis=1).copy()
# Rows = measures, columns = groups; '-' marks cells with no estimate.
tab = pd.concat((t1, t2), axis=1).fillna('-').T.to_numpy()

# Column labels come from the frame itself so they always match the data order.
groups = t1.index.to_list()
row_labels = [
    'No. at Risk',
    'COPD Diagnoses, n (%)', 
    'Age/Sex Adjusted:',
    'HR',
    '95% CI',
    'P-Value',
    'Fully Adjusted:',
    'HR',
    '95% CI',
    'P-Value'
]

# Section-header labels (ending in ':') have no data of their own: insert an
# empty row at each header's position. Iterating in ascending order means each
# index is already the row's final position in the finished table.
blank_row = [''] * tab.shape[1]
for row, row_label in enumerate(row_labels):
    if row_label.endswith(':'):
        tab = np.insert(tab, row, blank_row, 0)

main_df = pd.DataFrame(tab, index = row_labels, columns = groups)
main_df

Unnamed: 0,All CHD,Isolated AoV,Noncomplex,Control
No. at Risk,3385,1960,1294,479765
"COPD Diagnoses, n (%)",294 (8.7),176 (9.0),106 (8.2),"14,854 (3.1)"
Age/Sex Adjusted:,,,,
HR,2.63,2.54,2.73,-
95% CI,"(2.34, 2.95)","(2.19, 2.94)","(2.25, 3.3)",-
P-Value,2.69e-60,1.46e-34,7.18e-25,-
Fully Adjusted:,,,,
HR,2.21,2.11,2.39,-
95% CI,"(1.97, 2.48)","(1.82, 2.45)","(1.97, 2.89)",-
P-Value,5.50e-41,6.93e-23,4.57e-19,-


In [36]:
#build pack year table
res = results['Pack Year Adjusted'].copy()
cols = res.columns.to_list()
# Put 'No. at Risk' first by name rather than by position, so the table does
# not depend on where that column sits in the results frame.
new_col_order = ['No. at Risk'] + [c for c in cols if c != 'No. at Risk']
# Transpose to measures-as-rows / groups-as-columns; '-' marks cells with no
# estimate (the Control column has no HR, CI or p-value).
res = res[new_col_order].T.fillna('-')
res

Unnamed: 0,All CHD,Isolated AoV,Noncomplex,Control
No. at Risk,2854,1633,1107,409222
"COPD Diagnoses, n (%)",257 (9.0),153 (9.4),92 (8.3),"13,122 (3.2)"
HR,2.04,1.89,2.34,-
95% CI,"(1.8, 2.31)","(1.61, 2.21)","(1.9, 2.87)",-
P-Value,1.30e-29,7.01e-15,5.58e-16,-
