In [48]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from scipy import stats
import itertools

In [51]:
#read processed UKB data
ukb_gen_f = 'ukb_general_info.tsv'
chd_f = 'chd_subgroups.tsv'
prev_diag_f = 'copd_related_flags.tsv'
bp_f = 'ukb_measured_bp.tsv'
copd_f = 'copd_event_dates.tsv'
spiro_f = 'ukb_fev_fvc.tsv'
py_f = 'ukb_pack_yr.tsv'

main = pd.read_csv(ukb_gen_f, sep='\t', index_col='eid')
main.drop(['ac1_date'], axis=1, inplace=True)

chd = pd.read_csv(chd_f, sep='\t', index_col='eid')
prev_diag = pd.read_csv(prev_diag_f, sep='\t', index_col='eid', usecols=['eid', 'asthma', 'hypertension'])
bp = pd.read_csv(bp_f, sep='\t', index_col='eid')
copd = pd.read_csv(copd_f, sep='\t', index_col='eid')
spiro = pd.read_csv(spiro_f, sep='\t', index_col='eid')

df = main.join([chd, prev_diag, bp, copd])
py = pd.read_csv(py_f, sep='\t', index_col='eid', usecols=['eid', 'pack_yr'])

In [52]:
#remove dont know/prefer not to answers
df = df[~df['ethnicity'].isin([-1, -3])]
df = df[~(df['smoking_status'].isin([-1, -3]))]

#convert ethnicity to binary white/non-white
df['ethnicity'] = np.where(df['ethnicity'].astype(str).str.startswith('1'), 1, 0)

#create combined hypertension variable using measured bp, self-report BP meds + EHR
df['measured_hypertension'] = np.where((df['systolic_bp'] >= 140) |
                                       (df['diastolic_bp'] >= 90), 1, 0)
df['hypertension_comb'] = np.fmax(df['hypertension'], df['measured_hypertension'])
df['hypertension_comb'] = np.fmax(df['hypertension_comb'], df['takes_bp_meds'])
df.drop(['measured_hypertension', 'systolic_bp', 'diastolic_bp', 'takes_bp_meds'], axis=1, inplace=True)

#copy smoking column for use in stratification
df['smoking'] = df['smoking_status'].copy()

#get smoking dummy variable
df = pd.get_dummies(df, columns=['smoking_status'], drop_first=False)
df.rename(columns={
    'smoking_status_0.0': 'smoking_status_0',
    'smoking_status_1.0': 'smoking_status_1',
    'smoking_status_2.0': 'smoking_status_2'}, 
          inplace=True)

#bin TDI for use in stratification
min_tdi = df['Townsend_DI'].min()
max_tdi = df['Townsend_DI'].max()
df['TDI_binned'] = pd.cut(df['Townsend_DI'], np.linspace(min_tdi, max_tdi, 6))

#calculate time to event for copd
start_date = pd.to_datetime('01-04-1997', dayfirst=True)
end_date = pd.to_datetime('31-03-2017', dayfirst=True)
study_dur = ((end_date - start_date).days + 1) / 365.25 

df['event_date'] = df['event_date'].astype('datetime64')
df['yrs_duration'] = (df['event_date'] - start_date).dt.days / 365.25

df['death_date'] = df['death_date'].astype('datetime64')
time_to_death = (df['death_date'] - start_date).dt.days / 365.25
df['yrs_duration'] = np.fmin(df['yrs_duration'], time_to_death)

df['yrs_duration'].fillna(study_dur, inplace=True)

cols = [col for col in df.columns if '_date' not in col]
df.dropna(inplace=True, subset=cols)
df.astype('object').to_csv('../results/copd_table_post_filtering.tsv', sep='\t')
df.drop(['event_date', 'death_date'], axis=1, inplace=True)

#create separate pack year df
pdf = df.join(py)
pdf['pack_yr'] = np.where(pdf['smoking'] == 0, 0, pdf['pack_yr'])

#bin pack year variable
bins=[0,10,20,30]
pdf['pack_yr_binned'] = pd.cut(pdf['pack_yr'], bins=bins)
pdf['pack_yr_binned'] = np.where(pdf['pack_yr'] == 0, 'never_smoke', pdf['pack_yr_binned'])
pdf['pack_yr_binned'] = np.where(pdf['pack_yr'] > 30, '>30', pdf['pack_yr_binned'])
pdf['pack_yr_binned'] = pdf['pack_yr_binned'].astype(str)

pdf.dropna(inplace=True)
pdf.astype('object').to_csv('../results/copd_py_table_post_filtering.tsv', sep='\t')

In [17]:
def get_median_iqr(data, group_mask, column, dp):
    x = data[group_mask][column]
    med = np.median(x)
    lq = np.percentile(x, 25)
    uq = np.percentile(x, 75)
    
    if dp != 0:
        med = round(med, dp)
        lq = round(lq, dp)
        uq = round(uq, dp)
    else:
        med = int(med)
        lq = int(lq)
        uq = int(uq)
        
    return f'{med:,} ({lq:,}, {uq:,})'

def get_n_pc(data, group_mask, column, val, dp=1):
    x = data[group_mask][column]
    n = (x == val).sum()
    group_size = len(x)
    pc = 100*n / group_size
    return f'{n:,} ({pc:.1f})'

In [18]:
masks = {
    'All CHD': df['chd'] == 1,
    'Isolated AoV': df['isolated_aov'] == 1,
    'Noncomplex': df['noncomplex_chd'] == 1,
    'Control': df['chd'] == 0
}

py_masks = {
    'All CHD': (pdf['chd'] == 1) & (pdf['smoking'] != 0),
    'Isolated AoV': (pdf['isolated_aov'] == 1) & (pdf['smoking'] != 0),
    'Noncomplex': (pdf['noncomplex_chd'] == 1) & (pdf['smoking'] != 0),
    'Control': (pdf['chd'] == 0) & (pdf['smoking'] != 0)
}

rows = [
    ['n'] + [f'{mask.sum():,}' for mask in masks.values()],
    #['n'] + [get_n_pc(df, mask, 'sex', sex_val) for mask, sex_val in itertools.product(groups[:,1], [1,0])],
    ['Age, median (IQR)'] + [get_median_iqr(df, mask, 'age_at_ac1', dp=0) for mask in masks.values()],
    ['Male sex, n (%)'] + [get_n_pc(df, mask, 'sex', 1) for mask in masks.values()],
    ['White, n (%)'] + [get_n_pc(df, mask, 'ethnicity', 1) for mask in masks.values()],
    ['TDI, median (IQR)'] + [get_median_iqr(df, mask, 'Townsend_DI', dp=2) for mask in masks.values()],
    ['BMI, median (IQR), kg/m2'] + [get_median_iqr(df, mask, 'bmi', dp=1) for mask in masks.values()],
    ['Smoking status, n (%)'] + ['' for mask in masks.values()],
    ['Current'] + [get_n_pc(df, mask, 'smoking', 2) for mask in masks.values()],
    ['Former'] + [get_n_pc(df, mask, 'smoking', 1) for mask in masks.values()],
    ['Never'] + [get_n_pc(df, mask, 'smoking', 0) for mask in masks.values()],
    ['Pack years of smoking, median (IQR)'] + [get_median_iqr(pdf, mask, 'pack_yr', dp=1) for mask in py_masks.values()],
    ['Other diagnoses, n (%)'] + ['' for mask in masks.values()],
    ['Asthma'] + [get_n_pc(df, mask, 'asthma', 1) for mask in masks.values()],
    ['Hypertension'] + [get_n_pc(df, mask, 'hypertension_comb', 1) for mask in masks.values()],
    ['Diabetes mellitus'] + [get_n_pc(df, mask, 'diabetes_comb', 1) for mask in masks.values()]
]

tab1 = pd.DataFrame(rows)
tab1 = tab1.set_index(0)
tab1.index.name = None
tab1.columns = masks.keys()
tab1

Unnamed: 0,All CHD,Isolated AoV,Noncomplex,Control
n,3385,1960,1294,479765
"Age, median (IQR)","59 (53, 63)","60 (55, 64)","58 (50, 63)","58 (50, 63)"
"Male sex, n (%)","1,945 (57.5)","1,253 (63.9)",626 (48.4),"216,791 (45.2)"
"White, n (%)","3,233 (95.5)","1,895 (96.7)","1,216 (94.0)","454,228 (94.7)"
"TDI, median (IQR)","-1.84 (-3.41, 1.07)","-1.89 (-3.47, 0.98)","-1.82 (-3.4, 1.11)","-2.16 (-3.65, 0.51)"
"BMI, median (IQR), kg/m2","27.6 (24.8, 31.1)","28.0 (25.2, 31.7)","27.1 (24.4, 30.4)","26.7 (24.1, 29.9)"
"Smoking status, n (%)",,,,
Current,377 (11.1),221 (11.3),138 (10.7),"50,573 (10.5)"
Former,"1,342 (39.6)",850 (43.4),451 (34.9),"165,412 (34.5)"
Never,"1,666 (49.2)",889 (45.4),705 (54.5),"263,780 (55.0)"


In [20]:
#split by sex
groups = np.array([
    ('CHD', df['chd'] == 1),
    # ('Isolated AoV', df['isolated_aov'] == 1),
    # ('Noncomplex', df['noncomplex_chd'] == 1),
    ('Control', df['chd'] == 0)])

py_groups = [
    ('CHD', (pdf['chd'] == 1) & (pdf['smoking'] != 0)),
    # ('Isolated AoV', (pdf['isolated_aov'] == 1) & (pdf['smoking'] != 0)),
    # ('Noncomplex', (pdf['noncomplex_chd'] == 1) & (pdf['smoking'] != 0)),
    ('Control', (pdf['chd'] == 0) & (pdf['smoking'] != 0))
]

sexes = [
    ('Male', df['sex'] == 1),
    ('Female', df['sex'] == 0)]

masks = {(group, sex): group_mask & sex_mask for group, group_mask in groups for sex, sex_mask in sexes}
py_masks = {(group, sex): group_mask & sex_mask for group, group_mask in py_groups for sex, sex_mask in sexes}

rows = [
    ['n'] + [get_n_pc(df, mask, 'sex', sex_val) for mask, sex_val in itertools.product(groups[:,1], [1,0])],
    ['Age, median (IQR)'] + [get_median_iqr(df, mask, 'age_at_ac1', dp=0) for mask in masks.values()],
    ['White, n (%)'] + [get_n_pc(df, mask, 'ethnicity', 1) for mask in masks.values()],
    ['TDI, median (IQR)'] + [get_median_iqr(df, mask, 'Townsend_DI', dp=2) for mask in masks.values()],
    ['BMI, median (IQR), kg/m2'] + [get_median_iqr(df, mask, 'bmi', dp=1) for mask in masks.values()],
    ['Smoking status, n (%)'] + ['' for mask in masks.values()],
    ['Current'] + [get_n_pc(df, mask, 'smoking', 2) for mask in masks.values()],
    ['Former'] + [get_n_pc(df, mask, 'smoking', 1) for mask in masks.values()],
    ['Never'] + [get_n_pc(df, mask, 'smoking', 0) for mask in masks.values()],
    ['Pack years of smoking, median (IQR)'] + [get_median_iqr(pdf, mask, 'pack_yr', dp=1) for mask in py_masks.values()],
    ['Other diagnoses, n (%)'] + ['' for mask in masks.values()],
    ['Asthma'] + [get_n_pc(df, mask, 'asthma', 1) for mask in masks.values()],
    ['Hypertension'] + [get_n_pc(df, mask, 'hypertension_comb', 1) for mask in masks.values()],
    ['Diabetes mellitus'] + [get_n_pc(df, mask, 'diabetes_comb', 1) for mask in masks.values()]
]



main_tab = pd.DataFrame(rows)
main_tab = main_tab.set_index(0)
main_tab.index.name = None
main_tab.columns = pd.MultiIndex.from_tuples(masks.keys())
main_tab

  groups = np.array([
  x = data[group_mask][column]


Unnamed: 0_level_0,CHD,CHD,Control,Control
Unnamed: 0_level_1,Male,Female,Male,Female
n,"1,945 (57.5)","1,440 (42.5)","216,791 (45.2)","262,974 (54.8)"
"Age, median (IQR)","60 (53, 64)","58 (52, 63)","58 (50, 63)","57 (50, 63)"
"White, n (%)","1,859 (95.6)","1,374 (95.4)","205,288 (94.7)","248,940 (94.7)"
"TDI, median (IQR)","-1.86 (-3.44, 1.11)","-1.84 (-3.39, 1.02)","-2.15 (-3.66, 0.58)","-2.16 (-3.65, 0.44)"
"BMI, median (IQR), kg/m2","27.8 (25.4, 30.9)","27.2 (24.0, 31.2)","27.3 (25.0, 30.0)","26.1 (23.4, 29.7)"
"Smoking status, n (%)",,,,
Current,238 (12.2),139 (9.7),"27,154 (12.5)","23,419 (8.9)"
Former,856 (44.0),486 (33.8),"82,877 (38.2)","82,535 (31.4)"
Never,851 (43.8),815 (56.6),"106,760 (49.2)","157,020 (59.7)"
"Pack years of smoking, median (IQR)","25.0 (13.0, 40.4)","19.0 (9.8, 33.9)","21.0 (11.0, 35.0)","16.5 (8.5, 28.1)"


In [21]:
#split by sex
groups = np.array([
    # ('All CHD', df['chd'] == 1),
    ('Isolated AoV', df['isolated_aov'] == 1),
    ('Noncomplex', df['noncomplex_chd'] == 1),
    ('Control', df['chd'] == 0)])

py_groups = [
    # ('All CHD', (pdf['chd'] == 1) & (pdf['smoking'] != 0)),
    ('Isolated AoV', (pdf['isolated_aov'] == 1) & (pdf['smoking'] != 0)),
    ('Noncomplex', (pdf['noncomplex_chd'] == 1) & (pdf['smoking'] != 0)),
    ('Control', (pdf['chd'] == 0) & (pdf['smoking'] != 0))
]

sexes = [
    ('Male', df['sex'] == 1),
    ('Female', df['sex'] == 0)]

masks = {(group, sex): group_mask & sex_mask for group, group_mask in groups for sex, sex_mask in sexes}
py_masks = {(group, sex): group_mask & sex_mask for group, group_mask in py_groups for sex, sex_mask in sexes}

rows = [
    ['n'] + [get_n_pc(df, mask, 'sex', sex_val) for mask, sex_val in itertools.product(groups[:,1], [1,0])],
    ['Age, median (IQR)'] + [get_median_iqr(df, mask, 'age_at_ac1', dp=0) for mask in masks.values()],
    ['White, n (%)'] + [get_n_pc(df, mask, 'ethnicity', 1) for mask in masks.values()],
    ['TDI, median (IQR)'] + [get_median_iqr(df, mask, 'Townsend_DI', dp=2) for mask in masks.values()],
    ['BMI, median (IQR), kg/m2'] + [get_median_iqr(df, mask, 'bmi', dp=1) for mask in masks.values()],
    ['Smoking status, n (%)'] + ['' for mask in masks.values()],
    ['Current'] + [get_n_pc(df, mask, 'smoking', 2) for mask in masks.values()],
    ['Former'] + [get_n_pc(df, mask, 'smoking', 1) for mask in masks.values()],
    ['Never'] + [get_n_pc(df, mask, 'smoking', 0) for mask in masks.values()],
    ['Pack years of smoking, median (IQR)'] + [get_median_iqr(pdf, mask, 'pack_yr', dp=1) for mask in py_masks.values()],
    ['Other diagnoses, n (%)'] + ['' for mask in masks.values()],
    ['Asthma'] + [get_n_pc(df, mask, 'asthma', 1) for mask in masks.values()],
    ['Hypertension'] + [get_n_pc(df, mask, 'hypertension_comb', 1) for mask in masks.values()],
    ['Diabetes mellitus'] + [get_n_pc(df, mask, 'diabetes_comb', 1) for mask in masks.values()]
]

sub_tab = pd.DataFrame(rows)
sub_tab = sub_tab.set_index(0)
sub_tab.index.name = None
sub_tab.columns = pd.MultiIndex.from_tuples(masks.keys())
sub_tab

  groups = np.array([
  x = data[group_mask][column]


Unnamed: 0_level_0,Isolated AoV,Isolated AoV,Noncomplex,Noncomplex,Control,Control
Unnamed: 0_level_1,Male,Female,Male,Female,Male,Female
n,"1,253 (63.9)",707 (36.1),626 (48.4),668 (51.6),"216,791 (45.2)","262,974 (54.8)"
"Age, median (IQR)","60 (55, 64)","59 (54, 63)","59 (50, 64)","57 (50, 63)","58 (50, 63)","57 (50, 63)"
"White, n (%)","1,215 (97.0)",680 (96.2),585 (93.5),631 (94.5),"205,288 (94.7)","248,940 (94.7)"
"TDI, median (IQR)","-1.99 (-3.56, 0.79)","-1.69 (-3.33, 1.16)","-1.61 (-3.3, 1.39)","-1.91 (-3.44, 0.8)","-2.15 (-3.66, 0.58)","-2.16 (-3.65, 0.44)"
"BMI, median (IQR), kg/m2","28.1 (25.6, 31.5)","27.9 (24.4, 32.1)","27.6 (25.1, 30.3)","26.7 (23.8, 30.5)","27.3 (25.0, 30.0)","26.1 (23.4, 29.7)"
"Smoking status, n (%)",,,,,,
Current,158 (12.6),63 (8.9),70 (11.2),68 (10.2),"27,154 (12.5)","23,419 (8.9)"
Former,590 (47.1),260 (36.8),242 (38.7),209 (31.3),"82,877 (38.2)","82,535 (31.4)"
Never,505 (40.3),384 (54.3),314 (50.2),391 (58.5),"106,760 (49.2)","157,020 (59.7)"
"Pack years of smoking, median (IQR)","25.9 (13.5, 41.0)","21.5 (10.1, 36.0)","24.0 (12.2, 38.6)","18.4 (9.4, 30.9)","21.0 (11.0, 35.0)","16.5 (8.5, 28.1)"


In [42]:
def do_fisher_exact(data, case_mask, control_mask, val, column):
    case_x = data[case_mask][column]
    control_x = data[control_mask][column]
    arr = [
        [(case_x == val).sum(), (case_x != val).sum()],
         [(control_x == val).sum(), (control_x != val).sum()]
          ]
    _, p = stats.fisher_exact(arr)
    return p

def do_mann_whitney(data, case_mask, control_mask, column):
    case_x = data[case_mask][column]
    control_x = data[control_mask][column]
    _, p = stats.mannwhitneyu(case_x, control_x)
    return p

case_masks = [
    df['chd'] == 1,
    df['isolated_aov'] == 1,
    df['noncomplex_chd'] == 1
]

py_case_masks = [
    (pdf['chd'] == 1) & (pdf['smoking'] != 0),
    (pdf['isolated_aov'] == 1) & (pdf['smoking'] != 0),
    (pdf['noncomplex_chd'] == 1) & (pdf['smoking'] != 0),
]

control_mask = df['chd'] == 0
py_control = (pdf['chd'] == 0) & (pdf['smoking'] != 0)

rows = [
    ['n'] + [np.nan for mask in case_masks],
    ['Age, median (IQR)'] + [do_mann_whitney(df, mask, control_mask, 'age_at_ac1') for mask in case_masks],
    ['White, n (%)'] + [do_fisher_exact(df, mask, control_mask, 1, 'ethnicity') for mask in case_masks],
    ['TDI, median (IQR)'] + [do_mann_whitney(df, mask, control_mask, 'Townsend_DI') for mask in case_masks],
    ['BMI, median (IQR), kg/m2'] + [do_mann_whitney(df, mask, control_mask, 'bmi') for mask in case_masks],
    ['Smoking status, n (%)'] + [np.nan for mask in case_masks],
    ['Current'] + [do_fisher_exact(df, mask, control_mask, 2, 'smoking') for mask in case_masks],
    ['Former'] + [do_fisher_exact(df, mask, control_mask, 1, 'smoking') for mask in case_masks],
    ['Never'] + [do_fisher_exact(df, mask, control_mask, 0, 'smoking') for mask in case_masks],
    ['Pack years of smoking, median (IQR)'] + [do_mann_whitney(pdf, mask, py_control, 'pack_yr') for mask in py_case_masks],
    ['Other diagnoses, n (%)'] + [np.nan for mask in case_masks],
    ['Asthma'] + [do_fisher_exact(df, mask, control_mask, 1, 'asthma') for mask in case_masks],
    ['Hypertension'] + [do_fisher_exact(df, mask, control_mask, 1, 'hypertension_comb') for mask in case_masks],
    ['Diabetes mellitus'] + [do_fisher_exact(df, mask, control_mask, 1, 'diabetes_comb') for mask in case_masks]
]

pvals = pd.DataFrame(rows)
pvals = pvals.set_index(0)
pvals.index.name = None
pvals.columns = ['All CHD', 'Isolated AoV', 'Noncomplex']
pvals

Unnamed: 0,All CHD,Isolated AoV,Noncomplex
n,,,
"Age, median (IQR)",4.078463e-18,2.060465e-31,0.9569059
"White, n (%)",0.03126792,3.329571e-05,0.2639796
"TDI, median (IQR)",3.973153e-10,6.602808e-05,2.335702e-05
"BMI, median (IQR), kg/m2",2.439022e-28,1.984077e-36,0.001756271
"Smoking status, n (%)",,,
Current,0.2613127,0.2853236,0.8918057
Former,4.715261e-10,4.771214e-16,0.792133
Never,2.127011e-11,1.7515380000000002e-17,0.7371038
"Pack years of smoking, median (IQR)",1.253853e-13,6.70632e-14,0.02324482


In [None]:
#p value table for interactions
import statsmodels.formula.api as smf
import statsmodels.api as sm

def test_chd_sex_interaction(data, col, chd_col, regtype):
    form = f'{col} ~ sex * {chd_col}'
    
    if regtype == 'linear':
        mod = smf.ols(formula=form, data=data)
    
    if regtype == 'logistic':
        mod = smf.glm(formula = form, data=data, family=sm.families.Binomial())
        
    res = mod.fit()
    p = res.pvalues[f'sex:{chd_col}']
    stat = res.params[f'sex:{chd_col}']
    return p

chd_cols = {
    'All CHD': 'chd',
    'Isolated AoV': 'isolated_aov',
    'Noncomplex': 'noncomplex_chd'
}
rows = [
    ['n'] + [np.nan for col in chd_cols.values()],
    ['Age, median (IQR)'] + [test_chd_sex_interaction(df, 'age_at_ac1', chd_col, 'linear') for chd_col in chd_cols.values()],
    ['White, n (%)'] + [test_chd_sex_interaction(df, 'ethnicity', chd_col, 'logistic') for chd_col in chd_cols.values()],
    ['TDI, median (IQR)'] + [test_chd_sex_interaction(df, 'Townsend_DI', chd_col, 'linear') for chd_col in chd_cols.values()],
    ['BMI, median (IQR), kg/m2'] + [test_chd_sex_interaction(df, 'bmi', chd_col, 'linear') for chd_col in chd_cols.values()],
    ['Smoking status, n (%)'] + [np.nan for col in chd_cols.values()],
    ['Current'] + [test_chd_sex_interaction(df, 'smoking_status_2', chd_col, 'logistic') for chd_col in chd_cols.values()],
    ['Former'] + [test_chd_sex_interaction(df, 'smoking_status_1', chd_col, 'logistic') for chd_col in chd_cols.values()],
    ['Never'] + [test_chd_sex_interaction(df, 'smoking_status_0', chd_col, 'logistic') for chd_col in chd_cols.values()],
    ['Pack years of smoking, median (IQR)'] + [test_chd_sex_interaction(pdf, 'pack_yr', chd_col, 'linear') for chd_col in chd_cols.values()],
    ['Other diagnoses, n (%)'] + [np.nan for col in chd_cols.values()],
    ['Asthma'] + [test_chd_sex_interaction(df, 'asthma', chd_col, 'logistic') for chd_col in chd_cols.values()],
    ['Hypertension'] + [test_chd_sex_interaction(df, 'hypertension_comb', chd_col, 'logistic') for chd_col in chd_cols.values()],
    ['Diabetes mellitus'] + [test_chd_sex_interaction(df, 'hypertension_comb', chd_col, 'logistic') for chd_col in chd_cols.values()]
]
    
int_pvals = pd.DataFrame(rows)
int_pvals = int_pvals.set_index(0)
int_pvals.index.name = None
int_pvals.columns = ['All CHD', 'Isolated AoV', 'Noncomplex']
int_pvals

Unnamed: 0,All CHD,Isolated AoV,Noncomplex
n,,,
"Age, median (IQR)",0.007608,0.227395,0.201822
"White, n (%)",0.851038,0.362474,0.427721
"TDI, median (IQR)",0.898592,0.064131,0.130699
"BMI, median (IQR), kg/m2",0.006966,0.000299,0.103078
"Smoking status, n (%)",,,
Current,0.308238,0.96055,0.125341
Former,0.069253,0.203476,0.856718
Never,0.185892,0.134212,0.439581
"Pack years of smoking, median (IQR)",0.092656,0.153968,0.874554


In [33]:
def format_pvals(p):
    if p < 0.01:
        return np.format_float_scientific(p, 2)
    else:
        return str(round(p, 2))

#make main table
all_chd_p = pd.DataFrame(pvals['All CHD'])
all_chd_p['All CHD'] = all_chd_p['All CHD'].apply(format_pvals)
all_chd_p.columns = pd.MultiIndex.from_tuples([('', 'P-value')])
main_tab_final = pd.concat([main_tab, all_chd_p], axis=1)
main_tab_final

Unnamed: 0_level_0,CHD,CHD,Control,Control,Unnamed: 5_level_0
Unnamed: 0_level_1,Male,Female,Male,Female,P-value
n,"1,945 (57.5)","1,440 (42.5)","216,791 (45.2)","262,974 (54.8)",
"Age, median (IQR)","60 (53, 64)","58 (52, 63)","58 (50, 63)","57 (50, 63)",4.08e-18
"White, n (%)","1,859 (95.6)","1,374 (95.4)","205,288 (94.7)","248,940 (94.7)",0.03
"TDI, median (IQR)","-1.86 (-3.44, 1.11)","-1.84 (-3.39, 1.02)","-2.15 (-3.66, 0.58)","-2.16 (-3.65, 0.44)",3.97e-10
"BMI, median (IQR), kg/m2","27.8 (25.4, 30.9)","27.2 (24.0, 31.2)","27.3 (25.0, 30.0)","26.1 (23.4, 29.7)",2.44e-28
"Smoking status, n (%)",,,,,
Current,238 (12.2),139 (9.7),"27,154 (12.5)","23,419 (8.9)",0.26
Former,856 (44.0),486 (33.8),"82,877 (38.2)","82,535 (31.4)",4.72e-10
Never,851 (43.8),815 (56.6),"106,760 (49.2)","157,020 (59.7)",2.13e-11
"Pack years of smoking, median (IQR)","25.0 (13.0, 40.4)","19.0 (9.8, 33.9)","21.0 (11.0, 35.0)","16.5 (8.5, 28.1)",1.25e-13


In [35]:
#make subgroup table
subgroups = ['Isolated AoV', 'Noncomplex']
sub_p = pvals[subgroups]
for group in subgroups:
    sub_p[group] = sub_p[group].apply(format_pvals)
sub_p.columns = pd.MultiIndex.from_tuples([(group, 'P-value') for group in subgroups])
sub_tab_final = pd.concat([sub_tab, sub_p], axis=1)
sub_tab_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_p[group] = sub_p[group].apply(format_pvals)


Unnamed: 0_level_0,Isolated AoV,Isolated AoV,Noncomplex,Noncomplex,Control,Control,Isolated AoV,Noncomplex
Unnamed: 0_level_1,Male,Female,Male,Female,Male,Female,P-value,P-value
n,"1,253 (63.9)",707 (36.1),626 (48.4),668 (51.6),"216,791 (45.2)","262,974 (54.8)",,
"Age, median (IQR)","60 (55, 64)","59 (54, 63)","59 (50, 64)","57 (50, 63)","58 (50, 63)","57 (50, 63)",2.06e-31,0.96
"White, n (%)","1,215 (97.0)",680 (96.2),585 (93.5),631 (94.5),"205,288 (94.7)","248,940 (94.7)",3.33e-05,0.26
"TDI, median (IQR)","-1.99 (-3.56, 0.79)","-1.69 (-3.33, 1.16)","-1.61 (-3.3, 1.39)","-1.91 (-3.44, 0.8)","-2.15 (-3.66, 0.58)","-2.16 (-3.65, 0.44)",6.6e-05,2.34e-05
"BMI, median (IQR), kg/m2","28.1 (25.6, 31.5)","27.9 (24.4, 32.1)","27.6 (25.1, 30.3)","26.7 (23.8, 30.5)","27.3 (25.0, 30.0)","26.1 (23.4, 29.7)",1.98e-36,0.00176
"Smoking status, n (%)",,,,,,,,
Current,158 (12.6),63 (8.9),70 (11.2),68 (10.2),"27,154 (12.5)","23,419 (8.9)",0.29,0.89
Former,590 (47.1),260 (36.8),242 (38.7),209 (31.3),"82,877 (38.2)","82,535 (31.4)",4.77e-16,0.79
Never,505 (40.3),384 (54.3),314 (50.2),391 (58.5),"106,760 (49.2)","157,020 (59.7)",1.75e-17,0.74
"Pack years of smoking, median (IQR)","25.9 (13.5, 41.0)","21.5 (10.1, 36.0)","24.0 (12.2, 38.6)","18.4 (9.4, 30.9)","21.0 (11.0, 35.0)","16.5 (8.5, 28.1)",6.71e-14,0.02
