In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import os
import pandas as pd
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None 

# Make the modules under ../src/ importable; this must run before `from utils import ...` below.
import sys 
sys.path.append('../src/')

# Hide every FutureWarning for the whole notebook.
# NOTE(review): this also hides deprecation notices from pandas and other libraries.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Project helper; presumably restricts PyTorch to GPU '0' -- confirm against utils.restrict_GPU_pytorch.
from utils import restrict_GPU_pytorch
restrict_GPU_pytorch('0')

# Seed NumPy's global RNG (used later by np.random.choice when the 'mini' subsample is enabled).
np.random.seed(0)

### Load extracted EDW data

In [None]:
# Anticoagulant treatment: a patient is flagged as treated when an order timestamp is present.
anticoag_treatment = pd.read_csv('../extracted_EDW_data/trt_antic_ecglst.csv')
anticoag_treatment['anticoag_treatment'] = anticoag_treatment['OrderDTS'].notna()
anticoag_treatment = anticoag_treatment[['UniqueID', 'anticoag_treatment']]

# Hospitalizations: keep admissions that occur after ecg_date_lst, then the earliest one per patient.
hospitalization = pd.read_csv('../extracted_EDW_data/hospital_vists.csv')
for dts_col in ('HospitalAdmitDTS', 'ecg_date_lst'):
    hospitalization[dts_col] = pd.to_datetime(hospitalization[dts_col])
admitted_after_ecg = hospitalization['HospitalAdmitDTS'] > hospitalization['ecg_date_lst']
hospitalization = (
    hospitalization[admitted_after_ecg]
    .sort_values('HospitalAdmitDTS')
    .groupby('UniqueID')
    .first()
    .reset_index()
)

# Remaining EDW extracts: read each file and keep only the ID plus the columns used downstream.
specialist_visit = pd.read_csv('../extracted_EDW_data/specvis_lst.csv')[['UniqueID', 'spec_vis']]
insurance = pd.read_csv('../extracted_EDW_data/ins_ecglst.csv')[['UniqueID', 'instype_final']]
est_care = pd.read_csv('../extracted_EDW_data/estcare_ecglst.csv')[
    ['UniqueID', 'PCPvisits_bin', 'CARvisits_bin', 'OTHvisits_bin']
]
rate = pd.read_csv('../extracted_EDW_data/rate_ecglst.csv')[['UniqueID', 'trt_rate']]
rhythm = pd.read_csv('../extracted_EDW_data/rhythm_ecglst.csv')[['UniqueID', 'trt_rhythm']]
stroke = pd.read_csv('../extracted_EDW_data/stroke_ecglst.csv')[['UniqueID', 'stroke']]

# IDs flagged patid_found == 1 in the "Missing Patients" file.
exists_in_new_system = pd.read_csv('../extracted_EDW_data/Missing Patients EDW RZ.csv')
found_in_new_system = exists_in_new_system['patid_found'] == 1
unique_ids_in_new_system = exists_in_new_system.loc[found_in_new_system, 'UniqueID'].unique()

print("Treatment rate with rate control medications:", rate['trt_rate'].mean())
print("Treatment rate with rhythm control medications:", rhythm['trt_rhythm'].mean())
print("Mean stroke:", stroke['stroke'].mean())


### Load AF dataset and combine with EDW data

In [None]:
from paths import map_params_to_filename
from ecg_datasets import ECGDemographicsDataset, ECGDataset
from ecg_preprocessing_fs import create_name_dob_hash
import pandas as pd 

outcome = 'afib'
merge_with_EDW_vars = True
# These four parameters determine the processed-CSV filename read below, so the extra
# keys are only added to the dict after the file has been loaded.
preprocessing_params = {'max_pred_gap': 90,
                        'selection_criteria': 'va',
                        'include_single_ecgs': True,
                        'mini': False}
unique_id_col = 'UniqueID'


ecg_data = pd.read_csv('./processed_data/processed_afib_' + map_params_to_filename(preprocessing_params) + '.csv')
# Assign the filled column back to the frame. Calling fillna(..., inplace=True) on the
# selected column is chained assignment: it is deprecated in pandas 2.x and leaves the
# frame unchanged under Copy-on-Write, and the warning is hidden by the FutureWarning
# filter set earlier in this notebook.
ecg_data['PatientFirstName'] = ecg_data['PatientFirstName'].fillna('nan')
ecg_data[unique_id_col] = create_name_dob_hash(ecg_data, 'PatientFirstName', 'PatientLastName', 'DateofBirth')

print("# of (Patients, ECGs) before merging with map to PatientID: ", ecg_data['UniqueID'].nunique(), len(ecg_data))

preprocessing_params['one_ecg_per_patient'] = 'last' # can be 'false', 'last', 'first', 'last_white', 'last_two_ecgs'
preprocessing_params['loss'] = 'CE' # can be CE or Focal
preprocessing_params['mini'] = False # This refers to subsampling the train set; the earlier setting refers to subsampling all of MUSE. Yes, this is not ideal..

# Preprocessed ECG files that are excluded from the sample.
files_to_skip = [
    '/data/workspace/ekg_bwr_trunc_norm/003161595_08-05-2019_15-20-53_SCD10410491PA05082019152053.npy',
    '/data/workspace/ekg_bwr_trunc_norm/000122100_09-21-2019_01-02-14_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/006363943_05-24-2018_18-00-22_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/010464725_05-24-2019_15-17-48_SCD12365371PA24052019151748.npy',
    '/data/workspace/ekg_bwr_trunc_norm/000951233_08-12-2019_15-28-53_SKJ14029684PA12082019152853.npy',
    '/data/workspace/ekg_bwr_trunc_norm/003145657_08-14-2019_11-01-13_SKJ13388441SA14082019110113.npy',
    '/data/workspace/ekg_bwr_trunc_norm/035737121_08-13-2017_18-51-36_SKJ13408672SA13082017185136.npy',
    '/data/workspace/ekg_bwr_trunc_norm/003156598_11-22-2019_16-12-28_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/006619841_05-07-2019_08-28-52_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/006619841_05-11-2019_03-20-46_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/006619841_05-06-2019_23-35-11_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/001410670_04-19-2016_14-07-26_SCD06223397PA19042016140726.npy',
    '/data/workspace/ekg_bwr_trunc_norm/002105344_08-10-2018_16-05-48_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/002105344_08-10-2018_16-05-48_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/002063599_01-26-2017_10-56-00_SCD07047035PA26012017105600.npy',
    '/data/workspace/ekg_bwr_trunc_norm/001410670_04-19-2016_14-07-26_SCD06223397PA19042016140726.npy',
    '/data/workspace/ekg_bwr_trunc_norm/006619841_05-06-2019_23-35-11_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/006619841_05-07-2019_01-21-07_none.npy',
    '/data/workspace/ekg_bwr_trunc_norm/002120566_12-19-2018_08-39-41_SKJ14080390PA19122018083941.npy',
    '/data/workspace/ekg_bwr_trunc_norm/002021971_05-14-2016_09-49-49_none.npy',
]

# Drop the excluded recordings.
in_skip_list = ecg_data['path_to_bwr_trunc_norm_data'].isin(files_to_skip)
ecg_data = ecg_data[~in_skip_list]

# Parse the ECG date and date of birth, then derive age as a timedelta, in years,
# and in years divided by 100.
ecg_data = ecg_data.assign(
    ecg_date=pd.to_datetime(ecg_data[['year', 'month', 'day']]),
    DateofBirth=pd.to_datetime(ecg_data['DateofBirth']),
)
ecg_data = ecg_data.assign(
    PatientAge=lambda frame: frame['ecg_date'] - frame['DateofBirth'],
    PatientAge_years=lambda frame: frame['PatientAge'].dt.days / 365.2425,
    PatientAge_years_01=lambda frame: frame['PatientAge_years'] / 100,
)

# Remove columns that are no longer needed; PatientID is merged back in from the MUSE-EDW map.
ecg_data = ecg_data.drop(
    columns=['Unnamed: 0.1', 'Unnamed: 0', 'PatientID', 'year', 'month', 'day', 'muse_mrn']
)


print("# of (Patients, ECGs) before merging with map to PatientID: ", ecg_data['UniqueID'].nunique(), len(ecg_data))
# Restrict to patients present in the MUSE-EDW map and attach their PatientID.
muse_edw_map = pd.read_csv('muse_edw_map.csv', dtype='str')
in_edw_map = ecg_data['UniqueID'].isin(muse_edw_map['UniqueID'])
ecg_data = ecg_data[in_edw_map].merge(muse_edw_map[['UniqueID', 'PatientID']], on='UniqueID')
print("# of (Patients, ECGs) after merging with map to PatientID: ", ecg_data['UniqueID'].nunique(), len(ecg_data))


# Demographics based on Brianna's pull
demographics_file = pd.read_csv('~/Documents/demographics_with_diagnosis_info.csv')
demographics_file['earliest_diagnosis'] = pd.to_datetime(demographics_file['earliest_diagnosis'])

# Split at the patient level: 60% train, then the held-out 40% is halved into val and test.
# The split is made on the sorted ID list before any demographic / EDW filtering.
test_set_size = .4
random_state = 0
preprocessing_params['test_set_size'] = test_set_size
# preprocessing_params['random_state'] = random_state
pids = sorted(set(ecg_data[unique_id_col]))
train_ids, heldout_ids = train_test_split(
    pids, test_size=preprocessing_params['test_set_size'], random_state=random_state
)
val_ids, test_ids = train_test_split(heldout_ids, test_size=.5, random_state=random_state)

if merge_with_EDW_vars:
    # Keep only patients that have a demographics record, then attach it.
    has_demographics = ecg_data['UniqueID'].isin(demographics_file['UniqueID'])
    ecg_data = ecg_data[has_demographics]
    print("# of (Patients, ECGs) after demographics merge: ", ecg_data['UniqueID'].nunique(), len(ecg_data))
    ecg_data = ecg_data.merge(demographics_file, on='UniqueID')

    print("# of Patients in sample matched to some diagnosis: ", ecg_data['diagnosis_in_charts'].sum())
    # Keep rows where the diagnosis occurs AFTER the ECG, or where there is no diagnosis in the charts.
    days_from_ecg_to_diagnosis = (ecg_data['earliest_diagnosis'] - ecg_data['ecg_date']).dt.days
    no_chart_diagnosis = ~ecg_data['diagnosis_in_charts']
    ecg_data = ecg_data[no_chart_diagnosis | (days_from_ecg_to_diagnosis > 0)]
    print("# of (Patients, ECGs) after filtering out established AFib diagnoses: ", ecg_data['UniqueID'].nunique(), len(ecg_data))
    ecg_data['time_to_diagnosis'] = (ecg_data['earliest_diagnosis'] - ecg_data['ecg_date']).dt.days

    # One boolean indicator column per race/ethnicity value, plus one for male sex.
    race_values = ('White', 'Hispanic or Latino', 'Black or African American', 'Asian', 'Other',
                   'Declined or Unavailable', 'Native American or Pacific Islander')
    for race_val in race_values:
        ecg_data[f'binary_{race_val}'] = ecg_data['PatientRaceFinal'].eq(race_val)
    ecg_data['binary_Male'] = ecg_data['SexDSC'].eq('Male')
    ecg_data = ecg_data.drop(
        columns=['binary_Race_CAUCASIAN', 'binary_Race_HISPANIC', 'binary_Race_BLACK', 'binary_Race_HISPANIC']
    )

    # Add indicators for downstream outcomes - treatment with anticoag for now
    ecg_data = ecg_data.merge(anticoag_treatment, on='UniqueID') # 13% of patients go on to have anticoag treatment
    ecg_data['mortality'] = ecg_data['DeathDTS'].notna() # 12% of patients die
    ecg_data['hospitalization'] = ecg_data['UniqueID'].isin(hospitalization['UniqueID'])
    # Remaining EDW tables are inner-joined on UniqueID, in this order.
    for edw_table in (specialist_visit, rate, rhythm, stroke, est_care, insurance):
        ecg_data = ecg_data.merge(edw_table, on='UniqueID')
print("\n# of Patients in  sample: ", ecg_data['UniqueID'].nunique())
print("# of ECGs in sample: ", len(ecg_data))

# Parse the death timestamp and the 'date' column, then flag deaths recorded fewer than
# 365 days after 'date'.
ecg_data['DeathDTS'] = pd.to_datetime(ecg_data['DeathDTS']) 
ecg_data['date'] = pd.to_datetime(ecg_data['date']) 
ecg_data['mortality_within_one_year'] =  (ecg_data['DeathDTS'] - ecg_data['date']).dt.days < 365

# Remove UniqueIDs that are associated with both the positive and the negative class;
# this happens because of middle names.
uniqueids_positive_and_negative = ecg_data.groupby('UniqueID')['label'].nunique()
repeated_unique_ids_across_class = uniqueids_positive_and_negative[uniqueids_positive_and_negative > 1].index.values
ecg_data = ecg_data[~ecg_data['UniqueID'].isin(repeated_unique_ids_across_class)]

if preprocessing_params['one_ecg_per_patient'] != 'false':
    # Sort by ECG date (ascending, the sort_values default), then select one row per
    # UniqueID: the earliest for settings starting with 'first', the latest otherwise.
    # NOTE(review): GroupBy.first()/last() return the first/last NON-NULL value of each
    # column, so a selected row can combine values from different ECGs when a column has
    # missing entries -- confirm this is acceptable, or that per-ECG columns have no NaNs.
    ecg_data = ecg_data.sort_values('ecg_date')
    if preprocessing_params['one_ecg_per_patient'].startswith('first'):
        ecg_data = ecg_data.groupby('UniqueID').first().reset_index()
    else:
        ecg_data = ecg_data.groupby('UniqueID').last().reset_index()
print(ecg_data['label'].mean(), preprocessing_params)
print("\n# of Patients in  sample: ", ecg_data['UniqueID'].nunique())
print("# of ECGs in sample: ", len(ecg_data))

# Collapse primary language to two levels: anything other than "English" (including
# missing values) becomes "Non-English".
ecg_data['PrimLangDSC'] = ecg_data['PrimLangDSC'].map(lambda x: x if x == "English" else "Non-English")


### Split dataset into train, calibration, and study sample.

In [None]:
# DataLoader is needed for the loaders built at the end of this cell and is not
# brought into scope by any other import in the notebook.
from torch.utils.data import DataLoader

unique_id_col = 'UniqueID'
# Demographic columns passed to the model alongside the ECG waveform.
additional_feat_names = ['binary_Black or African American', 'binary_Hispanic or Latino',
                         'binary_Declined or Unavailable','binary_Asian',  'binary_Other',
                         'binary_Native American or Pacific Islander',
                         'binary_Male',
                         'PatientAge_years_01']
# additional_feat_names = []


batch_size = 8
num_workers = 6
if preprocessing_params['mini']:
    # Optional 5% subsample of the training patients.
    train_ids = np.random.choice(train_ids, size=int(.05*len(train_ids)), replace=False)

split_dfs = []
split_paths = []
split_y = []
additional_feats = []
# i == 0: train, i == 1: validation, i == 2: test.
for i, pid_set in enumerate([train_ids, val_ids, test_ids]):
    split_df = ecg_data[ecg_data[unique_id_col].isin(pid_set)]
    split_df.reset_index(drop=True, inplace=True)

    if i in [0,1] and preprocessing_params['one_ecg_per_patient'] == 'last_white':
        split_df = split_df[split_df['PatientRaceFinal'] == 'White']

    if i in [0,1] and preprocessing_params['one_ecg_per_patient'] == 'last_two_ecgs':
        # NOTE(review): pids_to_keep_under_two_ecg_constraint is not defined in any cell
        # of this notebook; this branch raises NameError if 'last_two_ecgs' is selected.
        split_df = split_df[split_df['UniqueID'].isin(pids_to_keep_under_two_ecg_constraint)]

    if i > 0:
        # Make sure we're evaluating each patient with only one ECG
        split_df = split_df.sample(frac=1, random_state=0)
        split_df = split_df.groupby(unique_id_col).first().reset_index() # Replace with a random ECG

        # Keep only patients flagged as found (patid_found == 1) in the new system.
        split_df = split_df[split_df[unique_id_col].isin(unique_ids_in_new_system)]
    split_paths.append(list(split_df['path_to_bwr_trunc_norm_data']))
    split_y.append(np.array(list(split_df['label'])))
    additional_feat_values = split_df[additional_feat_names].fillna(0).values
    split_dfs.append(split_df)
    # NOTE(review): astype(int) truncates PatientAge_years_01 (age in years / 100) to 0
    # for every patient younger than 100 -- confirm this matches how the model was trained.
    additional_feats.append(additional_feat_values.astype(int))

train_ecg_paths, val_ecg_paths, test_ecg_paths = split_paths
train_additional_feats, val_additional_feats, test_additional_feats = additional_feats
train_y, val_y, test_y = split_y

if len(additional_feat_names) == 0:
    train_dataset = ECGDataset(train_ecg_paths, train_y)
    val_dataset = ECGDataset(val_ecg_paths, val_y)
    test_dataset = ECGDataset(test_ecg_paths, test_y)
else:
    train_dataset = ECGDemographicsDataset(train_ecg_paths,  train_additional_feats, train_y)
    val_dataset = ECGDemographicsDataset(val_ecg_paths, val_additional_feats, val_y)
    test_dataset = ECGDemographicsDataset(test_ecg_paths, test_additional_feats, test_y)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=False, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, pin_memory=False, shuffle=False, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=False, shuffle=False, num_workers=num_workers)
print(len(train_loader.dataset), len(val_loader.dataset), len(test_loader.dataset))

expt_config={'arch': 'Net1D','additional_features': False}
if len(additional_feat_names) > 0:
    expt_config['additional_features'] = True

# Positive-label rate in each split (displayed as the cell output).
train_y.mean(), val_y.mean(), test_y.mean()


In [6]:
# From here on, ecg_data is the concatenation of the three per-split frames built above
# (i.e. after the one-ECG-per-patient and new-system filters applied to val/test).
ecg_data = pd.concat(split_dfs)

# Redo splits, as is done in train_models.ipynb
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
# NOTE(review): instype_map, scaler and age_thresh are not referenced by any other cell
# visible in this notebook.
instype_map = {'Medicaid': 'Medicaid', 'Unknown/Missing': 'Unknown/Missing', 'Commercial': 'Commercial', 'Dual': 'Medicare', 'Medicare':'Medicare','Other': 'Other' }

scaler = StandardScaler()

age_thresh = 17
val_split_df = split_dfs[1]
test_split_df = split_dfs[2]
train_split_df = split_dfs[0]

# Pool validation and test rows and shuffle them with a fixed seed.
combined_df = pd.concat([val_split_df, test_split_df], ignore_index=True)
shuffled_df = shuffle(combined_df, random_state=42)

# Split back into validation and test sets
# The first 62.5% of the shuffled rows become the new validation split; the rest become test.
split_ratio = 0.625
split_point = int(len(shuffled_df) * split_ratio)

# Create new validation and test splits
val_split_df = shuffled_df.iloc[:split_point].reset_index(drop=True)
test_split_df = shuffled_df.iloc[split_point:].reset_index(drop=True)

### Print out latex variables referenced in paper.

In [None]:
#Output latex macros

def latex_macro(name, value):
    r"""Return one LaTeX macro definition of the form \newcommand{\<name>}{<value>}.

    The value is interpolated with str() semantics, so NumPy scalars are written as
    plain numbers. Formatting the value inside a set literal would instead use repr(),
    which NumPy >= 2 renders as e.g. ``np.int64(5)``.
    """
    return f"\\newcommand{{\\{name}}}{{{value}}}"


# Sample sizes: all three splits combined, then train / calibration / study.
print(latex_macro('nEntireSample', np.sum([len(df) for df in split_dfs])))
print(latex_macro('nPosEntireSample', np.sum([df['label'].sum() for df in split_dfs])))
print(latex_macro('nNegEntireSample', np.sum([len(df[df['label']== 0]) for df in split_dfs])))

print(latex_macro('nTrain', len(split_dfs[0])))
print(latex_macro('nCal', len(val_split_df)))
print(latex_macro('nStudy', len(test_split_df)))

# Rows with a positive label.
pos_ecgs = ecg_data[ecg_data['label'] == 1]

# Counts of chart diagnoses, overall, among positives, and by race/ethnicity.
print(latex_macro('nDiagEntireSample', ecg_data['diagnosis_in_charts'].sum()))
print(latex_macro('nDiagAmongAFPos', pos_ecgs['diagnosis_in_charts'].sum()))
print(latex_macro('nUnDiagAmongAFPos', len(pos_ecgs[pos_ecgs['diagnosis_in_charts'] == 0])))
for macro_name, race_value in [('nDiagWhite', 'White'),
                               ('nDiagBlack', 'Black or African American'),
                               ('nDiagAsian', 'Asian'),
                               ('nDiagHisp', 'Hispanic or Latino')]:
    n_diag_in_group = ecg_data[ecg_data['PatientRaceFinal'] == race_value]['diagnosis_in_charts'].sum()
    print(latex_macro(macro_name, n_diag_in_group))

# Positives with PCPvisits_bin == 1. Copy so the column added below is written to an
# independent frame rather than to a slice of pos_ecgs.
pos_ecg_with_PCP = pos_ecgs[pos_ecgs['PCPvisits_bin'] == 1].copy()

print(latex_macro('nPosWithPCPEntireSample', len(pos_ecg_with_PCP)))
print(latex_macro('nDiagAmongPosWithPCPEntireSample', pos_ecg_with_PCP['diagnosis_in_charts'].sum()))
print(latex_macro('nUnDiagAmongPosWithPCPEntireSample', len(pos_ecg_with_PCP[pos_ecg_with_PCP['diagnosis_in_charts'] == 0])))


# Share of positives with a PCP visit but no chart diagnosis, by race, insurance type and language.
pos_ecg_with_PCP['no_diagnosis_in_chart'] = ~(pos_ecg_with_PCP['diagnosis_in_charts'].astype(bool))
race_to_hidden_diag_rates = pos_ecg_with_PCP.groupby('PatientRaceFinal')['no_diagnosis_in_chart'].mean().reset_index()
ins_to_hidden_diag_rates = pos_ecg_with_PCP.groupby('instype_final')['no_diagnosis_in_chart'].mean().reset_index()
lang_to_hidden_diag_rates = pos_ecg_with_PCP.groupby('PrimLangDSC')['no_diagnosis_in_chart'].mean().reset_index()

hidden_diag_rate_white = race_to_hidden_diag_rates[race_to_hidden_diag_rates['PatientRaceFinal'] == 'White']['no_diagnosis_in_chart'].iloc[0]
hidden_diag_rate_black = race_to_hidden_diag_rates[race_to_hidden_diag_rates['PatientRaceFinal'] == 'Black or African American']['no_diagnosis_in_chart'].iloc[0]
hidden_diag_rate_comm = ins_to_hidden_diag_rates[ins_to_hidden_diag_rates['instype_final'] == 'Commercial']['no_diagnosis_in_chart'].iloc[0]
hidden_diag_rate_medicaid = ins_to_hidden_diag_rates[ins_to_hidden_diag_rates['instype_final'] == 'Medicaid']['no_diagnosis_in_chart'].iloc[0]
hidden_diag_rate_eng = lang_to_hidden_diag_rates[lang_to_hidden_diag_rates['PrimLangDSC'] == 'English']['no_diagnosis_in_chart'].iloc[0]
hidden_diag_rate_noneng = lang_to_hidden_diag_rates[lang_to_hidden_diag_rates['PrimLangDSC'] == 'Non-English']['no_diagnosis_in_chart'].iloc[0]

# Rates are emitted as percentages rounded to one decimal place. The macro names are
# kept exactly as they are because the paper source refers to them.
print(latex_macro('pctHiddenDiagWhitePosPCP', np.round(100*hidden_diag_rate_white, 1)))
print(latex_macro('pctHiddenDiagBlackPosPCP', np.round(100*hidden_diag_rate_black, 1)))

print(latex_macro('pctHiddenDiagCommercialPosPCP', np.round(100*hidden_diag_rate_comm, 1)))
print(latex_macro('pctHiddenDiagMedicaidPosPCP', np.round(100*hidden_diag_rate_medicaid, 1)))


print(latex_macro('pctHiddenDiagEngPosPCP', np.round(100*hidden_diag_rate_eng, 1)))
print(latex_macro('pctHiddenDiagNongEngPosPCP', np.round(100*hidden_diag_rate_noneng, 1)))



In [None]:
# Mean of no_diagnosis_in_chart per primary-language group among positives with a PCP
# visit, shown as the cell output.
lang_to_hidden_diag_rates = pos_ecg_with_PCP.groupby('PrimLangDSC')['no_diagnosis_in_chart'].mean().reset_index()
lang_to_hidden_diag_rates

### Generate Table S1, summary statistics of three samples

In [None]:
import pandas as pd
from tabulate import tabulate
from utils import prettify_group_name, prettify_col_name
# For each categorical column, the values that get their own row in the summary tables,
# in display order. Values not listed here (e.g. other race categories) get no row.
category_to_values = {'SexDSC': ('Male', 'Female'),
                      'PatientRaceFinal': ('White', 'Black or African American', 'Hispanic or Latino', 'Asian'),
                      'instype_final': ('Commercial', 'Medicare', 'Medicaid' ),
                      'PrimLangDSC': ('English', 'Non-English')}

def generate_table1(train_split_df, val_split_df, test_split_df):
    """Summarise the train, calibration and study samples side by side.

    Each argument is a DataFrame holding PatientAge_years, the categorical columns
    keyed in category_to_values, and the binary columns diagnosis_in_charts, label
    and stroke.

    Returns a (latex_table, table_data) tuple. table_data is a list of rows laid out
    as [section label, row label, Train, Calibration, Study], every cell a string;
    latex_table is the same content rendered by tabulate in its "latex" format.
    """
    splits = {
        "Train": train_split_df,
        "Calibration": val_split_df,
        "Study": test_split_df,
    }
    frames = list(splits.values())

    def across_splits(render):
        # One formatted cell per split, in Train / Calibration / Study order.
        return [render(frame) for frame in frames]

    def count_and_share(frame, column, value):
        # "<n rows equal to value> (<percentage of all rows>%)"
        matches = frame[column] == value
        return f"{len(frame[matches])} ({(matches.mean() * 100):.1f}%)"

    def positives_and_share(frame, column):
        # "<column sum> (<column mean as a percentage>%)"
        return f"{frame[column].sum()} ({(frame[column].mean() * 100):.1f}%)"

    def age_mean_sd(frame):
        age = frame['PatientAge_years']
        return f"{age.mean():.1f} ({age.std():.1f})"

    # Sample size and age rows carry no section label.
    table_data = [
        ['', "# of Patients"] + across_splits(lambda frame: f"{len(frame)}"),
        ['', "Age (Mean (SD))"] + across_splits(age_mean_sd),
    ]

    # Categorical variables: one row per listed value; the section label is shown
    # only on the first row of each variable.
    for var in ("SexDSC", "PatientRaceFinal", "instype_final", "PrimLangDSC"):
        pretty_var = prettify_group_name[var]
        for position, value in enumerate(category_to_values[var]):
            section = f'{pretty_var}' if position == 0 else ''
            cells = across_splits(lambda frame: count_and_share(frame, var, value))
            table_data.append([section, str(value)] + cells)

    # Binary outcomes: one row each, grouped under a single 'Outcome' section label.
    for position, var in enumerate(("diagnosis_in_charts", "label", "stroke")):
        section = 'Outcome' if position == 0 else ''
        cells = across_splits(lambda frame: positives_and_share(frame, var))
        table_data.append([section, prettify_col_name(var)] + cells)

    # The two label columns are left unnamed in the rendered table.
    table_df = pd.DataFrame(table_data, columns=["", ""] + list(splits))
    latex_table = tabulate(table_df, headers="keys", tablefmt="latex", showindex=False)
    return latex_table, table_data

# Generate the table from the training split and the re-drawn calibration / study splits.
# table_data (the raw rows) is kept as well as the rendered LaTeX string.
latex_table, table_data = generate_table1(split_dfs[0], val_split_df, test_split_df)
print(latex_table)

In [None]:
import re

# Your table data as list of lists

# Formatting helpers
# Section labels that get a \midrule before them and are set in bold.
midrule_labels = {"Sex", "Race", "Insurance", "Primary Language", "Outcome"}
# Labels that are prefixed with \quad when they appear in the first cell of a row.
indented_labels = {"Male", "Female", "White", "Black or African American", "Hispanic or Latino", "Asian",
                   "Commercial", "Medicare", "Medicaid", "English", "Non-English"}
# Same set object as midrule_labels (an alias, not a copy).
bold_labels = midrule_labels

def format_number(s):
    """Insert thousands separators into the integer part of every number in ``s``.

    Each run of digits, optionally followed by a decimal point and more digits, is
    treated as one number. Commas are added to the digits before the decimal point
    only, so "12345 (45.6%)" becomes "12,345 (45.6%)" while fractional digits such
    as those in "0.12345" are left untouched.

    Returns the reformatted string; text outside the numbers is unchanged.
    """
    def _group_integer_part(match):
        whole, point, fraction = match.group(0).partition('.')
        # Put a comma after every leading group that is followed by complete triples.
        grouped = re.sub(r'\d{1,3}(?=(\d{3})+(?!\d))', r'\g<0>,', whole)
        return grouped + point + fraction

    return re.sub(r'\d+(?:\.\d+)?', _group_integer_part, s)

def escape_latex(s):
    """Escape the characters '%', '<' and '>' in ``s`` for use in LaTeX text.

    '%' becomes '\\%', and each angle bracket is wrapped in \\ensuremath{...}.
    Other LaTeX special characters are passed through unchanged.
    """
    substitutions = (
        ('%', r'\%'),
        ('<', r'\ensuremath{<}'),
        ('>', r'\ensuremath{>}'),
    )
    for plain, escaped in substitutions:
        s = s.replace(plain, escaped)
    return s

def format_cell(s):
    """Prepare one table cell: add thousands separators first, then escape for LaTeX."""
    return escape_latex(format_number(s))

# Build LaTeX table
latex_lines = [
    r"\begin{tabular}{lrrrr}",
    r"\toprule",
    # The header row must end with \\ like every other row; a \midrule directly after an
    # unterminated row is a LaTeX error.
    r"& & \textbf{Train} & \textbf{Calibration} & \textbf{Study} \\",
    r"\midrule",
]
for row in table_data:
    label = row[0]

    # Add \midrule before new sections
    if label in midrule_labels:
        latex_lines.append(r"\midrule")

    # Indent certain labels
    # NOTE(review): rows produced by generate_table1 carry the section header (or '') in
    # row[0] and the value name in row[1], so this check looks like it never matches
    # for that input -- confirm whether the indent is still wanted.
    if label in indented_labels:
        label = r"\quad " + label

    # Bold section headers
    if label.strip() in bold_labels:
        label = r"\textbf{" + label.strip() + r"}"

    # Format row: the first cell is used as-is, the remaining cells are number-formatted
    # and LaTeX-escaped.
    formatted_row = " & ".join([label] + [format_cell(cell) for cell in row[1:]]) + r" \\"
    latex_lines.append(formatted_row)

latex_lines.append(r"\bottomrule")
latex_lines.append(r"\end{tabular}")

# Output LaTeX
latex_output = "\n".join(latex_lines)
print(latex_output)


### Generate Table 1

In [177]:
from statsmodels.stats.proportion import proportion_confint

def format_percent_with_ci(positives, total, alpha=0.05, method='wilson'):
    """Format a proportion and its confidence interval as percentages.

    positives / total is the point estimate. The interval comes from statsmodels'
    proportion_confint, called with the given alpha and method. All three numbers
    are converted to percentages and rounded to one decimal place.

    Returns "N/A" when total is 0, otherwise a string "<estimate> (<low>, <high>)".
    """
    if total == 0:
        return "N/A"

    def as_percent(fraction):
        return round(fraction * 100, 1)

    estimate = positives / total
    lower, upper = proportion_confint(count=positives, nobs=total, alpha=alpha, method=method)
    return f"{as_percent(estimate)} ({as_percent(lower)}, {as_percent(upper)})"

def _count_hidden_diagnoses(df):
    """Number of rows in ``df`` whose diagnosis_in_charts flag is false.

    Returns 0 when no such row exists, where value_counts()[False] would raise a
    KeyError.
    """
    return int((~df['diagnosis_in_charts'].astype(bool)).sum())


rows = []

# Overall figures for all positives and for positives with a PCP visit.
n_pos_ecgs = len(pos_ecgs)
n_pos_ecgs_pcp = len(pos_ecg_with_PCP)
hidden_diag_rate_among_pos = 100 - np.round(pos_ecgs['diagnosis_in_charts'].mean()*100, 2)
hidden_diag_rate_among_pos_with_pcp = 100 - np.round(pos_ecg_with_PCP['diagnosis_in_charts'].mean()*100, 2)

hidden_diag_n_among_pos = _count_hidden_diagnoses(pos_ecgs)
hidden_diag_n_among_pos_with_pcp = _count_hidden_diagnoses(pos_ecg_with_PCP)

pos_ecg_ci = format_percent_with_ci(hidden_diag_n_among_pos, n_pos_ecgs)
pos_ecg_with_pcp_ci = format_percent_with_ci(hidden_diag_n_among_pos_with_pcp, n_pos_ecgs_pcp)

rows.append({'Group': "Overall = --",
             '% Hidden Diagnosis (AF+)': pos_ecg_ci,
             '% Hidden Diagnosis (AF+, PCP)': pos_ecg_with_pcp_ci})

# One row per listed value of every category except sex. Each cell has the same
# "<estimate> (<low>, <high>)" layout as the Overall row.
for group, values in category_to_values.items():
    if prettify_group_name[group] == 'Sex':
        continue
    for val in values:
        group_pos = pos_ecgs[pos_ecgs[group] == val]
        group_pos_pcp = pos_ecg_with_PCP[pos_ecg_with_PCP[group] == val]

        rows.append({
            'Group': f"{prettify_group_name[group]} = {val}",
            '% Hidden Diagnosis (AF+)': format_percent_with_ci(
                _count_hidden_diagnoses(group_pos), len(group_pos)),
            '% Hidden Diagnosis (AF+, PCP)': format_percent_with_ci(
                _count_hidden_diagnoses(group_pos_pcp), len(group_pos_pcp)),
        })

diagnosis_rates = pd.DataFrame(rows)
diagnosis_rates

In [None]:
# Mapping for display name cleanup (keys are the group values with spaces removed)
value_rename_map = {
    'BlackorAfricanAmerican': 'Black/African American',
    'HispanicorLatino': 'Hispanic/Latino'
}

# Start LaTeX table (requires \usepackage{makecell} in LaTeX preamble).
# Every row has four cells, so the column spec declares four columns.
latex_rows = [
    r"\begin{tabular}{llcc}",
    r"\toprule",
    r" & & \makecell{\textbf{Patients with}\\\textbf{AF ECG}} & \makecell{\textbf{Patients with}\\\textbf{AF ECG, PCP Visit}} \\",
    r" & & \% Hidden Diagnosis (CI) & \% Hidden Diagnosis (CI) \\",
    r"\hline",
]

prev_category = None
for _, row in diagnosis_rates.iterrows():
    # 'Group' has the form "<category> = <value>"; split on the first '=' only.
    category, val = (part.strip() for part in row['Group'].split('=', 1))
    val = val.replace(' ', '')

    # Clean display values
    val = value_rename_map.get(val, val)

    # Only show category label the first time it appears, preceded by a rule
    is_new_category = category != prev_category
    group_col = f"\\textbf{{{category}}}" if is_new_category else ""
    if is_new_category:
        # Raw string: "\m" is not a valid escape sequence in a normal string literal.
        latex_rows.append(r"\midrule")
    prev_category = category

    # Build LaTeX row
    latex_row = f"{group_col} & {val}  & {row['% Hidden Diagnosis (AF+)']} & {row['% Hidden Diagnosis (AF+, PCP)']} \\\\"
    latex_rows.append(latex_row)
latex_rows.append(r"\bottomrule")
latex_rows.append(r"\end{tabular}")

# Combine and print
latex_code = "\n".join(latex_rows)
print(latex_code)
