In [1]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import re
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import seaborn as sns
import copy
from sklearn import preprocessing
import pickle

### Merge full MIMIC data with MIMIC data predictions for smoking status

In [2]:
# Load the two inputs -- contents can't be shown due to MIMIC Privacy Policy
full_data_df = pd.read_csv("full_data_df_no_index.csv")

# "..." is a placeholder: it should be the csv file with mimic smoking status predictions for each entry.
# The key column is renamed so that it matches the `subject_id` column used for the merge below.
pred_mimic_df = pd.read_csv("...").rename(columns={"SUBJECT_ID": "subject_id"})



#### full_data_df contains 6361 rows and 130 columns (including subject_id, age, echo, etc...)
#### pred_mimic_df contains 34312 rows and 46 columns (including subject_id, SMOKING_STATUS, etc..)

In [5]:
# Inner join on subject_id: only the predicted smoking status is carried over from pred_mimic_df
smoking_predictions = pred_mimic_df[["subject_id", "SMOKING_STATUS"]]
merged_df = full_data_df.merge(smoking_predictions, how="inner", on="subject_id")

In [6]:
# Count how many merged rows carry each predicted smoking-status label
merged_df["SMOKING_STATUS"].value_counts()

1    2058
3    1413
4    1171
2      93
0      64
Name: SMOKING_STATUS, dtype: int64

In [7]:
# Dropping 0 labels to ensure we only have 4 possible smoking status labels
# 0 labels derived from merging two dataframes where some entries may not have a prediction
zero_label_index = merged_df.index[merged_df["SMOKING_STATUS"] == 0]
merged_df = merged_df.drop(index=zero_label_index)

# Re-check the label distribution: only labels 1-4 should remain
merged_df["SMOKING_STATUS"].value_counts()

1    2058
3    1413
4    1171
2      93
Name: SMOKING_STATUS, dtype: int64

In [8]:
# Converting integers to weekdays
def int_to_weekday(row):
    '''
    Translate a weekday code into its lowercase English name.

    The value is coerced with int() first, so numeric strings and floats are accepted
    (floats are truncated). Codes 0-5 map to sunday..friday; every other integer,
    including 6, maps to "saturday".
    '''
    names_0_to_5 = ("sunday", "monday", "tuesday", "wednesday", "thursday", "friday")
    code = int(row)
    if 0 <= code <= 5:
        return names_0_to_5[code]
    # Anything outside 0-5 falls through to saturday, exactly like the final `else` branch did
    return "saturday"

# Replace the integer weekday codes with their names.
# NOTE: this overwrites the column in place, so the cell cannot be run twice on the same frame.
merged_df["icu_adm_weekday"] = merged_df["icu_adm_weekday"].map(int_to_weekday)

In [9]:
# Cast the string-valued columns to categoricals. For the two binary ones the category
# order is fixed explicitly so that the first listed level comes first.
merged_df["first_careunit"] = (
    merged_df["first_careunit"].astype("category").cat.reorder_categories(["SICU", "MICU"])
)

merged_df["gender"] = (
    merged_df["gender"].astype("category").cat.reorder_categories(["M", "F"])
)

# Weekday keeps whatever category order pandas infers
merged_df["icu_adm_weekday"] = merged_df["icu_adm_weekday"].astype("category")

### Viewing the finalized merged dataframe

#### 4735 rows and 131 columns (including subject_id, echo, mort_28_day, SMOKING_STATUS, etc...)

In [None]:
# Display the finalized merged dataframe (the rendered output is intentionally omitted)
merged_df # Unable to show due to MIMIC Privacy Policy

### Defining helper functions to calculate causal effects w.r.t effect restoration from measurement bias

In [12]:
def generate_models(dataframe):
    '''
    Given a pre-processed MIMIC + proxy prediction dataframe, fit the five statsmodels
    formula models used by the effect-restoration estimators. The formula strings are
    hard-coded into the function and all share the same baseline covariate list c.

    Models, in the order they are returned:
        1) smf.logit   : P(y | u*, a, c)  --> mort_28_day ~ echo + c + SMOKING_STATUS
        2) smf.mnlogit : P(u* | a, c)     --> SMOKING_STATUS ~ echo + c
        3) smf.glm     : P(c, u*)         --> SMOKING_STATUS ~ c
                         approximates to (P(u* | c) - eu) / (1 - eu - &u) * 1 / N
        4) smf.logit   : P(y | a, c)      --> mort_28_day ~ echo + c
        5) smf.logit   : P(a | c)         --> echo ~ c

    The assumptions for these models are:
        1) Categorical smoking categories
        2) Not all features are binary, but at least the output (mort_28_day) and
           treatment (echo) should be binary

    NOTE(review): model 3 is a GLM fitted with the library's default family, not a
    logistic model, and SMOKING_STATUS enters model 1 as a plain (non-C()) term --
    confirm both are intended. risk_ratio() and odds_ratio() never use models 3 and 4.

    Returns a 5-tuple of fitted results (eq1_model, ..., eq5_model).
    '''

    # Baseline covariates c, in the same order for every formula.
    baseline_covariates = (
        "first_careunit", "age", "gender", "weight", "saps", "sofa", "elix_score", "vent",
        "vaso", "icu_adm_weekday", "icu_adm_hour", "icd_chf", "icd_afib", "icd_renal",
        "icd_liver", "icd_copd", "icd_cad", "icd_stroke", "icd_malignancy",
        "vs_heart_rate_first", "vs_map_first", "vs_temp_first",
        "lab_hemoglobin_first", "lab_platelet_first", "lab_wbc_first", "lab_ph_first",
        "lab_chloride_first", "lab_sodium_first", "lab_bun_first", "lab_bicarbonate_first",
        "lab_pco2_first", "lab_creatinine_first", "lab_potassium_first", "lab_po2_first",
        "lab_lactate_first", "sedative", "vs_cvp_flag",
        "lab_creatinine_kinase_flag", "lab_bnp_flag", "lab_troponin_flag",
    )
    c_terms = " + ".join(baseline_covariates)

    # Calculating P(y | u*, a, c) --> y ~ u* + a + c
    fstring = "mort_28_day ~ echo + " + c_terms + " + SMOKING_STATUS"
    eq1_model = smf.logit(fstring, data=dataframe).fit(disp=0)

    # Calculating P(u* | a, c)
    f_string2 = "SMOKING_STATUS ~ echo + " + c_terms
    eq2_model = smf.mnlogit(f_string2, data=dataframe).fit(disp=0)

    # Calculating P(c,u*) --> approximates to (P(u* | c) - eu) / (1- eu - &u) * 1 / N
    f_string3 = "SMOKING_STATUS ~ " + c_terms
    eq3_model = smf.glm(f_string3, data=dataframe).fit(disp=0)

    # Calculating P(y | a, c)
    f_string4 = "mort_28_day ~ echo + " + c_terms
    eq4_model = smf.logit(f_string4, data=dataframe).fit(disp=0)

    # Calculating P(a|c)
    f_string5 = "echo ~ " + c_terms
    eq5_model = smf.logit(f_string5, data=dataframe).fit(disp=0)

    return eq1_model, eq2_model, eq3_model, eq4_model, eq5_model

### Implementing Risk Ratio

In [13]:
def risk_ratio(dataframe, model1, model2, model3, model4, model5):
    '''
    Given a pre-processed MIMIC + smoking proxy prediction dataframe as well as five trained models
    from generate_models(), calculate the risk ratio as defined by:
    causal_effect = summation(c,u){ p(c,u) * ( E[Y=1 | A=1,c,u] / E[Y=1 | A=0,c,u] ) }
    where the quantities in terms of the true status u are recovered from the proxy u* by
    multiplying with the inverse of the proxy error-rate matrix.

    The assumptions of this function are:
        1) Smoking proxy predictions are categorical with levels 1-4
        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 for not receiving treatment
        3) Order for model inputs matter:
            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c
            b) model2 = P(u* | a, c)
            c) model3 = P(c,u*) --> unused here, kept so the signature matches generate_models()
            d) model4 = P(y | a, c) --> unused here, kept so the signature matches generate_models()
            e) model5 = P(a | c)
        4) Default prediction is probability of getting 1 due to how statsmodels works

    Side effect: prints the four per-level contributions before returning.
    Returns: the sum over the four true smoking levels of mean_rows(ratio_u * P(u | c)).
    '''

    smoking_levels = [1, 2, 3, 4]
    echo_levels = [1, 0]

    # Confusion counts of the smoking-status classifier:
    # rows represent the ground truth labels and cols represent the predicted labels
    confusion = np.array([
        [8, 0, 2, 1],
        [4, 4, 3, 0],
        [1, 0, 14, 1],
        [1, 0, 1, 61],
    ], dtype=float)

    # Row-normalising gives error_mat[u][u*] = P(U* = u* | U = u):
    # rows are the true status U, cols are the proxy U*.
    error_mat = confusion / confusion.sum(axis=1, keepdims=True)
    inverse = np.linalg.inv(error_mat)

    # joint[(a, y)][k] holds, per row, P(A=a, Y=y, U*=k+1 | c) = P(y | u*, a, c) * P(u* | a, c) * P(a | c).
    joint = {(a, y): [] for a in echo_levels for y in (1, 0)}
    for s in smoking_levels:
        for a in echo_levels:
            # Counterfactual copy of the data with proxy status and treatment fixed for every row
            frame = dataframe.assign(SMOKING_STATUS=s, echo=a)

            p_y1 = model1.predict(frame)
            # Column s-1 of the multinomial prediction is taken as P(U* = s | a, c)
            # (assumes predict returns a table whose columns are labelled 0..3 -- TODO confirm)
            p_proxy = model2.predict(frame)[s - 1]
            p_a1 = model5.predict(frame)
            p_a = p_a1 if a == 1 else 1 - p_a1

            joint[(a, 1)].append(p_y1 * p_proxy * p_a)
            joint[(a, 0)].append((1 - p_y1) * p_proxy * p_a)

    def restore(per_level, u):
        # P(., U=u) = sum over proxy levels k of P(., U*=k) * inverse[k][u]
        return sum(per_level[k] * inverse[k][u] for k in range(len(smoking_levels)))

    # NOTE(review): the inverse matrix is not a probability matrix, so the restored
    # quantities are not guaranteed to lie in [0, 1] -- confirm this is acceptable.
    contributions = []
    for u in range(len(smoking_levels)):
        num_a = restore(joint[(1, 1)], u)   # P(A=1, Y=1, U=u | c)
        tmp_a = restore(joint[(1, 0)], u)   # P(A=1, Y=0, U=u | c)
        num_b = restore(joint[(0, 1)], u)   # P(A=0, Y=1, U=u | c)
        tmp_b = restore(joint[(0, 0)], u)   # P(A=0, Y=0, U=u | c)

        upper = num_a / (num_a + tmp_a)     # P(Y=1 | A=1, C, U=u)
        lower = num_b / (num_b + tmp_b)     # P(Y=1 | A=0, C, U=u)

        # P(u | c) = P(A=1, y=1, u) + P(A=1, y=0, u) + P(A=0, y=1, u) + P(A=0, y=0, u)
        prob_u_c = num_a + tmp_a + num_b + tmp_b

        contributions.append(np.mean((upper / lower) * prob_u_c))

    print(contributions)
    rr = sum(contributions)
    return rr

In [14]:
# Fit the five models on the merged data, then compute the risk ratio with the fixed error-rate matrix
m1, m2, m3, m4, m5 = generate_models(merged_df)
risk_ratio(merged_df, m1, m2, m3, m4, m5)

[0.4838000219913868, 0.054784920499759755, 0.16731889416457849, 0.18447476501081997]
0.8903786016665449


0.8903786016665449

### Bootstrapping Error Rate Matrix

In [16]:
def risk_ratio_bootstrap(dataframe, model1, model2, model3, model4, model5, error_mat):
    '''
    Given a pre-processed MIMIC + smoking proxy prediction dataframe as well as five trained models
    from generate_models(), calculate the risk ratio as defined by:
    causal_effect = summation(c,u){ p(c,u) * ( E[Y=1 | A=1,c,u] / E[Y=1 | A=0,c,u] ) }
    using the supplied proxy error-rate matrix instead of a hard-coded one.

    The assumptions of this function are:
        1) Smoking proxy predictions are categorical with levels 1-4
        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 for not receiving treatment
        3) Order for model inputs matter:
            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c
            b) model2 = P(u* | a, c)
            c) model3 = P(c,u*) --> unused here, kept so the signature matches generate_models()
            d) model4 = P(y | a, c) --> unused here, kept so the signature matches generate_models()
            e) model5 = P(a | c)
        4) Default prediction is probability of getting 1 due to how statsmodels works
        5) error_mat is an invertible 4x4 matrix; it is indexed as error_mat[u][u*]
           (np.linalg.inv raises LinAlgError if it is singular)

    Returns: the sum over the four true smoking levels of mean_rows(ratio_u * P(u | c)).
    '''

    smoking_levels = [1, 2, 3, 4]
    echo_levels = [1, 0]

    # Inversing Error Rate Matrix
    inverse = np.linalg.inv(error_mat)

    # joint[(a, y)][k] holds, per row, P(A=a, Y=y, U*=k+1 | c) = P(y | u*, a, c) * P(u* | a, c) * P(a | c).
    joint = {(a, y): [] for a in echo_levels for y in (1, 0)}
    for s in smoking_levels:
        for a in echo_levels:
            # Counterfactual copy of the data with proxy status and treatment fixed for every row
            frame = dataframe.assign(SMOKING_STATUS=s, echo=a)

            p_y1 = model1.predict(frame)
            # Column s-1 of the multinomial prediction is taken as P(U* = s | a, c)
            # (assumes predict returns a table whose columns are labelled 0..3 -- TODO confirm)
            p_proxy = model2.predict(frame)[s - 1]
            p_a1 = model5.predict(frame)
            p_a = p_a1 if a == 1 else 1 - p_a1

            joint[(a, 1)].append(p_y1 * p_proxy * p_a)
            joint[(a, 0)].append((1 - p_y1) * p_proxy * p_a)

    def restore(per_level, u):
        # P(., U=u) = sum over proxy levels k of P(., U*=k) * inverse[k][u]
        return sum(per_level[k] * inverse[k][u] for k in range(len(smoking_levels)))

    rr = 0
    for u in range(len(smoking_levels)):
        num_a = restore(joint[(1, 1)], u)   # P(A=1, Y=1, U=u | c)
        tmp_a = restore(joint[(1, 0)], u)   # P(A=1, Y=0, U=u | c)
        num_b = restore(joint[(0, 1)], u)   # P(A=0, Y=1, U=u | c)
        tmp_b = restore(joint[(0, 0)], u)   # P(A=0, Y=0, U=u | c)

        upper = num_a / (num_a + tmp_a)     # P(Y=1 | A=1, C, U=u)
        lower = num_b / (num_b + tmp_b)     # P(Y=1 | A=0, C, U=u)

        # P(u | c) = P(A=1, y=1, u) + P(A=1, y=0, u) + P(A=0, y=1, u) + P(A=0, y=0, u)
        prob_u_c = num_a + tmp_a + num_b + tmp_b

        rr = rr + np.mean((upper / lower) * prob_u_c)

    return rr

In [17]:
def bootstrap(dataframe, model1, model2, model3, model4, model5, matrix_paths=None):
    '''
    Given a dataframe and 5 models generated from generate_models(), bootstrap the testing set for n2c2 2006 smoking
    dataset to get different error rate matrices to test robustness of the risk ratio causal effect.
    Utilize predict_bootstrap_2006.py to generate pickle files that store the confusion matrices.

    matrix_paths: optional iterable of paths to pickle files, one bootstrapped confusion matrix per file.
        When omitted, the placeholder path "..." is opened 10 times (10 is the default number of
        matrices produced by predict_bootstrap_2006.py), which matches the previous hard-coded behaviour.

    Side effect: prints the list of per-matrix risk ratios.
    Returns: the mean of the per-matrix risk ratios.
    Raises: ValueError if matrix_paths is empty.
    '''

    if matrix_paths is None:
        # First input should be the bootstrapped matrices (pkl file)
        matrix_paths = ["..."] * 10

    rr_arr = []
    for path in matrix_paths:
        # NOTE(review): pickle.load can execute arbitrary code -- only load files you generated yourself.
        # The `with` block guarantees the handle is closed even if unpickling fails.
        with open(path, "rb") as f:
            con_matrix = np.asarray(pickle.load(f))

        # Row-normalise the confusion counts so that every row sums to 1
        res = con_matrix / con_matrix.sum(axis=1)[:, None]
        rr = risk_ratio_bootstrap(dataframe, model1, model2, model3, model4, model5, res)
        rr_arr.append(rr)

    if not rr_arr:
        raise ValueError("matrix_paths must contain at least one confusion-matrix pickle file")

    print(rr_arr)
    return sum(rr_arr) / len(rr_arr)

In [18]:
# Re-fit the five models, then average the risk ratio over the bootstrapped error-rate matrices
m1, m2, m3, m4, m5 = generate_models(merged_df)
bootstrap(merged_df, m1, m2, m3, m4, m5)

[0.8909875802243661, 0.8904475988807712, 0.890596509707885, 0.8906791999431076, 0.8900207140785991, 0.8907383965807781, 0.8895413941042646, 0.8908275785985995, 0.8954674682229409, 0.8904210990774163]


0.8909727539418728

### Implementing OR

In [21]:
def odds_ratio(dataframe, model1, model2, model3, model4, model5):
    '''
    Given a pre-processed MIMIC + smoking proxy prediction dataframe as well as five trained models
    from generate_models(), calculate the odds ratio as defined by:
    causal_effect = (P(Y^{a=1}=1) * P(Y^{a=0}=0)) / (P(Y^{a=1}=0) * P(Y^{a=0}=1))
    where the quantities in terms of the true status u are recovered from the proxy u* by
    multiplying with the inverse of the proxy error-rate matrix.

    The assumptions of this function are:
        1) Smoking proxy predictions are categorical with levels 1-4
        2) Treatment variable values are binary --> either 1 for receiving treatment or 0 for not receiving treatment
        3) Order for model inputs matter:
            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c
            b) model2 = P(u* | a, c)
            c) model3 = P(c,u*) --> unused here, kept so the signature matches generate_models()
            d) model4 = P(y | a, c) --> unused here, kept so the signature matches generate_models()
            e) model5 = P(a | c)
        4) Default prediction is probability of getting 1 due to how statsmodels works

    Returns: the odds ratio as a scalar.
    '''

    smoking_levels = [1, 2, 3, 4]
    echo_levels = [1, 0]

    # Confusion counts of the smoking-status classifier:
    # rows represent the ground truth labels and cols represent the predicted labels
    confusion = np.array([
        [8, 0, 2, 1],
        [4, 4, 3, 0],
        [1, 0, 14, 1],
        [1, 0, 1, 61],
    ], dtype=float)

    # Row-normalising gives error_mat[u][u*] = P(U* = u* | U = u):
    # rows are the true status U, cols are the proxy U*.
    error_mat = confusion / confusion.sum(axis=1, keepdims=True)
    inverse = np.linalg.inv(error_mat)

    # joint[(a, y)][k] holds, per row, P(A=a, Y=y, U*=k+1 | c) = P(y | u*, a, c) * P(u* | a, c) * P(a | c).
    joint = {(a, y): [] for a in echo_levels for y in (1, 0)}
    for s in smoking_levels:
        for a in echo_levels:
            # Counterfactual copy of the data with proxy status and treatment fixed for every row
            frame = dataframe.assign(SMOKING_STATUS=s, echo=a)

            p_y1 = model1.predict(frame)
            # Column s-1 of the multinomial prediction is taken as P(U* = s | a, c)
            # (assumes predict returns a table whose columns are labelled 0..3 -- TODO confirm)
            p_proxy = model2.predict(frame)[s - 1]
            p_a1 = model5.predict(frame)
            p_a = p_a1 if a == 1 else 1 - p_a1

            joint[(a, 1)].append(p_y1 * p_proxy * p_a)
            joint[(a, 0)].append((1 - p_y1) * p_proxy * p_a)

    def restore(per_level, u):
        # P(., U=u) = sum over proxy levels k of P(., U*=k) * inverse[k][u]
        return sum(per_level[k] * inverse[k][u] for k in range(len(smoking_levels)))

    # Row-summed (unnormalised) counterfactual outcome probabilities; the common factor
    # cancels between numerator and denominator of the odds ratio.
    numerator_a = 0      # P(Y^{a=1} = 1)
    numerator_b = 0      # P(Y^{a=0} = 0)
    denominator_a = 0    # P(Y^{a=1} = 0)
    denominator_b = 0    # P(Y^{a=0} = 1)
    for u in range(len(smoking_levels)):
        num_a = restore(joint[(1, 1)], u)   # P(A=1, Y=1, U=u | c)
        tmp_a = restore(joint[(1, 0)], u)   # P(A=1, Y=0, U=u | c)
        num_b = restore(joint[(0, 1)], u)   # P(A=0, Y=1, U=u | c)
        tmp_b = restore(joint[(0, 0)], u)   # P(A=0, Y=0, U=u | c)

        upper = num_a / (num_a + tmp_a)     # P(Y=1 | A=1, C, U=u)
        lower = num_b / (num_b + tmp_b)     # P(Y=1 | A=0, C, U=u)

        # P(u | c) = P(A=1, y=1, u) + P(A=1, y=0, u) + P(A=0, y=1, u) + P(A=0, y=0, u)
        prob_u_c = num_a + tmp_a + num_b + tmp_b

        numerator_a = numerator_a + np.sum(upper * prob_u_c)
        numerator_b = numerator_b + np.sum((1 - lower) * prob_u_c)
        denominator_a = denominator_a + np.sum((1 - upper) * prob_u_c)
        denominator_b = denominator_b + np.sum(lower * prob_u_c)

    numerator = numerator_a * numerator_b
    denominator = denominator_a * denominator_b

    return numerator / denominator
    
    

In [22]:
# Re-fit the five models, then compute the odds ratio with the fixed error-rate matrix
m1, m2, m3, m4, m5 = generate_models(merged_df)
odds_ratio(merged_df, m1, m2, m3, m4, m5)

0.8901509440891107

### Bootstrapping for OR

In [19]:
def odds_ratio_bootstrap(dataframe, model1, model2, model3, model4, model5, error_mat):
    '''
    Given a pre-processed MIMIC + smoking proxy prediction dataframe, five trained models
    from generate_models() and an error-rate matrix for the smoking proxy, calculate the
    odds ratio defined by:
    causal_effect = (P(Y^{a=1}=1) * P(Y^{a=0}=0)) / (P(Y^{a=1}=0) * P(Y^{a=0}=1))

    The assumptions of this function are:
        1) Smoking proxy predictions are categorical with labels 1, 2, 3, 4
        2) Treatment variable ("echo") values are binary --> either 1 for receiving
           treatment or 0 for not receiving treatment
        3) Order for model inputs matter:
            a) model1 = P(y | u*, a, c) --> y ~ u* + a + c
            b) model2 = P(u* | a, c)
            c) model3 = P(c,u*) --> approximates to (P(u* | c) - eu) / (1- eu - &u) * 1 / N --> P(u | c)
            d) model4 = P(y | a, c)
            e) model5 = P(a | c)
        4) Default prediction is probability of getting 1 due to how statsmodels works

    Parameters:
        dataframe: covariate frame; it is not modified, the "SMOKING_STATUS" and "echo"
            columns are overridden on copies only.
        model1, model2, model5: objects exposing .predict(frame).
        model3, model4: accepted so the signature matches generate_models() output;
            they are not used by this computation.
        error_mat: square 4x4 error-rate matrix of the smoking proxy; it is inverted
            with np.linalg.inv (bootstrap_or passes a row-normalised confusion matrix).

    Returns:
        The odds ratio as a single number.

    Raises:
        numpy.linalg.LinAlgError: if error_mat is singular.
    '''
    smoking_levels = [1, 2, 3, 4]
    echo_levels = [1, 0]

    inverse = np.linalg.inv(error_mat)

    # joint[(a, y)][k] holds the row-wise P(A=a, c, Y=y, u*=smoking_levels[k]).
    joint = {(1, 1): [], (1, 0): [], (0, 1): [], (0, 0): []}
    for s in smoking_levels:
        for e in echo_levels:
            # assign() returns a new frame, so the caller's dataframe stays untouched.
            frame = dataframe.assign(SMOKING_STATUS=s, echo=e)

            prob_y1 = model1.predict(frame)
            # NOTE(review): [:][s-1] picks the column labelled s-1 when predict() returns
            # a DataFrame of per-class probabilities -- confirm for the model in use.
            prob_proxy = model2.predict(frame)[:][s - 1]
            prob_a1 = model5.predict(frame)
            prob_a = prob_a1 if e == 1 else 1 - prob_a1

            # Each model is evaluated once and reused for both outcome values.
            joint[(e, 1)].append(prob_y1 * prob_proxy * prob_a)
            joint[(e, 0)].append((1 - prob_y1) * prob_proxy * prob_a)

    def _corrected(per_level, column):
        # Combine the per-proxy-level terms with one column of the inverted error matrix.
        return sum(term * inverse[row][column] for row, term in enumerate(per_level))

    numerator_a = 0.0
    numerator_b = 0.0
    denominator_a = 0.0
    denominator_b = 0.0
    for u in range(len(smoking_levels)):
        num_a = _corrected(joint[(1, 1)], u)
        tmp_a = _corrected(joint[(1, 0)], u)
        num_b = _corrected(joint[(0, 1)], u)
        tmp_b = _corrected(joint[(0, 0)], u)

        # P(Y=1 | A=1, C, U=u) and P(Y=1 | A=0, C, U=u)
        upper = num_a / (num_a + tmp_a)
        lower = num_b / (num_b + tmp_b)

        # P(u | c): sum of the corrected joint over all four (A, Y) combinations
        prob_u_c = num_a + tmp_a + num_b + tmp_b

        numerator_a += np.sum(upper * prob_u_c)
        numerator_b += np.sum((1 - lower) * prob_u_c)
        denominator_a += np.sum((1 - upper) * prob_u_c)
        denominator_b += np.sum(lower * prob_u_c)

    numerator = numerator_a * numerator_b
    denominator = denominator_a * denominator_b

    return numerator / denominator

In [23]:
def bootstrap_or(dataframe, model1, model2, model3, model4, model5, iterations=10):
    '''
    Given a dataframe and 5 models generated from generate_models(), bootstrap the testing set for n2c2 2006 smoking
    dataset to get different error rate matrices to test robustness of the odds ratio causal effect.
    Utilize predict_bootstrap_2006.py to generate pickle files that store the confusion matrices.

    Parameters:
        dataframe, model1..model5: forwarded unchanged to odds_ratio_bootstrap().
        iterations: how many bootstrapped confusion matrices to read; the default of 10
            matches the default in predict_bootstrap_2006.py.

    Returns:
        The mean of the per-matrix odds ratios. The individual values are printed first.

    Raises:
        ValueError: if iterations is smaller than 1 (the mean would be undefined).
    '''
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    o_r_arr = []
    for x in range(iterations):
        # NOTE(review): "..." is a placeholder for the bootstrapped-matrix pickle path. As
        # written it does not depend on x, so every iteration reads the same file; the
        # real path should select the x-th matrix.
        # pickle.load can execute arbitrary code -- only load files produced locally.
        with open("...", "rb") as f:  # closed automatically, even if loading fails
            con_matrix = pickle.load(f)

        # Row-normalise the confusion matrix so each row sums to 1.
        res = con_matrix / con_matrix.sum(axis=1)[:, None]
        o_r = odds_ratio_bootstrap(dataframe, model1, model2, model3, model4, model5, res)
        o_r_arr.append(o_r)

    print(o_r_arr)
    return sum(o_r_arr) / len(o_r_arr)

In [24]:
# Regenerate the five models, then average the odds ratio over the bootstrapped
# confusion matrices. bootstrap_or() prints the per-matrix values and returns their
# mean, which is displayed as this cell's output.
m1, m2, m3, m4, m5 = generate_models(merged_df)
bootstrap_or(merged_df, m1, m2, m3, m4, m5)

[0.8899959374745883, 0.8898435307077874, 0.8896619266716497, 0.8897183772991344, 0.8895934058255944, 0.8899128538729955, 0.888273217775345, 0.8900140205941309, 0.9027932942358518, 0.8890330213136267]


0.8908839585770704