In [3]:
library(simex)
library(reticulate)

In [4]:
merged_df <- read.csv("combined_mimic_smoking_status_0417.csv")
# head(merged_df)

In [5]:
fml <- 'mort_28_day ~ echo + first_careunit + age + gender + weight + saps + sofa + elix_score + vent + \
            vaso + icu_adm_weekday + icu_adm_hour + icd_chf + icd_afib + icd_renal + icd_liver + icd_copd + \
            icd_cad + icd_stroke + icd_malignancy + vs_heart_rate_first + vs_map_first + vs_temp_first + \
            lab_hemoglobin_first + lab_platelet_first + lab_wbc_first + lab_ph_first + lab_chloride_first + \
            lab_sodium_first + lab_bun_first + lab_bicarbonate_first + lab_pco2_first + lab_creatinine_first + \
            lab_potassium_first + lab_po2_first + lab_lactate_first + sedative + vs_cvp_flag + \
            lab_creatinine_kinase_flag + lab_bnp_flag + lab_troponin_flag + SMOKING_STATUS'
fml

In [6]:
merged_df$SMOKING_STATUS <- as.factor(merged_df$SMOKING_STATUS)

In [7]:
glm_model = glm(as.formula(fml), data = merged_df, family = binomial, na.action = na.exclude)

In [8]:
summary(glm_model)


Call:
glm(formula = as.formula(fml), family = binomial, data = merged_df, 
    na.action = na.exclude)

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 3.5542731  7.4974785   0.474 0.635455    
echo                       -0.1629170  0.1284231  -1.269 0.204585    
first_careunitSICU         -0.1714718  0.1725299  -0.994 0.320288    
age                         0.0196017  0.0049069   3.995 6.48e-05 ***
genderM                     0.2627325  0.1292458   2.033 0.042071 *  
weight                     -0.0076758  0.0028308  -2.712 0.006697 ** 
saps                        0.0941357  0.0169318   5.560 2.70e-08 ***
sofa                        0.2292723  0.0260908   8.787  < 2e-16 ***
elix_score                  0.0016398  0.0382643   0.043 0.965819    
vent                        0.2919105  0.2076957   1.405 0.159881    
vaso                        0.0096973  0.1592693   0.061 0.951450    
icu_adm_weekdaymonday       0.2885723  0.

In [9]:
matrix_error <- matrix(c(8/11, 0, 2/11, 1/11, 4/11, 4/11, 3/11, 0, 
                         1/16, 0, 14/16, 1/16, 1/63, 0, 1/63, 61/63), nrow=4)
matrix_error <- build.mc.matrix(matrix_error)
dimnames(matrix_error) <- list(levels(merged_df$SMOKING_STATUS), 
                               levels(merged_df$SMOKING_STATUS))
matrix_error

Unnamed: 0,1,2,3,4
1,0.7272727,0.3508917,0.0625,0.015873
2,0.0,0.3545172,0.0,0.0
3,0.1818182,0.2634643,0.875,0.015873
4,0.0909091,0.0311268,0.0625,0.968254


In [11]:
tte_smoking_mcsimex <- mcsimex(glm_model, 
                               SIMEXvariable = "SMOKING_STATUS",
                               mc.matrix=matrix_error, 
                               asymptotic = FALSE)

In [12]:
summary(tte_smoking_mcsimex)

Call:
mcsimex(model = glm_model, SIMEXvariable = "SMOKING_STATUS", 
    mc.matrix = matrix_error, asymptotic = FALSE)

Naive model: 
glm(formula = as.formula(fml), family = binomial, data = merged_df, 
    na.action = na.exclude)

Simex variable : SMOKING_STATUS 
Misclassification matrix: 
          1         2      3        4
1 0.7272727 0.3508917 0.0625 0.015873
2 0.0000000 0.3545172 0.0000 0.000000
3 0.1818182 0.2634643 0.8750 0.015873
4 0.0909091 0.0311268 0.0625 0.968254

Number of iterations:  100 

Residuals: 
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-0.924002 -0.208724 -0.065014  0.003324  0.175681  0.983596 

Coefficients: 

Jackknife variance: 
                             Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 3.321e+00  7.609e+00   0.436 0.662543    
echo                       -1.564e-01  1.301e-01  -1.202 0.229595    
first_careunitSICU         -1.560e-01  1.748e-01  -0.892 0.372440    
age                         2.020e-02

In [None]:
# plot(tte_smoking_mcsimex)

### Calculating Risk Ratio using MC-SIMEX

In [75]:
merged_df_0 <- read.csv("combined_mimic_smoking_status_0417.csv")

In [76]:
data <- replace(merged_df_0["echo"], merged_df_0["echo"]>0, 0) 
# print(data)

In [77]:
merged_df_0["echo"] <- data
merged_df_0$SMOKING_STATUS <- as.factor(merged_df_0$SMOKING_STATUS)

In [78]:
predictions_0 = predict(tte_smoking_mcsimex, merged_df_0, type="response")

In [79]:
p0_total = sum(predictions_0, na.rm=T)

In [80]:
merged_df_1 <- read.csv("combined_mimic_smoking_status_0417.csv")

In [81]:
data <- replace(merged_df_1["echo"], merged_df_1["echo"]>-1, 1)

In [82]:
merged_df_1["echo"] <- data
merged_df_1$SMOKING_STATUS <- as.factor(merged_df_1$SMOKING_STATUS)

In [83]:
predictions_1 = predict(tte_smoking_mcsimex, merged_df_1, type="response")

In [84]:
p1_total = sum(predictions_1, na.rm=T)

In [85]:
rr = p1_total / p0_total
print(rr)

[1] 0.9343328


In [87]:
merged_df_0$predictions <- predictions_0
merged_df_1$predictions <- predictions_1

status_1a <- merged_df_0[merged_df_0$SMOKING_STATUS==1,]
status_1a_total = sum(status_1a$predictions, na.rm=T)
status_1b <- merged_df_1[merged_df_1$SMOKING_STATUS==1,]
status_1b_total = sum(status_1b$predictions, na.rm=T)
rr_1 <- status_1b_total / status_1a_total
print(rr_1)
    
status_2a <- merged_df_0[merged_df_0$SMOKING_STATUS==2,]
status_2a_total = sum(status_2a$predictions, na.rm=T)
status_2b <- merged_df_1[merged_df_1$SMOKING_STATUS==2,]
status_2b_total = sum(status_2b$predictions, na.rm=T)
rr_2 <- status_2b_total / status_2a_total
print(rr_2)
    
status_3a <- merged_df_0[merged_df_0$SMOKING_STATUS==3,]
status_3a_total = sum(status_3a$predictions, na.rm=T)
status_3b <- merged_df_1[merged_df_1$SMOKING_STATUS==3,]
status_3b_total = sum(status_3b$predictions, na.rm=T)
rr_3 <- status_3b_total / status_3a_total
print(rr_3)
    
status_4a <- merged_df_0[merged_df_0$SMOKING_STATUS==4,]
status_4a_total = sum(status_4a$predictions, na.rm=T)
status_4b <- merged_df_1[merged_df_1$SMOKING_STATUS==4,]
status_4b_total = sum(status_4b$predictions, na.rm=T)
rr_4 <- status_4b_total / status_4a_total
print(rr_4)

[1] 0.930508
[1] 0.9677904
[1] 0.9114576
[1] 0.9443506


### Calculating Odds Ratio using MC-SIMEX

In [29]:
# (p(M=1 | E=1) * P(M=0 | E=0)) / (p(M=0 | E = 1) * p(M=1 | E=0))
predictions_0b <- 1 - predictions_0
p0b_total <- sum(predictions_0b, na.rm=T)

predictions_1b <- 1 - predictions_1
p1b_total <- sum(predictions_1b, na.rm=T)

or <- (p1_total * p0b_total) / (p1b_total * p0_total)
print(or)


[1] 0.9043839


### Bootstrapping Risk Ratio Via Error Rate Matrices

In [32]:
require("reticulate")
source_python("matrix_script_reader.py")

In [63]:
rr_arr <- list()
status_1_rr_arr <- list()
status_2_rr_arr <- list()
status_3_rr_arr <- list()
status_4_rr_arr <- list()

for (x in 0:9){
    print(x)
    tmp_m_error <- matrix_script_reader(paste("INSERT FILE PATH",
                                              as.character(x),".pkl", 
                                              sep=''))
    tmp_m_error <- build.mc.matrix(tmp_m_error, method="log")
    dimnames(tmp_m_error) <- list(levels(merged_df$SMOKING_STATUS), levels(merged_df$SMOKING_STATUS))
    
    tryCatch({boot_mcsimex <- mcsimex(glm_model, 
                                      SIMEXvariable = "SMOKING_STATUS", 
                                      mc.matrix=tmp_m_error, 
                                      asymptotic = FALSE)}
            , error = function(e) {tmp_m_error <- build.mc.matrix(tmp_m_error, method="jlt"); 
                                   boot_mcsimex <- mcsimex(glm_model, 
                                                           SIMEXvariable = "SMOKING_STATUS", 
                                                           mc.matrix=tmp_m_error, 
                                                           asymptotic = FALSE)} )
    
    merged_df_0 <- read.csv("combined_mimic_smoking_status_0417.csv")
    data_0 <- replace(merged_df_0["echo"], merged_df_0["echo"]>0, 0) 
    merged_df_0["echo"] <- data_0
    merged_df_0$SMOKING_STATUS <- as.factor(merged_df_0$SMOKING_STATUS)
    predictions_0 = predict(boot_mcsimex, merged_df_0, type="response")
    p0_total = sum(predictions_0, na.rm=T)
    
    merged_df_1 <- read.csv("combined_mimic_smoking_status_0417.csv")
    data_1 <- replace(merged_df_1["echo"], merged_df_1["echo"]>-1, 1)
    merged_df_1["echo"] <- data_1
    merged_df_1$SMOKING_STATUS <- as.factor(merged_df_1$SMOKING_STATUS)
    predictions_1 = predict(boot_mcsimex, merged_df_1, type="response")
    p1_total = sum(predictions_1, na.rm=T)
    
    merged_df_0$predictions <- predictions_0
    merged_df_1$predictions <- predictions_1
    
    status_1a <- merged_df_0[merged_df_0$SMOKING_STATUS==1,]
    status_1a_total = sum(status_1a$predictions, na.rm=T)
    status_1b <- merged_df_1[merged_df_1$SMOKING_STATUS==1,]
    status_1b_total = sum(status_1b$predictions, na.rm=T)
    status_1_rr_arr <- append(status_1_rr_arr, status_1b_total / status_1a_total)
    
    status_2a <- merged_df_0[merged_df_0$SMOKING_STATUS==2,]
    status_2a_total = sum(status_2a$predictions, na.rm=T)
    status_2b <- merged_df_1[merged_df_1$SMOKING_STATUS==2,]
    status_2b_total = sum(status_2b$predictions, na.rm=T)
    status_2_rr_arr <- append(status_2_rr_arr, status_2b_total / status_2a_total)
    
    status_3a <- merged_df_0[merged_df_0$SMOKING_STATUS==3,]
    status_3a_total = sum(status_3a$predictions, na.rm=T)
    status_3b <- merged_df_1[merged_df_1$SMOKING_STATUS==3,]
    status_3b_total = sum(status_3b$predictions, na.rm=T)
    status_3_rr_arr <- append(status_3_rr_arr, status_3b_total / status_3a_total)
    
    status_4a <- merged_df_0[merged_df_0$SMOKING_STATUS==4,]
    status_4a_total = sum(status_4a$predictions, na.rm=T)
    status_4b <- merged_df_1[merged_df_1$SMOKING_STATUS==4,]
    status_4b_total = sum(status_4b$predictions, na.rm=T)
    status_4_rr_arr <- append(status_4_rr_arr, status_4b_total / status_4a_total)
    
    rr = p1_total / p0_total
    rr_arr <- append(rr_arr, rr)
    print(rr)
    
}

[1] 0
[1] 0.9316413
[1] 1
[1] 0.9343548
[1] 2
[1] 0.9320158
[1] 3
[1] 0.933391
[1] 4
[1] 0.9366119
[1] 5
[1] 0.9303805
[1] 6
[1] 0.9303805
[1] 7
[1] 0.9325448
[1] 8
[1] 0.9311083
[1] 9
[1] 0.9312361


In [69]:
quantile(unlist(rr_arr), c(0.025, 0.975))

In [64]:
# Smoking Status 1 CI
quantile(unlist(status_1_rr_arr), c(0.025, 0.975))

In [65]:
# Smoking Status 2 CI
quantile(unlist(status_2_rr_arr), c(0.025, 0.975))

In [66]:
# Smoking Status 3 CI
quantile(unlist(status_3_rr_arr), c(0.025, 0.975))

In [67]:
# Smoking Status 4 CI
quantile(unlist(status_4_rr_arr), c(0.025, 0.975))

### Bootstrapping Risk Ratio via Sampling Dataframe

In [68]:
rr_df_arr <- list()
status_1_rr_df_arr <- list()
status_2_rr_df_arr <- list()
status_3_rr_df_arr <- list()
status_4_rr_df_arr <- list()
for (x in 0:9){

    sampled_df <- merged_df[sample(nrow(merged_df), size=nrow(merged_df), replace=TRUE), ]
    sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)
    sampled_glm_model <- glm(as.formula(fml), 
                             data = sampled_df, 
                             family = binomial, 
                             na.action = na.exclude)
    
    matrix_error <- matrix(c(8/11, 0, 2/11, 1/11, 4/11, 4/11, 3/11, 
                             0, 1/16, 0, 14/16, 1/16, 1/63, 0, 1/63, 
                             61/63), nrow=4)
    matrix_error <- build.mc.matrix(matrix_error)
    dimnames(matrix_error) <- list(levels(merged_df$SMOKING_STATUS), levels(merged_df$SMOKING_STATUS))
    
    tryCatch({sampled_mc_simex_model <- mcsimex(sampled_glm_model, 
                                                SIMEXvariable = "SMOKING_STATUS", 
                                                mc.matrix=matrix_error, 
                                                asymptotic = FALSE)}
            , error = function(e) {sampled_df <- merged_df[sample(nrow(merged_df), 
                                                                  size=nrow(merged_df)-1, 
                                                                  replace=TRUE), ]; 
                                   sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)
                                   sampled_glm_model <- glm(as.formula(fml), 
                                                            data = sampled_df, 
                                                            family = binomial, 
                                                            na.action = na.exclude); 
                                   sampled_mc_simex_model <- mcsimex(sampled_glm_model, 
                                                                     SIMEXvariable = "SMOKING_STATUS", 
                                                                     mc.matrix=matrix_error, 
                                                                     asymptotic = FALSE)} )
    
    sampled_df_0 <- sampled_df
    sampled_data_0 <- replace(sampled_df_0["echo"], sampled_df_0["echo"]>0, 0) 
    sampled_df_0["echo"] <- sampled_data_0
    sampled_df_0$SMOKING_STATUS <- as.factor(sampled_df_0$SMOKING_STATUS)
    sampled_predictions_0 = predict(sampled_mc_simex_model, sampled_df_0, type="response")
    sampled_p0_total = sum(sampled_predictions_0, na.rm=T)
    
    sampled_df_1 <- sampled_df
    sampled_data_1 <- replace(sampled_df_1["echo"], sampled_df_1["echo"]>-1, 1)
    sampled_df_1["echo"] <- sampled_data_1
    sampled_df_1$SMOKING_STATUS <- as.factor(sampled_df_1$SMOKING_STATUS)
    sampled_predictions_1 = predict(sampled_mc_simex_model, sampled_df_1, type="response")
    sampled_p1_total = sum(sampled_predictions_1, na.rm=T)
    
    sampled_df_0$predictions <- sampled_predictions_0
    sampled_df_1$predictions <- sampled_predictions_1
    
    status_1a <- sampled_df_0[sampled_df_0$SMOKING_STATUS==1,]
    status_1a_total = sum(status_1a$predictions, na.rm=T)
    status_1b <- sampled_df_1[sampled_df_1$SMOKING_STATUS==1,]
    status_1b_total = sum(status_1b$predictions, na.rm=T)
    status_1_rr_df_arr <- append(status_1_rr_df_arr, status_1b_total / status_1a_total)
    
    status_2a <- sampled_df_0[sampled_df_0$SMOKING_STATUS==2,]
    status_2a_total = sum(status_2a$predictions, na.rm=T)
    status_2b <- sampled_df_1[sampled_df_1$SMOKING_STATUS==2,]
    status_2b_total = sum(status_2b$predictions, na.rm=T)
    status_2_rr_df_arr <- append(status_2_rr_df_arr, status_2b_total / status_2a_total)
    
    status_3a <- sampled_df_0[sampled_df_0$SMOKING_STATUS==3,]
    status_3a_total = sum(status_3a$predictions, na.rm=T)
    status_3b <- sampled_df_1[sampled_df_1$SMOKING_STATUS==3,]
    status_3b_total = sum(status_3b$predictions, na.rm=T)
    status_3_rr_df_arr <- append(status_3_rr_df_arr, status_3b_total / status_3a_total)
    
    status_4a <- sampled_df_0[sampled_df_0$SMOKING_STATUS==4,]
    status_4a_total = sum(status_4a$predictions, na.rm=T)
    status_4b <- sampled_df_1[sampled_df_1$SMOKING_STATUS==4,]
    status_4b_total = sum(status_4b$predictions, na.rm=T)
    status_4_rr_df_arr <- append(status_4_rr_df_arr, status_4b_total / status_4a_total)

    sample_rr = sampled_p1_total / sampled_p0_total
    rr_df_arr <- append(rr_df_arr, sample_rr)
    print(sample_rr)
}

[1] 0.9177163
[1] 0.9192531
[1] 0.918995
[1] 0.8905925
[1] 0.9624975
[1] 0.9466161
[1] 0.903054
[1] 0.9608147
[1] 0.9609693
[1] 0.9609696


In [70]:
quantile(unlist(rr_df_arr), c(.025, 0.975))

In [71]:
# Smoking Status 1 CI
quantile(unlist(status_1_rr_df_arr), c(0.025, 0.975))

In [72]:
# Smoking Status 2 CI
quantile(unlist(status_2_rr_df_arr), c(0.025, 0.975))

In [73]:
# Smoking Status 3 CI
quantile(unlist(status_3_rr_df_arr), c(0.025, 0.975))

In [74]:
# Smoking Status 4 CI
quantile(unlist(status_4_rr_df_arr), c(0.025, 0.975))

### Bootstrapping Risk Ratio combining both strategies

In [43]:
rr_combined_arr <- list()
for (x in 0:9){

    sampled_df <- merged_df[sample(nrow(merged_df), size=nrow(merged_df), replace=TRUE), ]
    sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)
    sampled_glm_model <- glm(as.formula(fml), data = sampled_df, family = binomial, na.action = na.exclude)
    
    
    for (y in 0:9){
        
        tmp_m_error <- matrix_script_reader(paste("INSERT FILE PATH",
                                                  as.character(y),".pkl", 
                                                  sep=''))
        tmp_m_error <- build.mc.matrix(tmp_m_error, method="log")
        dimnames(tmp_m_error) <- list(levels(sampled_df$SMOKING_STATUS), levels(sampled_df$SMOKING_STATUS))
        
        tryCatch({sampled_mc_simex_model <- mcsimex(sampled_glm_model, 
                                                    SIMEXvariable = "SMOKING_STATUS", 
                                                    mc.matrix=tmp_m_error, 
                                                    asymptotic = FALSE)}
            , error = function(e) {sampled_df <- merged_df[sample(nrow(merged_df), 
                                                                  size=nrow(merged_df)-1, 
                                                                  replace=TRUE), ]; 
                                   sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)
                                   sampled_glm_model <- glm(as.formula(fml), 
                                                            data = sampled_df, 
                                                            family = binomial, 
                                                            na.action = na.exclude);
                                   tmp_m_error <- build.mc.matrix(tmp_m_error, method="jlt");
                                   sampled_mc_simex_model <- mcsimex(sampled_glm_model, 
                                                                     SIMEXvariable = "SMOKING_STATUS", 
                                                                     mc.matrix=tmp_m_error, 
                                                                     asymptotic = FALSE)} )
        sampled_df_0 <- sampled_df
        sampled_data_0 <- replace(sampled_df_0["echo"], sampled_df_0["echo"]>0, 0) 
        sampled_df_0["echo"] <- sampled_data_0
        sampled_df_0$SMOKING_STATUS <- as.factor(sampled_df_0$SMOKING_STATUS)
        sampled_predictions_0 = predict(sampled_mc_simex_model, sampled_df_0, type="response")
        sampled_p0_total = sum(sampled_predictions_0, na.rm=T)

        sampled_df_1 <- sampled_df
        sampled_data_1 <- replace(sampled_df_1["echo"], sampled_df_1["echo"]>-1, 1)
        sampled_df_1["echo"] <- sampled_data_1
        sampled_df_1$SMOKING_STATUS <- as.factor(sampled_df_1$SMOKING_STATUS)
        sampled_predictions_1 = predict(sampled_mc_simex_model, sampled_df_1, type="response")
        sampled_p1_total = sum(sampled_predictions_1, na.rm=T)

        sample_rr = sampled_p1_total / sampled_p0_total
        rr_combined_arr <- append(rr_combined_arr, sample_rr)
        print(sample_rr)
           
    }
    
}

[1] 0.8498759
[1] 0.8547966
[1] 0.8494987
[1] 0.8505417
[1] 0.857892
[1] 0.8506836
[1] 0.8506836
[1] 0.8522792
[1] 0.8505662
[1] 0.8507709
[1] 0.8829385
[1] 0.8815192
[1] 0.8804538
[1] 0.881803
[1] 0.8782177
[1] 0.8834176
[1] 0.8834176
[1] 0.8824677
[1] 0.8830197
[1] 0.8812688
[1] 0.9178056
[1] 0.9169759
[1] 0.9148318
[1] 0.9172119
[1] 0.9174504
[1] 0.9179122
[1] 0.9179122
[1] 0.9168176
[1] 0.9168748
[1] 0.9166274
[1] 0.8943567
[1] 0.8936341
[1] 0.8909847
[1] 0.8910933
[1] 0.8968332
[1] 0.8923147
[1] 0.8923147
[1] 0.8946779
[1] 0.8939735
[1] 0.8914655
[1] 0.9099752
[1] 0.9074819
[1] 0.9072579
[1] 0.9080405
[1] 0.9102136
[1] 0.9089579
[1] 0.9089579
[1] 0.9092853
[1] 0.9083634
[1] 0.909059
[1] 0.9191958
[1] 0.9199039
[1] 0.9164768
[1] 0.9179599
[1] 0.9192645
[1] 0.918949
[1] 0.918949
[1] 0.9189645
[1] 0.9166585
[1] 0.918716
[1] 1.009724
[1] 1.014891
[1] 1.010643
[1] 1.012132
[1] 1.017634
[1] 1.007861
[1] 1.007861
[1] 1.008069
[1] 1.00921
[1] 1.008941
[1] 0.9159847
[1] 0.9121385
[1] 0.911

In [44]:
quantile(unlist(rr_combined_arr))

In [47]:
quantile(unlist(rr_combined_arr), c(0.025, 0.975))

### Bootstrapping Odds Ratio via Error Rate Matrices

In [33]:
or_arr <- list()
for (x in 0:9){
    print(x)
    tmp_m_error <- matrix_script_reader(paste("INSERT FILE PATH",
                                              as.character(x),".pkl", 
                                              sep=''))
    tmp_m_error <- build.mc.matrix(tmp_m_error, method="log")
    dimnames(tmp_m_error) <- list(levels(merged_df$SMOKING_STATUS), levels(merged_df$SMOKING_STATUS))
    
    tryCatch({boot_mcsimex <- mcsimex(glm_model, 
                                      SIMEXvariable = "SMOKING_STATUS", 
                                      mc.matrix=tmp_m_error, 
                                      asymptotic = FALSE)}
            , error = function(e) {tmp_m_error <- build.mc.matrix(tmp_m_error, method="jlt"); 
                                   boot_mcsimex <- mcsimex(glm_model, 
                                                           SIMEXvariable = "SMOKING_STATUS", 
                                                           mc.matrix=tmp_m_error, 
                                                           asymptotic = FALSE)} )
    
    merged_df_0 <- read.csv("combined_mimic_smoking_status_0417.csv")
    data_0 <- replace(merged_df_0["echo"], merged_df_0["echo"]>0, 0) 
    merged_df_0["echo"] <- data_0
    merged_df_0$SMOKING_STATUS <- as.factor(merged_df_0$SMOKING_STATUS)
    predictions_0 = predict(boot_mcsimex, merged_df_0, type="response")
    p0_total = sum(predictions_0, na.rm=T)
    
    merged_df_1 <- read.csv("combined_mimic_smoking_status_0417.csv")
    data_1 <- replace(merged_df_1["echo"], merged_df_1["echo"]>-1, 1)
    merged_df_1["echo"] <- data_1
    merged_df_1$SMOKING_STATUS <- as.factor(merged_df_1$SMOKING_STATUS)
    predictions_1 = predict(boot_mcsimex, merged_df_1, type="response")
    p1_total = sum(predictions_1, na.rm=T)
    
    
    predictions_0b <- 1 - predictions_0
    p0b_total <- sum(predictions_0b, na.rm=T)

    predictions_1b <- 1 - predictions_1
    p1b_total <- sum(predictions_1b, na.rm=T)

    or <- (p1_total * p0b_total) / (p1b_total * p0_total)
    or_arr <- append(or_arr, or)
    print(or)
    
}

[1] 0
[1] 0.9013468
[1] 1
[1] 0.9044345
[1] 2
[1] 0.8999322
[1] 3
[1] 0.9024599
[1] 4
[1] 0.9085702
[1] 5
[1] 0.8986585
[1] 6
[1] 0.8986585
[1] 7
[1] 0.898196
[1] 8
[1] 0.8997136
[1] 9
[1] 0.8980627


In [34]:
quantile(unlist(or_arr), c(0.025, 0.975))

### Bootstrapping Odds Ratio via Sampling Dataframe

In [36]:
or_df_arr <- list()
for (x in 0:9){

    sampled_df <- merged_df[sample(nrow(merged_df), size=nrow(merged_df), replace=TRUE), ]
    sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)
    sampled_glm_model <- glm(as.formula(fml), 
                             data = sampled_df, 
                             family = binomial, 
                             na.action = na.exclude)
    
    matrix_error <- matrix(c(8/11, 0, 2/11, 1/11, 4/11, 4/11, 3/11, 
                             0, 1/16, 0, 14/16, 1/16, 1/63, 0, 1/63, 
                             61/63), nrow=4)
    matrix_error <- build.mc.matrix(matrix_error)
    dimnames(matrix_error) <- list(levels(merged_df$SMOKING_STATUS), levels(merged_df$SMOKING_STATUS))
    
    tryCatch({sampled_mc_simex_model <- mcsimex(sampled_glm_model, 
                                                SIMEXvariable = "SMOKING_STATUS", 
                                                mc.matrix=matrix_error, 
                                                asymptotic = FALSE)}
            , error = function(e) {sampled_df <- merged_df[sample(nrow(merged_df), 
                                                                  size=nrow(merged_df)-1, 
                                                                  replace=TRUE), ]; 
                                   sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)
                                   sampled_glm_model <- glm(as.formula(fml), 
                                                            data = sampled_df, 
                                                            family = binomial, 
                                                            na.action = na.exclude); 
                                   sampled_mc_simex_model <- mcsimex(sampled_glm_model, 
                                                                     SIMEXvariable = "SMOKING_STATUS", 
                                                                     mc.matrix=matrix_error, 
                                                                     asymptotic = FALSE)} )
    
    sampled_df_0 <- sampled_df
    sampled_data_0 <- replace(sampled_df_0["echo"], sampled_df_0["echo"]>0, 0) 
    sampled_df_0["echo"] <- sampled_data_0
    sampled_df_0$SMOKING_STATUS <- as.factor(sampled_df_0$SMOKING_STATUS)
    sampled_predictions_0 = predict(sampled_mc_simex_model, sampled_df_0, type="response")
    sampled_p0_total = sum(sampled_predictions_0, na.rm=T)
    
    sampled_df_1 <- sampled_df
    sampled_data_1 <- replace(sampled_df_1["echo"], sampled_df_1["echo"]>-1, 1)
    sampled_df_1["echo"] <- sampled_data_1
    sampled_df_1$SMOKING_STATUS <- as.factor(sampled_df_1$SMOKING_STATUS)
    sampled_predictions_1 = predict(sampled_mc_simex_model, sampled_df_1, type="response")
    sampled_p1_total = sum(sampled_predictions_1, na.rm=T)
    
    sampled_predictions_0b <- 1 - sampled_predictions_0
    sampled_p0b_total <- sum(sampled_predictions_0b, na.rm=T)
    
    sampled_predictions_1b <- 1 - sampled_predictions_1
    sampled_p1b_total <- sum(sampled_predictions_1b, na.rm=T)

    sample_or <- (sampled_p1_total * sampled_p0b_total) / (sampled_p1b_total * sampled_p0_total)
    or_df_arr <- append(or_df_arr, sample_or)
    print(sample_or)
}

[1] 0.9597897
[1] 0.8355193
[1] 0.7804283
[1] 0.7817963
[1] 0.7811384
[1] 0.7555916
[1] 0.8333176
[1] 0.838733
[1] 0.9276948
[1] 0.9031998


In [38]:
quantile(unlist(or_df_arr), c(0.025, 0.975))

### Bootstrapping Odds Ratio combining both strategies

In [39]:
or_combined_arr <- list()
for (x in 0:9){

    sampled_df <- merged_df[sample(nrow(merged_df), size=nrow(merged_df), replace=TRUE), ]
    sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)
    sampled_glm_model <- glm(as.formula(fml), data = sampled_df, family = binomial, na.action = na.exclude)
    
    
    for (y in 0:9){
        
        tmp_m_error <- matrix_script_reader(paste("INSERT FILE PATH",
                                                  as.character(y),".pkl", 
                                                  sep=''))
        tmp_m_error <- build.mc.matrix(tmp_m_error, method="log")
        dimnames(tmp_m_error) <- list(levels(sampled_df$SMOKING_STATUS), levels(sampled_df$SMOKING_STATUS))
        
        tryCatch({sampled_mc_simex_model <- mcsimex(sampled_glm_model, 
                                                    SIMEXvariable = "SMOKING_STATUS", 
                                                    mc.matrix=tmp_m_error, 
                                                    asymptotic = FALSE)}
            , error = function(e) {sampled_df <- merged_df[sample(nrow(merged_df), 
                                                                  size=nrow(merged_df)-1, 
                                                                  replace=TRUE), ]; 
                                   sampled_df$SMOKING_STATUS <- as.factor(sampled_df$SMOKING_STATUS)
                                   sampled_glm_model <- glm(as.formula(fml), 
                                                            data = sampled_df, 
                                                            family = binomial, 
                                                            na.action = na.exclude);
                                   tmp_m_error <- build.mc.matrix(tmp_m_error, method="jlt");
                                   sampled_mc_simex_model <- mcsimex(sampled_glm_model, 
                                                                     SIMEXvariable = "SMOKING_STATUS", 
                                                                     mc.matrix=tmp_m_error, 
                                                                     asymptotic = FALSE)} )
        sampled_df_0 <- sampled_df
        sampled_data_0 <- replace(sampled_df_0["echo"], sampled_df_0["echo"]>0, 0) 
        sampled_df_0["echo"] <- sampled_data_0
        sampled_df_0$SMOKING_STATUS <- as.factor(sampled_df_0$SMOKING_STATUS)
        sampled_predictions_0 = predict(sampled_mc_simex_model, sampled_df_0, type="response")
        sampled_p0_total = sum(sampled_predictions_0, na.rm=T)

        sampled_df_1 <- sampled_df
        sampled_data_1 <- replace(sampled_df_1["echo"], sampled_df_1["echo"]>-1, 1)
        sampled_df_1["echo"] <- sampled_data_1
        sampled_df_1$SMOKING_STATUS <- as.factor(sampled_df_1$SMOKING_STATUS)
        sampled_predictions_1 = predict(sampled_mc_simex_model, sampled_df_1, type="response")
        sampled_p1_total = sum(sampled_predictions_1, na.rm=T)
        
        sampled_predictions_0b <- 1 - sampled_predictions_0
        sampled_p0b_total <- sum(sampled_predictions_0b, na.rm=T)

        sampled_predictions_1b <- 1 - sampled_predictions_1
        sampled_p1b_total <- sum(sampled_predictions_1b, na.rm=T)

        sample_or <- (sampled_p1_total * sampled_p0b_total) / (sampled_p1b_total * sampled_p0_total)
        or_combined_arr <- append(or_combined_arr, sample_or)
        print(sample_or)
           
    }
    
}

[1] 0.8307577
[1] 0.8349587
[1] 0.8326337
[1] 0.8305974
[1] 0.8358218
[1] 0.8253645
[1] 0.8253645
[1] 0.8284771
[1] 0.8315031
[1] 0.8288004
[1] 0.8571802
[1] 0.8613284
[1] 0.8622972
[1] 0.8616072
[1] 0.8593255
[1] 0.8558702
[1] 0.8558702
[1] 0.8586496
[1] 0.8608632
[1] 0.8631997
[1] 0.9027386
[1] 0.9036023
[1] 0.9011807
[1] 0.9030251
[1] 0.9060547
[1] 0.894438
[1] 0.894438
[1] 0.898146
[1] 0.9020912
[1] 0.8995272
[1] 0.8718458
[1] 0.8761887
[1] 0.871458
[1] 0.873121
[1] 0.8759731
[1] 0.8715824
[1] 0.8715824
[1] 0.8706098
[1] 0.8708969
[1] 0.8723712
[1] 0.9971386
[1] 1.001183
[1] 0.9943017
[1] 0.9971899
[1] 1.006179
[1] 0.991759
[1] 0.991759
[1] 0.9912891
[1] 0.9969101
[1] 0.9953207
[1] 0.9042866
[1] 0.9051599
[1] 0.9023571
[1] 0.9060004
[1] 0.9001079
[1] 0.9051268
[1] 0.9051268
[1] 0.9106352
[1] 0.9033449
[1] 0.9080424
[1] 0.8678863
[1] 0.8728354
[1] 0.8699545
[1] 0.8687646
[1] 0.875112
[1] 0.8633982
[1] 0.8633982
[1] 0.8664942
[1] 0.8669841
[1] 0.8667989
[1] 0.8566376
[1] 0.8689318
[1

In [40]:
quantile(unlist(or_combined_arr), c(0.025, 0.975))