# Workflow to extract PIP and set information for different methods

In [1]:
%revisions -s -n 10

Revision,Author,Date,Message
,,,
62926ca,Gao Wang,2018-06-25,Update coverage results
f96a5ab,Gao Wang,2018-06-23,Update power table
b2e364b,Gao Wang,2018-06-23,Adjust PIP cutoff for PRC
2cc5551,Gao Wang,2018-06-23,Implement precision-recall curve
bd74ef0,Gao Wang,2018-06-22,New simulations and add FINEMAP
f28ff9b,Gao Wang,2018-06-22,Add FINEMAP PIP calibrated
e19690c,Gao Wang,2018-06-22,Add ROC for other methods
2c06a8c,Gao Wang,2018-06-15,Complete susie-dap hard case comparisons
a769cb1,Gao Wang,2018-06-12,Update to susieR interface change


Previously I ran this specific DSC using:

```
dsc susie.dsc --target run_comparison -o susie_comparison
```

So here I query from that result.

In [2]:
[global]
# Settings shared by every step of this workflow.
# Used as the R working directory (`workdir = cwd`) by the steps below.
cwd = path('~/GIT/github/mvarbvs/dsc')
# Folder holding the DSC output that the steps query and read result files from.
dirname = path(f'{cwd:a}/susie_comparison/')
# Tag embedded in the names of the generated result files.
date = '0622'
# Column of the per-set purity table compared against `ld_cutoff` when filtering SuSiE sets.
ld_col = 1
# Value of fit_susie.prior_var used to select which SuSiE runs are analysed.
susie_prior = 0.1
# Alternative setting, kept for reference:
#susie_prior = 0.05

## Get data

In [3]:
[pip_1, power_1, cali_pip_1, coverage_1, roc_1]
# Query the DSC output once per method and cache the four query tables
# (dap, susie, caviar, finemap) together in a single RDS file.
output: f'{dirname}/PIP_comparison_{date}.rds'
R: expand = '${ }', workdir = cwd
    # Thin wrapper so each method only has to state its target and load.pkl flag.
    run_query = function(target, load.pkl) {
        dscrutils::dscquery(${dirname:br}, target = target, load.pkl = load.pkl)
    }
    # Elements are evaluated in the order listed: dap, susie, caviar, finemap.
    queried = list(
        dap = run_query("liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_dap plot_dap",
                        TRUE),
        susie = run_query("liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_susie.prior_var fit_susie.estimate_residual_variance fit_susie plot_susie",
                          TRUE),
        caviar = run_query("liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_caviar.args fit_caviar plot_caviar",
                           FALSE),
        finemap = run_query("liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_finemap.args fit_finemap plot_finemap",
                            FALSE)
    )
    saveRDS(queried, ${_output:r})

## PIP comparison

In [4]:
[pip_2]
# For every number of simulated signals, collect the PIPs that SuSiE, DAP-G and
# (for at most 3 signals) CAVIAR and FINEMAP assign to the SNPs found in SuSiE
# credible sets, together with a flag saying whether each SNP is a true signal.
# One output file is written per combination of the three options below.
pip_after_filter = ['FALSE', 'TRUE']
est_var = ['FALSE', 'TRUE']
ld_cutoff = [0,0.5]
input: for_each = ['pip_after_filter', 'ld_cutoff', 'est_var'], concurrent = True
output: f'{_input:n}_estvar_{_est_var.lower()}_filter_{_pip_after_filter.lower()}_{str(_ld_cutoff).replace(".", "p")}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    ld_col = ${ld_col}
    ld_cutoff = ${_ld_cutoff}
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    caviar_out = dat$caviar
    susie_out = dat$susie
    finemap_out = dat$finemap
    # favorite susie flavor: keep only runs with the chosen prior variance and
    # residual-variance setting, then drop the columns no longer needed
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))

    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)

    # result[["<n_signal>"]] accumulates one row per SNP across all data sets
    result = list()
    for (s in n_signals) {
        result[[as.character(s)]] = NULL
        # CAVIAR and FINEMAP results are only read when there are at most 3 signals
        if (s > 3) {
            has_caviar = FALSE
        } else {
            has_caviar = TRUE
        }
        print(paste('==============', s, '=============='))
        for (d in data_sets) {
            # file stems of the SuSiE fit, its purity summary and the simulated truth
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file", "plot_susie.output.file", "lm_less.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            # only the first column of `truth` (first entry of the fit) is used here
            for (r in 1:1) {
                signals = which(truth[,r]!=0)
                # optionally drop the rows of alpha whose purity is not above the cutoff
                # before computing the PIP
                if (${_pip_after_filter}) {
                  alpha = fit$alpha[[r]][which(purity$purity[[paste0("V",r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                } else {
                  alpha = fit$alpha[[r]]
                }
                # PIP per column: 1 - prod over rows of (1 - alpha)
                pip = t(1 - apply(1 - alpha, 2, prod))
                # SNP indices that are in at least one set passing the purity cutoff
                in_CI_raw = fit$in_CI[[r]]
                in_CI_raw = in_CI_raw[which(purity$purity[[paste0("V",r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                in_CI = which(colSums(in_CI_raw) > 0)
                pip = pip[in_CI]
                # DAP results are keyed V0, V1, ... hence r-1; rows are put in in_CI order
                out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
                dap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                dap = dap[[paste0("V",r-1)]]$snp
                #print(head(dap, length(pip)))
                dap = dap[which(dap$snp %in% as.character(in_CI)),]
                dap = dap[match(in_CI, dap$snp),]
                #print(dap)
                #print(pip)
                #print(in_CI)
                if (has_caviar) {
                    # pick the CAVIAR / FINEMAP run whose causal-number argument equals s
                    out_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-c', s)),c("fit_caviar.output.file"),drop=FALSE]
                    caviar = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                    caviar = caviar[[r]]$snp
                    caviar = caviar[which(caviar$snp %in% as.character(in_CI)),]
                    caviar = caviar[match(in_CI, caviar$snp),]
                    out_files = finemap_out[which(finemap_out$lm_less.n_signal == s & finemap_out$liter_data.dataset == d & finemap_out$fit_finemap.args == paste('--n-causal-max', s)),c("fit_finemap.output.file"),drop=FALSE]
                    finemap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                    finemap = finemap[[r]]$snp
                    finemap = finemap[which(finemap$snp %in% as.character(in_CI)),]
                    finemap = finemap[match(in_CI, finemap$snp),]
                    # columns: susie, dap, caviar, finemap, is_signal
                    pip = cbind(pip, as.vector(dap$snp_prob), as.vector(caviar$snp_prob), as.vector(finemap$snp_prob), in_CI %in% signals)
                } else {
                    # columns: susie, dap, is_signal
                    pip = cbind(pip, as.vector(dap$snp_prob), in_CI %in% signals)
                }
                ## BEGIN debug
                # SNPs with low SuSiE PIP (column 1) but high DAP PIP (column 2)
                outlier = pip[which(pip[,1] < 0.2 & pip[,2]>0.9), ,drop=F]
                if (nrow(outlier)>0 && s == 1) {
                  print("DAP outlier")
                  print(d)
                }
                if (has_caviar && s == 1) {
                  # SNPs CAVIAR (column 3) is confident about but SuSiE (column 1) is not
                  conflict = pip[which(pip[,1] < 0.95 & pip[,3] > 0.95), ,drop=F]
                  if (nrow(conflict) > 0) {
                      print("CAVIAR-susie conflict")
                      print(d)
                      print("CAVIAR")
                      print(caviar[which(caviar$snp_prob>0.95),])
                      print("susie")
                      print(purity$purity[[paste0("V",r)]][,ld_col])
                      print(rowSums(in_CI_raw))
                  }
                }
                ## END debug
                # append this data set's rows to the table for s signals
                if (is.null(result[[as.character(s)]])) {
                    result[[as.character(s)]] = pip
                } else {
                    result[[as.character(s)]] = rbind(result[[as.character(s)]], pip)
                }
            }
        }
        result[[as.character(s)]] = data.frame(result[[as.character(s)]])
        if (has_caviar) {
            colnames(result[[as.character(s)]]) = c('susie', 'dap', 'caviar', 'finemap', 'is_signal')
        } else {
            colnames(result[[as.character(s)]]) = c('susie', 'dap', 'is_signal')
        }
    }
    saveRDS(result, ${_output:r})

In [4]:
[pip_3]
# Scatter plots of PIPs for every pair of methods, one panel per number of
# causal signals, merged side by side into one PNG per comparison.
comparisons = ['susie_vs_dap', 'susie_vs_caviar', 'susie_vs_finemap', 'dap_vs_caviar', 'dap_vs_finemap', 'caviar_vs_finemap']
input: group_by = 1, concurrent = True
output: paths([f'{_input:n}.{x}.png' for x in comparisons])
R: expand = '${ }'
    result = readRDS(${_input:r})
    # Join <prefix>_1.png ... <prefix>_n.png horizontally into <prefix>.png
    # (ImageMagick `convert`), then delete the per-panel files.
    merge_img = function(prefix, n) {
        files = paste0(prefix, '_', seq_len(n), '.png')
        cmd = paste('convert +append', paste(files, collapse=" "), paste0(prefix, '.png'))
        system(cmd)
        system(paste('rm -f', paste(files, collapse=" ")))
    }
    pip_cutoff = 0
    # Plot method `a` (x axis) against method `b` (y axis) for 1..n causal signals.
    #   prefix           output path without the .png extension
    #   a, b             column names in the result tables
    #   label_a, label_b display names used in the axis labels
    #   n                number of signal settings that have both columns
    #   width            panel width in pixels (height is always 600)
    plot_pair = function(prefix, a, b, label_a, label_b, n, width) {
        for (i in as.character(seq_len(n))) {
            res = result[[i]]
            x = res[res[[a]] > pip_cutoff & res[[b]] > pip_cutoff,]
            # true signals in dark red, everything else in dark blue
            colors = ifelse(x$is_signal, '#800000', '#002b36')
            png(paste0(prefix, '_', i, '.png'), width, 600)
            # The correlation in the title is between the two plotted methods.
            # (cor(x)[1,2] would always be susie vs dap, whatever is plotted.)
            plot(x[[a]], x[[b]], xlab = paste('PIP', label_a, '>', pip_cutoff), ylab = paste('PIP', label_b, '>', pip_cutoff),
                 main = paste('num. causal:', i, '\ncor:', round(cor(x[[a]], x[[b]]),2)),
                 col = colors, pch = 20, cex = 1.5)
            abline(0,1,col=2)
            abline(h=0.95, col='gray')
            abline(v=0.95, col='gray')
            dev.off()
        }
        merge_img(prefix, n)
    }
    # susie and dap are available for all 5 settings; caviar and finemap only for 1..3
    plot_pair(${_output[0]:nr}, 'susie', 'dap', 'SuSiE', 'DAP-G', 5, 400)
    plot_pair(${_output[1]:nr}, 'susie', 'caviar', 'SuSiE', 'CAVIAR', 3, 600)
    plot_pair(${_output[2]:nr}, 'susie', 'finemap', 'SuSiE', 'FINEMAP', 3, 600)
    plot_pair(${_output[3]:nr}, 'dap', 'caviar', 'DAP-G', 'CAVIAR', 3, 600)
    plot_pair(${_output[4]:nr}, 'dap', 'finemap', 'DAP-G', 'FINEMAP', 3, 600)
    plot_pair(${_output[5]:nr}, 'caviar', 'finemap', 'CAVIAR', 'FINEMAP', 3, 600)

## Summary of discovery

In [5]:
[power_2]
# Power analysis: compare the credible sets reported by SuSiE with the signal
# clusters reported by DAP, per number of simulated signals.
# to match with DAP -ld_control 0.25
ld_avg_col = 2
ld_cutoff = 0.25
est_var = ['FALSE', 'TRUE']
# to match with susie 95% mappable CS, we set dap cutoff to 0.95 also
dap_cluster_cutoff = [('cluster_prob', 0.95), ('cluster_avg_r2', 0.25)]
input: for_each = ['dap_cluster_cutoff', 'est_var'], group_by = 1, concurrent = True
output: f'{dirname}/Power_comparison_{date}_{_dap_cluster_cutoff[0]}_estvar_{_est_var.lower()}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    ld_col = ${ld_col}
    ld_cutoff = ${ld_cutoff}
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    susie_out = dat$susie
    # favorite susie flavor
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))
    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)
    # number of replicates analysed per data set
    n_r = 2
    n_experiments = n_r * length(data_sets)
    result = NULL
    for (s in n_signals) {
        # Counters accumulated over all data sets and replicates for this s:
        #   *_signals  true signals captured by at least one set / cluster
        #   *_tdc      sets / clusters that contain a true signal
        #   *_tc       all sets / clusters reported
        #   *_size, *_avg_ld  sums over the sets / clusters counted in *_tdc
        #   *_dc       per-replicate counts, used only by the debug output
        susie_signals = 0
        dap_signals = 0
        susie_avg_ld = 0
        dap_avg_ld = 0
        susie_size = 0
        dap_size = 0
        # fixme: I cannot find a good median tracker so do it stupid way
        susie_sizes = vector()
        dap_sizes = vector()
        susie_tdc = 0
        dap_tdc = 0
        susie_dc = 0
        dap_dc = 0
        susie_tc = 0
        dap_tc = 0
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d), c("fit_susie.output.file", "plot_susie.output.file", "lm_less.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            for (r in 1:n_r) {
                signals = which(truth[,r]!=0)
                # susie in CS: keep only the sets whose purity is above the cutoff
                susie_cs = fit$in_CI[[r]]
                susie_cs_raw = susie_cs[which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                cs_purity = purity$purity[[paste0('V',r)]][which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                susie_cs = list()
                if (nrow(susie_cs_raw) > 0) {
                    for (i in 1:nrow(susie_cs_raw)) {
                      susie_cs[[i]] = which(susie_cs_raw[i,] > 0)
                      if (length(susie_cs[[i]]) == 0) {
                          # an empty set still occupies a slot in susie_cs; cancel
                          # it out here because length(susie_cs) is added below
                          susie_tc = susie_tc - 1
                          next
                      }
                      if (any(signals %in% susie_cs[[i]])) {
                          susie_tdc = susie_tdc + 1
                          susie_size = susie_size + length(susie_cs[[i]])
                          susie_sizes = c(susie_sizes, length(susie_cs[[i]]))
                          susie_avg_ld = susie_avg_ld + cs_purity[i,${ld_avg_col}]
                          susie_dc = susie_dc + 1
                      }
                    }
                    susie_signals = susie_signals + sum(signals %in% unique(unlist(susie_cs)))
                }
                print(paste('==============', s, '=============='))
                print(susie_cs)
                susie_tc = susie_tc + length(susie_cs)
                # DAP in cluster: keep clusters above the chosen cutoff column / value
                # (DAP results are keyed V0, V1, ... hence r-1)
                out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
                dap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                dap_cluster_raw = dap[[paste0('V', r-1)]]$set[which(dap[[paste0('V', r-1)]]$set$${_dap_cluster_cutoff[0]} > ${_dap_cluster_cutoff[1]}), ]
                dap_cluster_ld = dap_cluster_raw$cluster_avg_r2
                dap_cluster_raw = dap_cluster_raw$snp
                dap_cluster = list()
                if (length(dap_cluster_raw) > 0) {
                    for (i in 1:length(dap_cluster_raw)) {
                      # cluster members are stored as a comma-separated string of SNP indices
                      dap_cluster[[i]] = as.integer(unlist(strsplit(dap_cluster_raw[i], ",")))
                      if (any(signals %in% dap_cluster[[i]])) {
                          dap_size = dap_size + length(dap_cluster[[i]])
                          dap_sizes = c(dap_sizes, length(dap_cluster[[i]]))
                          # square root of the average r2 is accumulated for DAP
                          dap_avg_ld = dap_avg_ld + sqrt(dap_cluster_ld[i])
                          dap_tdc = dap_tdc + 1
                          dap_dc = dap_dc + 1
                      }
                    }
                    dap_signals = dap_signals + sum(signals %in% unique(unlist(dap_cluster)))
                }
                print(dap_cluster)
                dap_tc = dap_tc + length(dap_cluster)
                ## BEGIN debug
                ## susie made more true discovery than DAP
                if (susie_dc > dap_dc) {
                  print('DAP miss')
                  print(dap[[paste0('V', r-1)]]$set)
                  print(d)
                }
                ## DAP made some (false) discovery, susie did not
                ## under n = 1
                if (length(dap_cluster) > dap_dc && s == 1) {
                  print('DAP false discovery')
                  print(dap[[paste0('V', r-1)]]$set)
                  print(d)
                }
                ## END debug
                susie_dc = 0
                dap_dc = 0
            }
        }
        rates = c(s, s*n_experiments, susie_tc, dap_tc, susie_signals/s/n_experiments, dap_signals/s/n_experiments, 1 - (susie_tdc/susie_tc), 1 - (dap_tdc/dap_tc), susie_size / susie_tdc, dap_size / dap_tdc, median(susie_sizes), median(dap_sizes), susie_avg_ld / susie_tdc, dap_avg_ld / dap_tdc)
        # Always rbind (rbind(NULL, x) gives a one-row matrix) so that `result`
        # is a matrix even when there is a single n_signal value; a bare vector
        # would make the colnames assignment below fail.
        result = rbind(result, rates)
    }
    colnames(result) = c('n_signal', 'expected_discoveries', 'susie_discoveries', 'dap_discoveries', 'susie_power', 'dap_power', 'susie_fdp', 'dap_fdp', 'susie_avg_size', 'dap_avg_size', 'susie_median_size', 'dap_median_size', 'susie_avg_ld', 'dap_avg_ld')
    rownames(result) = as.character(result[,1])
    saveRDS(data.frame(result), ${_output:r})

## PIP calibration

In [None]:
[cali_pip_2]
# PIP calibration: bin the PIPs of each method and record, per bin, the mean
# PIP, the observed frequency of true signals and the number of SNPs.
est_var = ['FALSE', 'TRUE']
ld_cutoff = 0.25
bin_size = 20
input: for_each = 'est_var', concurrent = True
output: f'{_input:n}.calibrated.estvar_{_est_var.lower()}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    ld_col = ${ld_col}
    ld_cutoff = ${ld_cutoff}
    pip_cutoff = 0
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    caviar_out = dat$caviar
    finemap_out = dat$finemap
    susie_out = dat$susie
    # favorite susie flavor
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))

    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)

    result = list()
    pip_cali = list()
    for (s in n_signals) {
        result[[as.character(s)]] = NULL
        # Every method starts with one placeholder row of zeros; the bin rows are
        # appended below and the placeholder is dropped with [-1,] at the end.
        # finemap must have the placeholder too, otherwise [-1,] would remove
        # its first real bin.
        pip_cali[[as.character(s)]] = list(susie = c(0,0,0), dap = c(0,0,0), caviar = c(0,0,0), finemap = c(0,0,0))
        # CAVIAR and FINEMAP results are only read when there are at most 3 signals
        if (s > 3) {
            has_caviar = FALSE
        } else {
            has_caviar = TRUE
        }
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file", "plot_susie.output.file", "lm_less.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            for (r in c(1,2)) {
                # 0/1 indicator of true signals
                signals = truth[,r]
                signals[which(signals!=0)] = 1
                alpha = fit$alpha[[r]][which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                # susie PIP
                susie = as.vector(t(1 - apply(1 - alpha, 2, prod)))
                #in_CI_raw = fit$in_CI[[r]]
                #in_CI_raw = in_CI_raw[which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                #in_CI = which(colSums(in_CI_raw) > 0)
                # all SNPs are used here, not only those in credible sets
                in_CI = 1:length(susie)
                susie = susie[in_CI]
                # DAP results are keyed V0, V1, ... hence r-1; rows are put in in_CI order
                out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
                dap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                snp = dap[[paste0('V',r-1)]]$snp
                snp = snp[which(snp$snp %in% as.character(in_CI)),]
                snp = snp[match(in_CI, snp$snp),]
                #print(head(snp))
                dap = as.vector(snp$snp_prob)
                if (has_caviar) {
                    # caviar
                    out_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-c', s)),c("fit_caviar.output.file"),drop=FALSE]
                    caviar = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                    snp = caviar[[r]]$snp
                    snp = snp[which(snp$snp %in% as.character(in_CI)),]
                    snp = snp[match(in_CI, snp$snp),]
                    #print(head(snp))
                    caviar = as.vector(snp$snp_prob)
                    # finemap
                    out_files = finemap_out[which(finemap_out$lm_less.n_signal == s & finemap_out$liter_data.dataset == d & finemap_out$fit_finemap.args == paste('--n-causal-max', s)),c("fit_finemap.output.file"),drop=FALSE]
                    finemap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                    snp = finemap[[r]]$snp
                    snp = snp[which(snp$snp %in% as.character(in_CI)),]
                    snp = snp[match(in_CI, snp$snp),]
                    #print(head(snp))
                    finemap = as.vector(snp$snp_prob)
                    pip = cbind(susie, dap, caviar, finemap, signals[in_CI])
                } else {
                    pip = cbind(susie, dap, signals[in_CI])
                }
                if (is.null(result[[as.character(s)]])) {
                    result[[as.character(s)]] = pip
                } else {
                    result[[as.character(s)]] = rbind(result[[as.character(s)]], pip)
                }
            }
        }
        # make data frame
        res = data.frame(result[[as.character(s)]])
        if (has_caviar) {
            colnames(res) = c('susie', 'dap', 'caviar', 'finemap', 'is_signal')
            names = c('susie', 'dap', 'caviar', 'finemap')
        } else {
            colnames(res) = c('susie', 'dap', 'is_signal')
            names = c('susie', 'dap')
        }
        # make bins: one row per bin holding its lower and upper edge
        bins = cbind(seq(1:${bin_size})/${bin_size}-1/${bin_size}, seq(1:${bin_size})/${bin_size})
        for (name in names) {
            for (i in 1:nrow(bins)) {
                # both comparisons are strict, so a PIP equal to a bin edge is in no bin
                tmp = res[which(res[[name]] > bins[i,1] & res[[name]] < bins[i,2]),]
                # row = (sum of PIPs, number of true signals, number of SNPs) in the bin
                pip_cali[[as.character(s)]][[name]] = rbind(pip_cali[[as.character(s)]][[name]], c(sum(tmp[[name]]), sum(tmp$is_signal), length(tmp$is_signal)))
            }
            pip_cali[[as.character(s)]][[name]][which(is.na(pip_cali[[as.character(s)]][[name]]))] = 0
        }
    }
    # pool the per-bin sums over the signal settings each method was run on
    susie = pip_cali[[as.character(1)]]$susie
    for (i in 2:5) susie = susie + pip_cali[[as.character(i)]]$susie
    dap = pip_cali[[as.character(1)]]$dap
    for (i in 2:5) dap = dap + pip_cali[[as.character(i)]]$dap
    # full element name: do not rely on partial matching of `$` on lists
    cav = pip_cali[[as.character(1)]]$caviar
    for (i in 2:3) cav = cav + pip_cali[[as.character(i)]]$caviar
    finemap = pip_cali[[as.character(1)]]$finemap
    for (i in 2:3) finemap = finemap + pip_cali[[as.character(i)]]$finemap
    # turn the sums into the mean PIP and the observed signal frequency per bin
    susie[,c(1,2)] = susie[,c(1,2)] / susie[,3]
    dap[,c(1,2)] = dap[,c(1,2)] / dap[,3]
    cav[,c(1,2)] = cav[,c(1,2)] / cav[,3]
    finemap[,c(1,2)] = finemap[,c(1,2)] / finemap[,3]
    # [-1,] removes the placeholder row added when pip_cali was initialised
    saveRDS(list("SuSiE"=susie[-1,], "DAP-G"=dap[-1,], "CAVIAR"=cav[-1,], "FINEMAP"=finemap[-1,]), ${_output:r})

In [None]:
[cali_pip_3]
# Draw one calibration panel per method and join the panels into a single PNG.
depends: executable('convert')
input: group_by = 1, concurrent = True
output: f'{_input:n}.png'
R: expand = '${ }'
    library(ggplot2)
    # Calibration panel: mean PIP per bin against the observed frequency, with
    # error bars of +/- `se` and the diagonal drawn in red. `title` is the panel title.
    dot_plot = function(dataframe, title) {
        panel = ggplot(dataframe, aes(x=mean_pip, y=observed_freq))
        panel = panel +
          geom_errorbar(aes(ymin=observed_freq-se, ymax=observed_freq+se), colour="gray", size = 0.2, width=.01) +
          geom_point(size=1.5, shape=21, fill="#002b36") + # 21 is filled circle
          xlab("Mean PIP") +
          ylab("Observed frequency") +
          coord_cartesian(ylim=c(0,1), xlim=c(0,1)) +
          geom_abline(slope=1,intercept=0,colour='red', size=0.2) +
          ggtitle(title) +
          expand_limits(y=0) +                        # Expand y range
          theme_bw() +
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.title = element_text(hjust = 0.5))
        panel
    }

    dat = readRDS(${_input:r})
    methods_in_file = names(dat)
    idx = 0
    for (k in seq_along(methods_in_file)) {
      idx = k
      name = methods_in_file[k]
      tbl = dat[[name]]
      # replace the count in column 3 by twice the binomial standard error
      # computed from the observed frequency in column 2
      tbl[,3] = sqrt(tbl[,2] * (1 - tbl[,2]) / tbl[,3]) * 2
      tbl = as.data.frame(tbl)
      colnames(tbl) = c("mean_pip", "observed_freq", "se")
      png(paste0(${_output:nr}, '_' , idx, '.png'), 4, 4, units = 'in', res = 500)
      print(dot_plot(tbl, name))
      dev.off()
    }
    # join the per-method panels side by side, then remove them
    files = paste0(${_output:nr}, '_', seq(1:idx), '.png')
    file_args = paste(files, collapse=" ")
    system(paste('convert +append', file_args, ${_output:r}))
    system(paste('rm -f', file_args))

## Coverage

In [None]:
[coverage_2]
# Count, for several nominal coverage levels, how many SuSiE credible sets do
# and do not contain a true signal.
est_var = ['FALSE', 'TRUE']
ld_cutoff = 0.5
n_signals = 4
input: for_each = 'est_var', concurrent = True
output: f'{dirname}/Coverage_0623_estvar_{_est_var.lower()}.rds'

R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    ld_col = ${ld_col}
    ld_cutoff = ${ld_cutoff}
    dat = readRDS(${_input:r})
    susie_out = dat$susie
    # favorite susie flavor
    chosen = which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var})
    susie_out = susie_out[chosen, ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))

    data_sets = unique(susie_out$liter_data.dataset)
    # use every simulated setting unless an upper limit was configured
    n_signals = if (is.null(${n_signals})) unique(susie_out$lm_less.n_signal) else 1:${n_signals}
    # 1 - coverage for each credible-set level that is evaluated
    miss_levels = c(0.01, 0.05, 0.1, 0.15, 0.2, 0.25)
    # positives[[n_signal]][[level in percent]] = c(sets with a signal, sets without)
    positives = list()
    for (s in n_signals) {
        s_key = as.character(s)
        positives[[s_key]] = list()
        print(paste('========', s, '========'))
        for (d in data_sets) {
            rows = which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d)
            out_files = susie_out[rows,c("fit_susie.output.file", "liter_data.output.file", "lm_less.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            ld_mat = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.ld_mat.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            for (r in c(1,2)) {
                signals = which(truth[,r]!=0)
                for (level in miss_levels) {
                  level_key = as.character(level*100)
                  cs_list = susieR::susie_get_CS(fit$alpha[[r]], coverage=1-level, Xcorr = ld_mat, min_abs_corr = ld_cutoff)$cs
                  if (! level_key %in% names(positives[[s_key]])) {
                      positives[[s_key]][[level_key]] = c(0,0)
                  }
                  for (i in seq_along(cs_list)) {
                      susie_cs = cs_list[[i]]
                      if (length(susie_cs) == 0) {
                          next
                      }
                      hit = any(signals %in% susie_cs)
                      if (!hit) {
                          # report the set that missed every true signal
                          print(paste(d, r, level, i))
                          print(susie_cs)
                          print(signals)
                      }
                      # slot 1 counts sets with a signal, slot 2 sets without
                      slot = if (hit) 1 else 2
                      positives[[s_key]][[level_key]][slot] = positives[[s_key]][[level_key]][slot] + 1
                  }
                }
            }
        }
        print(positives[[s_key]])
    }
    saveRDS(positives, ${_output:r})

## ROC

In [None]:
[roc_2]
pip_after_filter = 'TRUE'
est_var = ['FALSE', 'TRUE']
ld_cutoff = 0.25 # does not really matter
pip_cutoff = 0.4
dap_cluster_cutoff = ('cluster_prob', 0.95)
input: for_each = 'est_var', concurrent = True
output: f'{dirname}/ROC_{date}_estvar_{_est_var.lower()}_two.rds', f'{dirname}/ROC_{date}_estvar_{_est_var.lower()}_all.rds'
R: stdout = f'{_output[0]:n}.log', expand = '${ }', workdir = cwd
    ld_col = ${ld_col}
    ld_cutoff = ${ld_cutoff}
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    susie_out = dat$susie
    caviar_out = dat$caviar
    finemap_out = dat$finemap
    # favorit susie flavor
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))

    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)

    result = list()
    result_all = list()
    for (s in n_signals) {
        result[[as.character(s)]] = list(susie=NULL, dap=NULL)
        result_all[[as.character(s)]] = list(susie=NULL, dap=NULL, caviar=NULL, finemap=NULL)
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file", "plot_susie.output.file", "lm_less.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            for (r in 1:2) {
                signals = truth[,r]
                signals[which(signals!=0)] = 1
                # susie
                if (${pip_after_filter}) {
                  alpha = fit$alpha[[r]][which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                } else {
                  alpha = fit$alpha[[r]]
                }
                pip = 1 - apply(1 - alpha, 2, prod)
                if (is.null(result[[as.character(s)]]$susie)) {
                    result[[as.character(s)]]$susie = cbind(pip, signals)
                    if (s <= 3) result_all[[as.character(s)]]$susie = cbind(pip, signals)
                } else {
                    result[[as.character(s)]]$susie = rbind(result[[as.character(s)]]$susie, cbind(pip, signals))
                    if (s <= 3) result_all[[as.character(s)]]$susie = rbind(result_all[[as.character(s)]]$susie, cbind(pip, signals))
                }
                # dap
                out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
                dap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                pip = dap[[paste0('V',r-1)]]$snp
                pip = pip[order(as.numeric(pip$snp)),]$snp_prob
                if (is.null(result[[as.character(s)]]$dap)) {
                    result[[as.character(s)]]$dap = cbind(pip, signals)
                    if (s <= 3) result_all[[as.character(s)]]$dap = cbind(pip, signals)
                } else {
                    result[[as.character(s)]]$dap = rbind(result[[as.character(s)]]$dap, cbind(pip, signals))
                    if (s <= 3) result_all[[as.character(s)]]$dap = rbind(result_all[[as.character(s)]]$dap, cbind(pip, signals))
                }
                if (s <= 3) {
                  # CAVIAR
                  out_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-c', s)),c("fit_caviar.output.file"),drop=FALSE]
                  caviar = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                  pip = caviar[[r]]$snp
                  pip = pip[order(as.numeric(pip$snp)),]$snp_prob
                  if (is.null(result_all[[as.character(s)]]$caviar)) {
                    result_all[[as.character(s)]]$caviar = cbind(pip, signals)
                  } else {
                    result_all[[as.character(s)]]$caviar = rbind(result_all[[as.character(s)]]$caviar, cbind(pip, signals))
                  }
                  # FINEMAP
                  out_files = finemap_out[which(finemap_out$lm_less.n_signal == s & finemap_out$liter_data.dataset == d & finemap_out$fit_finemap.args == paste('--n-causal-max', s)),c("fit_finemap.output.file"),drop=FALSE]
                  finemap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                  pip = finemap[[r]]$snp
                  pip = pip[order(as.numeric(pip$snp)),]$snp_prob
                  if (is.null(result_all[[as.character(s)]]$finemap)) {
                    result_all[[as.character(s)]]$finemap = cbind(pip, signals)
                  } else {
                    result_all[[as.character(s)]]$finemap = rbind(result_all[[as.character(s)]]$finemap, cbind(pip, signals))
                  }  
                }
            }
        }
    }
  
    # Compute precision-recall counts and rates over a grid of score thresholds.
    #
    # Args:
    #   d1: two-column matrix (or data frame). Column 1 is the score that gets
    #       thresholded and column 2 the 0/1 truth indicator; callers in this
    #       script build it with cbind(pip, signals).
    #   cutoff: c(lower, upper), inclusive bounds on the thresholds to evaluate.
    #       Thresholds come from the grid 1/500, 2/500, ..., 1. The default lower
    #       bound is the workflow's pip_cutoff, substituted before R runs.
    #
    # Returns:
    #   list(counts, rates), one row per threshold in increasing order:
    #     counts: data.frame(true_positive, total_positive, total_signal)
    #     rates:  data.frame(Precision, Recall)
    #   Both have zero rows when no grid threshold falls inside cutoff.
    roc_data = function(d1, cutoff = c(${pip_cutoff}, 1)) {
        grid = 500
        thresholds = seq_len(grid) / grid
        thresholds = thresholds[which(thresholds >= cutoff[1] & thresholds <= cutoff[2])]
        score = d1[, 1]
        label = d1[, 2]
        # Row 1: true positives among the calls; row 2: number of calls.
        # vapply (unlike sapply) still yields a 2 x 0 matrix for an empty grid.
        tally = vapply(thresholds, function(x) {
            called = label[score >= x]
            c(sum(called), length(called))
        }, numeric(2))
        rst1 = data.frame(true_positive = tally[1, ],
                          total_positive = tally[2, ],
                          total_signal = rep(sum(label), length(thresholds)))
        rst2 = data.frame(Precision = rst1$true_positive / rst1$total_positive,
                          Recall = rst1$true_positive / rst1$total_signal)
        # Programs may never report a score as high as the top thresholds, so
        # those rows have zero calls and 0/0 = NaN precision. Use the convention
        # Precision = 1, Recall = 0 for every such row, not only the last one,
        # so the curve has no NaN gap just below its end point.
        no_call = rst1$total_positive == 0
        rst2$Precision[no_call] = 1
        rst2$Recall[no_call] = 0
        return(list(counts = rst1, rates = rst2))
    }

    susie = roc_data(do.call(rbind, lapply(1:length(result), function(i) result[[i]]$susie)))
    dap = roc_data(do.call(rbind, lapply(1:length(result), function(i) result[[i]]$dap)))
    saveRDS(list(data = result, susie = susie, dap = dap), ${_output[0]:r})
    #
    susie = roc_data(do.call(rbind, lapply(1:length(result_all), function(i) result_all[[i]]$susie)))
    dap = roc_data(do.call(rbind, lapply(1:length(result_all), function(i) result_all[[i]]$dap)))
    caviar = roc_data(do.call(rbind, lapply(1:length(result_all), function(i) result_all[[i]]$caviar)))
    finemap = roc_data(do.call(rbind, lapply(1:length(result_all), function(i) result_all[[i]]$finemap)))
    saveRDS(list(data = result_all, susie = susie, dap = dap, finemap=finemap, caviar=caviar), ${_output[1]:r}) 