# Workflow to extract PIP and set information for different methods

In [1]:
%revisions -s

Revision,Author,Date,Message
,,,
4bbe27e,Gao Wang,2018-06-11,Add median size to power comparison
24e1fbc,Gao Wang,2018-06-07,Update revision table
507ab0e,Gao Wang,2018-06-06,Add coverage check
27ea530,Gao Wang,2018-06-06,Compute ROC by random blocks
9319e28,Gao Wang,2018-06-06,Fix ROC calculation
85bdece,Gao Wang,2018-06-06,Add average size and LD for susie vs dap
1085909,Gao Wang,2018-06-06,Update power comparisons to use both replicates per gene
2ad462b,Gao Wang,2018-06-06,Add power comparisons
102a45f,Gao Wang,2018-06-06,Add ROC draft


Previously I ran this specific DSC using:

```
dsc susie.dsc --target run_comparison -o susie_comparison
```

So here I query from that result.

In [2]:
[global]
# Settings shared by all steps of this workflow.
# Column of the susie purity table that is compared against the LD cutoff.
ld_col = 1
# Only susie fits run with this fit_susie.prior_var value are analysed.
susie_prior = 0.1
#susie_prior = 0.05
# DSC project directory and the folder holding the queried benchmark output.
cwd = path('~/GIT/github/mvarbvs/dsc')
dirname = path(f'{cwd:a}/susie_comparison/')

## Get data

In [3]:
[pip_1, power_1, cali_pip_1, coverage_1, roc_1]
# Query the DSC output once per method and bundle the three tables in a single RDS file
output: f'{dirname}/PIP_comparison_0528.rds'
R: expand = '${ }', workdir = cwd
    # DAP results
    dap_tbl <- dscrutils::dscquery(
        ${dirname:br},
        target = "liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_dap plot_dap",
        load.pkl = TRUE
    )
    # susie results, including the settings later used to pick one configuration
    susie_tbl <- dscrutils::dscquery(
        ${dirname:br},
        target = "liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_susie.prior_var fit_susie.estimate_residual_variance fit_susie plot_susie",
        load.pkl = TRUE
    )
    # CAVIAR results; note that this query is made with load.pkl = FALSE
    caviar_tbl <- dscrutils::dscquery(
        ${dirname:br},
        target = "liter_data.dataset liter_data lm_less lm_less.pve lm_less.n_signal fit_caviar.args fit_caviar plot_caviar",
        load.pkl = FALSE
    )
    saveRDS(list(dap = dap_tbl, susie = susie_tbl, caviar = caviar_tbl), ${_output:r})

## PIP comparison

In [4]:
[pip_2]
# Pairwise PIP scatter plots (susie vs DAP, susie vs CAVIAR, DAP vs CAVIAR),
# one panel per number of causal variables
pip_after_filter = ['FALSE', 'TRUE']
est_var = ['FALSE', 'TRUE']
ld_cutoff = [0,0.25]
input: for_each = ['pip_after_filter', 'ld_cutoff', 'est_var'], concurrent = True
output: paths([f'{_input:n}.{x}_estvar_{_est_var.lower()}_filter_{_pip_after_filter.lower()}_{str(_ld_cutoff).replace(".", "p")}.png' for x in ['susie_dap', 'susie_caviar', 'dap_caviar']])
R: stdout = f'{_output[0]:n}.log', expand = '${ }', workdir = cwd
    ld_col = ${ld_col}
    ld_cutoff = ${_ld_cutoff}
    pip_cutoff = 0
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    caviar_out = dat$caviar
    susie_out = dat$susie
    # favorite susie flavor: keep a single prior variance / residual variance setting
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))

    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)

    # result[[n_signal]] collects one row per variable found in a retained susie CS,
    # with the PIP of each method and whether the variable is a true signal
    result = list()
    for (s in n_signals) {
        key = as.character(s)
        # CAVIAR results are only looked up for at most 3 causal variables
        has_caviar = (s <= 3)
        print(paste('==============', s, '=============='))
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file", "plot_susie.output.file", "lm_less.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            # only the first replicate is used in this step
            signals = which(truth[,1]!=0)
            # rows (credible sets) whose value in column ld_col of the purity table exceeds the cutoff
            pure_cs = which(purity$purity$V1[,ld_col] > ld_cutoff)
            if (${_pip_after_filter}) {
              alpha = fit$alpha[[1]][pure_cs,,drop=FALSE]
            } else {
              alpha = fit$alpha[[1]]
            }
            # susie PIP: one minus the product over rows of (1 - alpha)
            pip = t(1 - apply(1 - alpha, 2, prod))
            in_CI_raw = fit$in_CI[[1]][pure_cs,,drop=FALSE]
            # variables that belong to at least one retained credible set
            in_CI = which(colSums(in_CI_raw) > 0)
            pip = pip[in_CI]
            out_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
            dap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            dap = dap$V0$snp
            # restrict DAP output to the same variables and put it in the same order
            dap = dap[which(dap$snp %in% as.character(in_CI)),]
            dap = dap[match(in_CI, dap$snp),]
            if (has_caviar) {
                out_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-c', s)),c("fit_caviar.output.file"),drop=FALSE]
                caviar = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                caviar = caviar[[1]]$snp
                caviar = caviar[which(caviar$snp %in% as.character(in_CI)),]
                caviar = caviar[match(in_CI, caviar$snp),]
                pip = cbind(pip, as.vector(dap$snp_prob), as.vector(caviar$snp_prob), in_CI %in% signals)
            } else {
                pip = cbind(pip, as.vector(dap$snp_prob), in_CI %in% signals)
            }
            ## BEGIN debug
            outlier = pip[which(pip[,1] < 0.2 & pip[,2]>0.9), ,drop=FALSE]
            if (nrow(outlier)>0 && s == 1) {
              print("DAP outlier")
              print(d)
            }
            if (has_caviar && s == 1) {
              conflict = pip[which(pip[,1] < 0.95 & pip[,3] > 0.95), ,drop=FALSE]
              if (nrow(conflict) > 0) {
                  print("CAVIAR-susie conflict")
                  print(d)
                  print("CAVIAR")
                  print(caviar[which(caviar$snp_prob>0.95),])
                  print("susie")
                  print(purity$purity$V1[,ld_col])
                  print(rowSums(in_CI_raw))
              }
            }
            ## END debug
            # rbind() with a NULL first argument simply returns the new rows
            result[[key]] = rbind(result[[key]], pip)
        }
        result[[key]] = data.frame(result[[key]])
        if (has_caviar) {
            colnames(result[[key]]) = c('susie', 'dap', 'caviar', 'is_signal')
        } else {
            colnames(result[[key]]) = c('susie', 'dap', 'is_signal')
        }
    }

    # Draw one scatter panel per entry of n_causal for the PIP columns `a` (x axis)
    # and `b` (y axis) of `result`. The correlation in the title is computed from the
    # two plotted columns, so each comparison reports its own correlation.
    plot_pip_pairs = function(n_causal, a, b, label_a, label_b) {
        for (i in as.character(n_causal)) {
            res = result[[i]]
            x = res[res[[a]] > pip_cutoff & res[[b]] > pip_cutoff,]
            # true signals in dark red, everything else in dark blue
            colors = ifelse(x$is_signal, '#800000', '#002b36')
            plot(x[[a]], x[[b]], xlab = paste('PIP', label_a, '>', pip_cutoff), ylab = paste('PIP', label_b, '>', pip_cutoff),
                 main = paste('num. causal:', i, '\ncor:', round(cor(x[[a]], x[[b]]),2)),
                 col = colors, pch = 20, cex = 1.5)
            abline(0,1,col=2)
            abline(h=0.95, col='gray')
            abline(v=0.95, col='gray')
        }
    }

    # susie vs dap
    png(${_output[0]:r}, 600, 800)
    par(mfrow=c(3, 2))
    plot_pip_pairs(1:5, 'susie', 'dap', 'susie', 'DAP')
    dev.off()
    # susie vs caviar
    png(${_output[1]:r}, 600, 800)
    par(mfrow=c(2, 2))
    plot_pip_pairs(1:3, 'susie', 'caviar', 'susie', 'CAVIAR')
    dev.off()
    # dap vs caviar
    png(${_output[2]:r}, 600, 800)
    par(mfrow=c(2, 2))
    plot_pip_pairs(1:3, 'dap', 'caviar', 'DAP', 'CAVIAR')
    dev.off()

## Summary of discovery

In [5]:
[power_2]
# Power analysis
# to match with DAP -ld_control 0.25
ld_avg_col = 2
ld_cutoff = 0.25
est_var = ['FALSE', 'TRUE']
# to match with susie 95% mappable CS, we set dap cutoff to 0.95 also
dap_cluster_cutoff = [('cluster_prob', 0.95), ('cluster_avg_r2', 0.25)]
input: for_each = ['dap_cluster_cutoff', 'est_var'], group_by = 1, concurrent = True
output: f'{dirname}/Power_comparison_0528_{_dap_cluster_cutoff[0]}_estvar_{_est_var.lower()}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    ld_col = ${ld_col}
    ld_cutoff = ${ld_cutoff}
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    susie_out = dat$susie
    # favorite susie flavor: keep a single prior variance / residual variance setting
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))
    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)
    # number of replicates per data set
    n_r = 2
    n_experiments = n_r * length(data_sets)
    # one row of summary statistics per number of simulated signals
    result = NULL
    for (s in n_signals) {
        # *_signals: true signals captured; *_tc: sets reported;
        # *_tdc: sets containing a true signal (accumulated over all experiments);
        # *_dc: same count, but reset after every replicate (used by the debug output)
        susie_signals = 0
        dap_signals = 0
        susie_avg_ld = 0
        dap_avg_ld = 0
        susie_size = 0
        dap_size = 0
        # all sizes are kept so that the median can be taken at the end
        susie_sizes = vector()
        dap_sizes = vector()
        susie_tdc = 0
        dap_tdc = 0
        susie_dc = 0
        dap_dc = 0
        susie_tc = 0
        dap_tc = 0
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d), c("fit_susie.output.file", "plot_susie.output.file", "lm_less.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            # the DAP fit depends on (s, d) only, so it is read once rather than once per replicate
            dap_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
            dap = readRDS(paste0(${dirname:r}, '/', dap_files[1,1], '.rds'))$posterior
            for (r in 1:n_r) {
                signals = which(truth[,r]!=0)
                # susie in CS: keep the rows whose purity column ld_col exceeds the cutoff
                cs_purity_all = purity$purity[[paste0('V',r)]]
                pure_cs = which(cs_purity_all[,ld_col] > ld_cutoff)
                susie_cs_raw = fit$in_CI[[r]][pure_cs,,drop=FALSE]
                cs_purity = cs_purity_all[pure_cs,,drop=FALSE]
                susie_cs = list()
                for (i in seq_len(nrow(susie_cs_raw))) {
                  susie_cs[[i]] = which(susie_cs_raw[i,] > 0)
                  if (length(susie_cs[[i]]) == 0) {
                      # an empty set still occupies a slot in susie_cs and is therefore
                      # counted by length(susie_cs) below; compensate here
                      susie_tc = susie_tc - 1
                      next
                  }
                  if (any(signals %in% susie_cs[[i]])) {
                      susie_tdc = susie_tdc + 1
                      susie_size = susie_size + length(susie_cs[[i]])
                      susie_sizes = c(susie_sizes, length(susie_cs[[i]]))
                      susie_avg_ld = susie_avg_ld + cs_purity[i,${ld_avg_col}]^2
                      susie_dc = susie_dc + 1
                  }
                }
                # adds zero when no set was retained
                susie_signals = susie_signals + sum(signals %in% unique(unlist(susie_cs)))
                print(paste('==============', s, '=============='))
                print(susie_cs)
                susie_tc = susie_tc + length(susie_cs)
                # DAP in cluster: keep the clusters passing the configured cutoff
                dap_set = dap[[paste0('V', r-1)]]$set
                dap_cluster_raw = dap_set[which(dap_set$${_dap_cluster_cutoff[0]} > ${_dap_cluster_cutoff[1]}), ]
                dap_cluster_ld = dap_cluster_raw$cluster_avg_r2
                dap_cluster_raw = dap_cluster_raw$snp
                dap_cluster = list()
                for (i in seq_along(dap_cluster_raw)) {
                  # cluster members are stored as a comma-separated string of variable indices
                  dap_cluster[[i]] = as.integer(unlist(strsplit(dap_cluster_raw[i], ",")))
                  if (any(signals %in% dap_cluster[[i]])) {
                      dap_size = dap_size + length(dap_cluster[[i]])
                      dap_sizes = c(dap_sizes, length(dap_cluster[[i]]))
                      dap_avg_ld = dap_avg_ld + dap_cluster_ld[i]
                      dap_tdc = dap_tdc + 1
                      dap_dc = dap_dc + 1
                  }
                }
                # adds zero when no cluster was retained
                dap_signals = dap_signals + sum(signals %in% unique(unlist(dap_cluster)))
                print(dap_cluster)
                dap_tc = dap_tc + length(dap_cluster)
                ## BEGIN debug
                ## susie made more true discovery than DAP
                if (susie_dc > dap_dc) {
                  print('DAP miss')
                  print(dap_set)
                  print(d)
                }
                ## DAP made some (false) discovery, susie did not
                ## under n = 1
                if (length(dap_cluster) > dap_dc && s == 1) {
                  print('DAP false discovery')
                  print(dap_set)
                  print(d)
                }
                ## END debug
                susie_dc = 0
                dap_dc = 0
            }
        }
        rates = c(s, s*n_experiments, susie_tc, dap_tc, susie_signals/s/n_experiments, dap_signals/s/n_experiments, 1 - (susie_tdc/susie_tc), 1 - (dap_tdc/dap_tc), susie_size / susie_tdc, dap_size / dap_tdc, median(susie_sizes), median(dap_sizes), susie_avg_ld / susie_tdc, dap_avg_ld / dap_tdc)
        # always build a matrix (rbind with NULL gives a one-row matrix), so that the
        # colnames/rownames assignments below also work with a single n_signal value
        result = rbind(result, rates)
    }
    colnames(result) = c('n_signal', 'expected_discoveries', 'susie_discoveries', 'dap_discoveries', 'susie_power', 'dap_power', 'susie_fdp', 'dap_fdp', 'susie_avg_size', 'dap_avg_size', 'susie_median_size', 'dap_median_size', 'susie_avg_ld', 'dap_avg_ld')
    rownames(result) = as.character(result[,1])
    saveRDS(data.frame(result), ${_output:r})

## PIP calibration

In [None]:
[cali_pip_2]
# PIP calibration: bin the PIPs of each method and compare the mean PIP of a bin
# with the fraction of true signals in that bin
est_var = ['FALSE', 'TRUE']
ld_cutoff = 0.25
input: for_each = 'est_var', concurrent = True
output: f'{_input:n}.calibrated.estvar_{_est_var.lower()}.rds'
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    ld_col = ${ld_col}
    ld_cutoff = ${ld_cutoff}
    pip_cutoff = 0
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    caviar_out = dat$caviar
    susie_out = dat$susie
    # favorite susie flavor: keep a single prior variance / residual variance setting
    susie_out = susie_out[which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var}), ]
    susie_out = subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))

    data_sets = unique(susie_out$liter_data.dataset)
    n_signals = unique(susie_out$lm_less.n_signal)

    result = list()
    pip_cali = list()
    for (s in n_signals) {
        key = as.character(s)
        # each method starts with a dummy all-zero row; bin rows are appended below
        # and the dummy row is dropped right before saving
        pip_cali[[key]] = list(susie = c(0,0,0), dap = c(0,0,0), caviar = c(0,0,0))
        # CAVIAR results are only looked up for at most 3 causal variables
        has_caviar = (s <= 3)
        for (d in data_sets) {
            out_files = susie_out[which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d),c("fit_susie.output.file", "plot_susie.output.file", "lm_less.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            # the DAP / CAVIAR fits depend on (s, d) only, so read them once rather than once per replicate
            dap_files = dap_out[which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
            dap_post = readRDS(paste0(${dirname:r}, '/', dap_files[1,1], '.rds'))$posterior
            if (has_caviar) {
                caviar_files = caviar_out[which(caviar_out$lm_less.n_signal == s & caviar_out$liter_data.dataset == d & caviar_out$fit_caviar.args == paste('-c', s)),c("fit_caviar.output.file"),drop=FALSE]
                caviar_post = readRDS(paste0(${dirname:r}, '/', caviar_files[1,1], '.rds'))$posterior
            }
            for (r in c(1,2)) {
                # 0/1 indicator of the true signals of this replicate
                signals = truth[,r]
                signals[which(signals!=0)] = 1
                alpha = fit$alpha[[r]][which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                # susie PIP: one minus the product over rows of (1 - alpha)
                susie = as.vector(t(1 - apply(1 - alpha, 2, prod)))
                # all variables are used here (no restriction to credible sets)
                in_CI = seq_along(susie)
                # DAP PIP, reordered to follow the variable index
                snp = dap_post[[paste0('V',r-1)]]$snp
                snp = snp[which(snp$snp %in% as.character(in_CI)),]
                snp = snp[match(in_CI, snp$snp),]
                dap = as.vector(snp$snp_prob)
                if (has_caviar) {
                    # CAVIAR PIP, reordered the same way
                    snp = caviar_post[[r]]$snp
                    snp = snp[which(snp$snp %in% as.character(in_CI)),]
                    snp = snp[match(in_CI, snp$snp),]
                    caviar = as.vector(snp$snp_prob)
                    pip = cbind(susie, dap, caviar, signals[in_CI])
                } else {
                    pip = cbind(susie, dap, signals[in_CI])
                }
                # rbind() with a NULL first argument simply returns the new rows
                result[[key]] = rbind(result[[key]], pip)
            }
        }
        # make data frame
        res = data.frame(result[[key]])
        if (has_caviar) {
            colnames(res) = c('susie', 'dap', 'caviar', 'is_signal')
            method_names = c('susie', 'dap', 'caviar')
        } else {
            colnames(res) = c('susie', 'dap', 'is_signal')
            method_names = c('susie', 'dap')
        }
        # make bins: ten intervals of width 0.1; both interval ends are excluded below
        bins = cbind(seq(1:10)/10-0.1, seq(1:10)/10)
        for (name in method_names) {
            for (i in 1:nrow(bins)) {
                tmp = res[which(res[[name]] > bins[i,1] & res[[name]] < bins[i,2]),]
                # per bin: sum of PIPs, number of true signals, number of variables
                pip_cali[[key]][[name]] = rbind(pip_cali[[key]][[name]], c(sum(tmp[[name]]), sum(tmp$is_signal), length(tmp$is_signal)))
            }
            pip_cali[[key]][[name]][which(is.na(pip_cali[[key]][[name]]))] = 0
        }
    }
    # add up the per-bin counts over the numbers of signals
    # (susie and DAP: 1 to 5; CAVIAR: 1 to 3, the only scenarios with CAVIAR results)
    susie = pip_cali[[as.character(1)]]$susie
    for (i in 2:5) susie = susie + pip_cali[[as.character(i)]]$susie
    dap = pip_cali[[as.character(1)]]$dap
    for (i in 2:5) dap = dap + pip_cali[[as.character(i)]]$dap
    # use the exact element name instead of relying on partial matching of `$`
    cav = pip_cali[[as.character(1)]]$caviar
    for (i in 2:3) cav = cav + pip_cali[[as.character(i)]]$caviar
    # turn the sums into the mean PIP and the observed signal frequency of each bin
    susie[,c(1,2)] = susie[,c(1,2)] / susie[,3]
    dap[,c(1,2)] = dap[,c(1,2)] / dap[,3]
    cav[,c(1,2)] = cav[,c(1,2)] / cav[,3]
    # drop the dummy first row of each table
    saveRDS(list(susie=susie[-1,],DAP=dap[-1,],CAVIAR=cav[-1,]), ${_output:r})

In [None]:
[cali_pip_3]
# Plot the calibration tables of the input RDS file, one panel per method
input: group_by = 1, concurrent = True
output: f'{_input:n}.png'
R: expand = '${ }'
    calibration <- readRDS(${_input:r})
    png(${_output:r}, 12, 4, units = 'in', res = 500)
    par(mfrow = c(1, 3))
    for (method in names(calibration)) {
      panel <- calibration[[method]]
      # first column on the x axis, second column on the y axis
      plot(
        panel[,1], panel[,2],
        col = '#002b36', pch = 20, cex = 1.5,
        xlab = "Mean PIP", ylab = "True frequency",
        xlim = c(0,1), ylim = c(0,1),
        main = method
      )
      # reference line y = x
      abline(0, 1, col = 2)
    }
    dev.off()

## Coverage

The DSC was not designed to evaluate different coverage levels, so the implementation here is a bit awkward: to reuse the existing code that computes purity, I need to pass the `in_CS` matrix and the LD matrix to the previously established Python code via temporary RDS files, and then save its output to temporary RDS files as well in order to load the calculated `purity`.

In [2]:
[purity_utils: provides = file_target(f'{cwd}/.sos/purity.py')]
# Write the helper script .sos/purity.py (the text below is emitted as is, after interpolation).
# The generated script takes three command-line arguments:
#   1. an RDS file holding a dictionary of credible-set matrices,
#   2. an RDS file whose content is passed to SusieReporter as the LD argument,
#   3. the RDS file to which the purity computed for every dictionary key is saved.
output: f'{cwd}/.sos/purity.py'

report: output=f'{_output}', expand = '${ }'

    import sys
    sys.path.append('${cwd:a}/modules')
    from plot_susie import SusieReporter
    from dsc.dsc_io import load_rds, save_rds
    import numpy as np
    cs = load_rds(sys.argv[1])
    ld = load_rds(sys.argv[2])
    purity = dict()
    for k in cs.keys():
        reporter = SusieReporter(np.array(cs[k]), [], [], ld)
        purity[k] = reporter.purity
    save_rds(purity, sys.argv[3])

Workflow can only be executed with magic %run or %sosrun.

In [None]:
[coverage_2]
# Count, for several coverage levels, the susie credible sets that do / do not contain a true signal
est_var = ['FALSE', 'TRUE']
ld_cutoff = 0.25
depends: file_target(f'{cwd}/.sos/purity.py')
input: for_each = 'est_var', concurrent = True
output: f'{dirname}/Coverage_0606_estvar_{_est_var.lower()}.rds'

R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    ld_col <- ${ld_col}
    ld_cutoff <- ${ld_cutoff}
    susie_out <- readRDS(${_input:r})$susie
    # keep only the preferred susie configuration
    preferred <- which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var})
    susie_out <- susie_out[preferred, ]
    susie_out <- subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))

    data_sets <- unique(susie_out$liter_data.dataset)
    n_signals <- unique(susie_out$lm_less.n_signal)

    # grid of (1 - coverage) values; results are keyed by level * 100
    cs_levels <- c(0.01, 0.05, 0.1, 0.15, 0.2, 0.25)
    # temporary files used to exchange data with the purity script
    cs_tmp <- "/tmp/${_output:bn}.tmp.cs.rds"
    purity_tmp <- "/tmp/${_output:bn}.tmp.purity.rds"

    # positives[[n_signal]][[level]] = c(sets containing a signal, sets without any signal)
    positives <- list()
    for (s in n_signals) {
        s_key <- as.character(s)
        positives[[s_key]] <- list()
        print(paste('========', s, '========'))
        for (d in data_sets) {
            scenario <- which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d)
            out_files <- susie_out[scenario, c("fit_susie.output.file", "liter_data.output.file", "lm_less.output.file")]
            fit <- readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            ld_mat_fn <- paste0(${dirname:r}, '/', out_files[1,2], '.ld_mat.rds')
            truth <- readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            for (r in c(1,2)) {
                # credible sets of this replicate at every coverage level
                in_CI <- list()
                for (level in cs_levels) {
                  in_CI[[as.character(level*100)]] <- susieR::susie_in_CS(fit$alpha[[r]], coverage=1-level)
                }
                # hand the sets and the LD matrix file to the purity script, then load what it wrote
                saveRDS(in_CI, cs_tmp)
                dscrutils::run_cmd(paste("python .sos/purity.py", cs_tmp, ld_mat_fn, purity_tmp))
                purity <- readRDS(purity_tmp)
                signals <- which(truth[,r]!=0)
                for (level in cs_levels) {
                  level_key <- as.character(level*100)
                  if (!(level_key %in% names(positives[[s_key]]))) {
                      positives[[s_key]][[level_key]] <- c(0,0)
                  }
                  counts <- positives[[s_key]][[level_key]]
                  # keep the sets whose purity column ld_col exceeds the cutoff
                  pure_cs <- which(purity[[paste0('V', level_key)]][,ld_col] > ld_cutoff)
                  susie_cs_raw <- in_CI[[level_key]][pure_cs,,drop=FALSE]
                  for (i in seq_len(nrow(susie_cs_raw))) {
                      susie_cs <- which(susie_cs_raw[i,] > 0)
                      if (length(susie_cs) == 0) {
                          next
                      }
                      if (any(signals %in% susie_cs)) {
                          counts[1] <- counts[1] + 1
                      } else {
                          # report the set that misses every true signal
                          print(paste(d, r, level, i))
                          print(susie_cs)
                          print(signals)
                          counts[2] <- counts[2] + 1
                      }
                  }
                  positives[[s_key]][[level_key]] <- counts
                }
            }
        }
        print(positives[[s_key]])
    }
    saveRDS(positives, ${_output:r})

## ROC

In [None]:
[roc_2]
# Collect (PIP, is-signal) pairs for susie and DAP and tabulate true / false discoveries over PIP thresholds
pip_after_filter = 'TRUE'
est_var = ['FALSE', 'TRUE']
ld_cutoff = 0.25
dap_cluster_cutoff = ('cluster_prob', 0.95)
input: for_each = 'est_var', concurrent = True
output: f'{dirname}/ROC_0605_estvar_{_est_var.lower()}.rds'
R: stdout = f'{_output[0]:n}.log', expand = '${ }', workdir = cwd
    ld_col <- ${ld_col}
    ld_cutoff <- ${ld_cutoff}
    dat <- readRDS(${_input:r})
    dap_out <- dat$dap
    susie_out <- dat$susie
    # keep only the preferred susie configuration
    preferred <- which(susie_out$fit_susie.prior_var == ${susie_prior} & susie_out$fit_susie.estimate_residual_variance == ${_est_var})
    susie_out <- susie_out[preferred, ]
    susie_out <- subset(susie_out, select =-c(lm_less.pve, fit_susie.prior_var, fit_susie.estimate_residual_variance))

    data_sets <- unique(susie_out$liter_data.dataset)
    n_signals <- unique(susie_out$lm_less.n_signal)

    # result[[n_signal]]$susie / $dap: two-column matrices (pip, signals), rows stacked over
    # data sets and replicates
    result <- list()
    for (s in n_signals) {
        key <- as.character(s)
        result[[key]] <- list(susie=NULL, dap=NULL)
        for (d in data_sets) {
            susie_rows <- which(susie_out$lm_less.n_signal == s & susie_out$liter_data.dataset == d)
            out_files <- susie_out[susie_rows, c("fit_susie.output.file", "plot_susie.output.file", "lm_less.output.file")]
            fit <- readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity <- readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth <- readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            for (r in 1:2) {
                # 0/1 indicator of the true signals of this replicate
                signals <- truth[,r]
                signals[which(signals!=0)] <- 1
                # susie: optionally drop the rows failing the purity cutoff before computing PIPs
                alpha <- fit$alpha[[r]]
                if (${pip_after_filter}) {
                  alpha <- alpha[which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                }
                pip <- 1 - apply(1 - alpha, 2, prod)
                # rbind() with a NULL first argument returns the new rows unchanged
                result[[key]]$susie <- rbind(result[[key]]$susie, cbind(pip, signals))
                # dap: PIPs sorted by numeric variable index
                dap_rows <- which(dap_out$lm_less.n_signal == s & dap_out$liter_data.dataset == d)
                out_files <- dap_out[dap_rows, c("fit_dap.output.file"), drop=FALSE]
                dap <- readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                pip <- dap[[paste0('V',r-1)]]$snp
                pip <- pip[order(as.numeric(pip$snp)),]$snp_prob
                result[[key]]$dap <- rbind(result[[key]]$dap, cbind(pip, signals))
            }
        }
    }

    # For every threshold on a grid of 500 values strictly inside `cutoff`, count the rows of d1
    # with column 1 at or above the threshold: sum of column 2 (true_disc) and the remainder (false_disc).
    roc_data <- function(d1, cutoff = c(0.4, 0.99)) {
        grid <- 500
        thresholds <- (1:grid)/grid
        thresholds <- thresholds[thresholds > cutoff[1] & thresholds < cutoff[2]]
        counts <- t(sapply(thresholds, function(x) {
            selected <- d1[,2][d1[,1]>=x]
            c(sum(selected), length(selected))
        }))
        counts[,2] <- counts[,2] - counts[,1]
        counts <- as.data.frame(counts)
        colnames(counts) <- c('true_disc', 'false_disc')
        counts
    }

    susie <- roc_data(do.call(rbind, lapply(seq_along(result), function(i) result[[i]]$susie)))
    dap <- roc_data(do.call(rbind, lapply(seq_along(result), function(i) result[[i]]$dap)))
    saveRDS(list(data = result, susie = susie, dap = dap), ${_output:r})