# Workflow to extract info for power comparison with DAP for a hard case

In [1]:
%revisions -s

Previously I ran this specific DSC using:

```
dsc susie.dsc --target hard_case -o hard_case
```

Here I query those results to compare power with DAP. Again, the settings are:

- PVE 0.3
- 10 causal
- ~8K SNPs (all cis-SNPs of potential interest)

susie parameters:

- prior 0.1, which is an unfavorable overestimate
- L = 10

DAP parameters are default.

In [2]:
[global]
# Settings shared by the workflow steps below.
# Root of the DSC project; also passed as `workdir` to the R actions.
cwd = path('~/GIT/github/mvarbvs/dsc')
# Output folder of the `hard_case` DSC run; it is queried by power_1 and receives all result files.
dirname = path(f'{cwd:a}/hard_case/')
# Column of the susie purity table that is compared against `ld_cutoff` when filtering CS.
ld_col = 1

# Value of fit_susie10.prior_var used to select which susie runs are analyzed.
susie_prior = 0.1
# Tag embedded in the name of the cached query file (DAP_comparison_<date>.rds).
date = '0801'

## The workflow

In [3]:
[power_1]
# Query the DSC output once for DAP and once for susie, then cache both
# query tables in a single RDS file as list(dap = ..., susie = ...).
output: f'{dirname}/DAP_comparison_{date}.rds'
R: expand = '${ }', workdir = cwd
    dsc_dir <- ${dirname:br}
    # One target specification per method; the list names become the names
    # of the saved list.
    query_targets <- list(
        dap = "full_data.dataset full_data lm_less03 lm_less03.pve lm_less03.n_signal fit_dap plot_dap",
        susie = "full_data.dataset full_data lm_less03 lm_less03.pve lm_less03.n_signal fit_susie10.prior_var fit_susie10.estimate_residual_variance fit_susie10 plot_susie"
    )
    # Run the queries in order (dap first, then susie) with identical options.
    query_tables <- lapply(query_targets, function(target_spec) {
        dscrutils::dscquery(dsc_dir, target = target_spec, load.pkl = TRUE)
    })
    saveRDS(query_tables, ${_output:r})

In [5]:
[power_2]
# Power analysis
# Column of the susie purity table whose squared value is accumulated as the CS LD summary.
ld_avg_col = 2
# Purity threshold: only CS whose `ld_col` purity exceeds this value are kept.
ld_cutoff = 0.25
# Values of fit_susie10.estimate_residual_variance to analyze. They are kept as strings so they
# can be interpolated into the R script as logical literals and lower-cased for the output name.
est_var = ['FALSE', 'TRUE']
# to match with susie 95% mappable CS, we set dap cutoff to 0.95 also
# Each entry is (column of the DAP `set` table, threshold); rows above the threshold are kept.
dap_cluster_cutoff = [('cluster_prob', 0.95), ('cluster_avg_r2', 0.25)]
# Run once for every combination of dap_cluster_cutoff and est_var; the output name encodes both.
input: for_each = ['dap_cluster_cutoff', 'est_var'], group_by = 1, concurrent = True
output: f'{dirname}/{_input:bn}_{_dap_cluster_cutoff[0]}_estvar_{_est_var.lower()}.rds'
# Everything the R script prints (including the debug output) goes to <output>.log.
R: stdout = f'{_output:n}.log', expand = '${ }', workdir = cwd
    # Thresholds interpolated from the workflow parameters
    ld_col = ${ld_col}
    ld_cutoff = ${ld_cutoff}
    # Load the cached query tables (a list with elements `dap` and `susie`)
    dat = readRDS(${_input:r})
    dap_out = dat$dap
    susie_out = dat$susie
    # favorite susie flavor: keep only the runs with the chosen prior variance and
    # residual-variance setting, then drop the columns that are no longer needed
    susie_out = susie_out[which(susie_out$fit_susie10.prior_var == ${susie_prior} & susie_out$fit_susie10.estimate_residual_variance == ${_est_var}), ]
    susie_out = subset(susie_out, select =-c(lm_less03.pve, fit_susie10.prior_var, fit_susie10.estimate_residual_variance))
    # Distinct data sets and numbers of simulated signals present in the query
    data_sets = unique(susie_out$full_data.dataset)
    n_signals = unique(susie_out$lm_less03.n_signal)
    # Number of response columns analyzed per data set (indexed 1..n_r below)
    n_r = 2
    # Number of (data set, response) experiments per n_signal value
    n_experiments = n_r * length(data_sets)
    # Accumulates one vector of summary statistics per n_signal value
    result = NULL
    # For each number of simulated signals, accumulate discovery statistics for
    # susie credible sets (CS) and DAP clusters over all data sets and responses,
    # then append one summary vector (`rates`) to `result`.
    for (s in n_signals) {
        # number of true signals captured by any retained CS / cluster
        susie_signals = 0
        dap_signals = 0
        # running sums of the LD summary over CS / clusters that contain a true signal
        susie_avg_ld = 0
        dap_avg_ld = 0
        # running sums of the sizes of CS / clusters that contain a true signal
        susie_size = 0
        dap_size = 0
        # fixme: I cannot find a good median tracker so do it stupid way
        # (all sizes are stored so that median() can be taken at the end)
        susie_sizes = vector()
        dap_sizes = vector()
        # tdc: number of CS / clusters containing at least one true signal
        susie_tdc = 0
        dap_tdc = 0
        # dc: same count, but reset after every (data set, response) experiment;
        # only used by the debug output below
        susie_dc = 0
        dap_dc = 0
        # tc: total number of CS / clusters reported
        susie_tc = 0
        dap_tc = 0
        for (d in data_sets) {
            # Locate and load the susie fit, its purity table and the simulated truth
            # for this (n_signal, data set) pair; only the first matching row is used.
            out_files = susie_out[which(susie_out$lm_less03.n_signal == s & susie_out$full_data.dataset == d), c("fit_susie10.output.file", "plot_susie.output.file", "lm_less03.output.file")]
            fit = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
            purity = readRDS(paste0(${dirname:r}, '/', out_files[1,2], '.rds'))
            truth = readRDS(paste0(${dirname:r}, '/', out_files[1,3], '.rds'))$data$true_coef
            for (r in 1:n_r) {
                # indices of the variables with non-zero true effect for response r
                signals = which(truth[,r]!=0)
                # susie in CS
                susie_cs = fit$in_CI[[r]]
                # keep only the CS (rows) whose purity exceeds the LD cutoff, and the
                # matching rows of the purity table
                susie_cs_raw = susie_cs[which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                cs_purity = purity$purity[[paste0('V',r)]][which(purity$purity[[paste0('V',r)]][,ld_col] > ld_cutoff),,drop=FALSE]
                susie_cs = list()
                if (nrow(susie_cs_raw) > 0) {
                    for (i in 1:nrow(susie_cs_raw)) {
                      # variables flagged as members of the i-th CS
                      susie_cs[[i]] = which(susie_cs_raw[i,] > 0)
                      if (length(susie_cs[[i]]) == 0) {
                          # an empty CS must not count as a discovery: subtract it here,
                          # because length(susie_cs) is added in full after this loop
                          susie_tc = susie_tc - 1
                          next
                      }
                      if (any(signals %in% susie_cs[[i]])) {
                          susie_tdc = susie_tdc + 1
                          susie_size = susie_size + length(susie_cs[[i]])
                          susie_sizes = c(susie_sizes, length(susie_cs[[i]]))
                          # squared purity value; presumably this puts it on the r^2 scale
                          # used by DAP's cluster_avg_r2 (TODO confirm)
                          susie_avg_ld = susie_avg_ld + cs_purity[i,${ld_avg_col}]^2
                          susie_dc = susie_dc + 1
                      }
                    }
                    # true signals covered by at least one retained CS
                    susie_signals = susie_signals + sum(signals %in% unique(unlist(susie_cs)))
                }
                print(paste('==============', s, '=============='))
                print(susie_cs)
                susie_tc = susie_tc + length(susie_cs)
                # DAP in cluster
                # NOTE(review): the DAP file depends only on (s, d), yet it is re-read for
                # every response r; this also overwrites `out_files` from the susie lookup,
                # which is harmless because the susie files were already loaded above.
                out_files = dap_out[which(dap_out$lm_less03.n_signal == s & dap_out$full_data.dataset == d),c("fit_dap.output.file"),drop=FALSE]
                dap = readRDS(paste0(${dirname:r}, '/', out_files[1,1], '.rds'))$posterior
                # DAP results are keyed V0, V1, ... (hence r-1); keep the clusters whose
                # selected column exceeds the configured threshold
                dap_cluster_raw = dap[[paste0('V', r-1)]]$set[which(dap[[paste0('V', r-1)]]$set$${_dap_cluster_cutoff[0]} > ${_dap_cluster_cutoff[1]}), ]
                dap_cluster_ld = dap_cluster_raw$cluster_avg_r2
                dap_cluster_raw = dap_cluster_raw$snp
                dap_cluster = list()
                if (length(dap_cluster_raw) > 0) {
                    for (i in 1:length(dap_cluster_raw)) {
                      # cluster members are stored as a comma-separated string of ids
                      # NOTE(review): these ids are compared directly with the 1-based
                      # indices in `signals`; confirm DAP uses the same indexing
                      dap_cluster[[i]] = as.integer(unlist(strsplit(dap_cluster_raw[i], ",")))
                      if (any(signals %in% dap_cluster[[i]])) {
                          dap_size = dap_size + length(dap_cluster[[i]])
                          dap_sizes = c(dap_sizes, length(dap_cluster[[i]]))
                          dap_avg_ld = dap_avg_ld + dap_cluster_ld[i]
                          dap_tdc = dap_tdc + 1
                          dap_dc = dap_dc + 1
                      }
                    }
                    # true signals covered by at least one retained cluster
                    dap_signals = dap_signals + sum(signals %in% unique(unlist(dap_cluster)))
                }
                print(dap_cluster)
                dap_tc = dap_tc + length(dap_cluster)
                ## BEGIN debug
                ## susie made more true discovery than DAP
                if (susie_dc > dap_dc) {
                  print('DAP miss')
                  print(dap[[paste0('V', r-1)]]$set)
                  print(d)
                }
                ## DAP made some (false) discovery, susie did not
                ## under n = 1
                if (length(dap_cluster) > dap_dc && s == 1) {
                  print('DAP false discovery')
                  print(dap[[paste0('V', r-1)]]$set)
                  print(d)  
                }
                ## END debug
                # reset the per-experiment counters used by the debug block
                susie_dc = 0
                dap_dc = 0
            }
        }
        # Summary for this n_signal, in the order of the column names assigned after the
        # loop: power = captured signals / expected signals; fdp = 1 - (true-discovery
        # sets / all sets); sizes and LD are averaged over the true-discovery sets.
        rates = c(s, s*n_experiments, susie_tc, dap_tc, susie_signals/s/n_experiments, dap_signals/s/n_experiments, 1 - (susie_tdc/susie_tc), 1 - (dap_tdc/dap_tc), susie_size / susie_tdc, dap_size / dap_tdc, median(susie_sizes), median(dap_sizes), susie_avg_ld / susie_tdc, dap_avg_ld / dap_tdc)
        # the first summary is stored as a bare vector; later ones are stacked as rows
        if (is.null(result)) {
          result = rates
        } else {
          result = rbind(result, rates)
        }
    }
    # `result` is a bare vector when only one n_signal value was processed and an
    # rbind()-built matrix (one row per n_signal) otherwise. Only the vector needs
    # reshaping: forcing nrow = 1 on a matrix would flatten every row into a single
    # one, and the 14-name colnames() assignment below would then fail.
    if (is.null(dim(result))) {
        result = matrix(result, nrow = 1)
    }
    colnames(result) = c('n_signal', 'expected_discoveries', 'susie_discoveries', 'dap_discoveries', 'susie_power', 'dap_power', 'susie_fdp', 'dap_fdp', 'susie_avg_size', 'dap_avg_size', 'susie_median_size', 'dap_median_size', 'susie_avg_ld', 'dap_avg_ld')
    # label each row by its n_signal value (first column)
    rownames(result) = as.character(result[,1])
    saveRDS(data.frame(result), ${_output:r})

## Power comparison, susie VS DAP, for ~8K region

In [2]:
%cd ~/GIT/github/mvarbvs/dsc

/home/gaow/GIT/github/mvarbvs/dsc

In [4]:
# NOTE(review): this file name carries the date tag 0615, while the [global] section sets date = '0801'; confirm this is the intended run.
readRDS('hard_case/DAP_comparison_0615_cluster_prob_estvar_true.rds')

Unnamed: 0,n_signal,expected_discoveries,susie_discoveries,dap_discoveries,susie_power,dap_power,susie_fdp,dap_fdp,susie_avg_size,dap_avg_size,susie_median_size,dap_median_size,susie_avg_ld,dap_avg_ld
10,10,2000,602,557,0.2775,0.248,0.08803987,0.1202873,21.09472,13.09184,9,11,0.9243739,0.895302


In [5]:
# NOTE(review): this file name carries the date tag 0615, while the [global] section sets date = '0801'; confirm this is the intended run.
readRDS('hard_case/DAP_comparison_0615_cluster_prob_estvar_false.rds')

Unnamed: 0,n_signal,expected_discoveries,susie_discoveries,dap_discoveries,susie_power,dap_power,susie_fdp,dap_fdp,susie_avg_size,dap_avg_size,susie_median_size,dap_median_size,susie_avg_ld,dap_avg_ld
10,10,2000,484,557,0.238,0.248,0.03512397,0.1202873,23.56959,13.09184,11,11,0.9163804,0.895302
