# Finemapping benchmark

Methods evaluated:

- Variational methods:
    - spike-slab, mixture normal, sum of single effects, m&m
- Popular fine-mapping methods:
    - DAP, FINEMAP, CAVIAR
    
[PAINTOR](https://github.com/gkichaev/PAINTOR_V3.0) is not included because [FINEMAP is recommended over PAINTOR when used without annotation](https://github.com/gkichaev/PAINTOR_V3.0/issues/11#issuecomment-303135031).

## DSC run

### `mnm.dsc`

Master DSC script.

In [1]:
%save -f mnm.dsc
#!/usr/bin/env dsc

# Master DSC script for the fine-mapping benchmark: pulls in the module
# definitions and declares how they are chained together.
%include modules/setup
%include modules/fit
%include modules/evaluate

DSC:
  # Module groups referenced by the pipeline below.
  define:
    get_Y: original_Y
    fit: (init_mnm * fit_mnm), fit_susie, fit_varbvs, 
        (fit_finemap * plot_finemap), 
        (fit_dap * plot_dap),
        (fit_caviar * plot_caviar)
  # Pipeline: load data, prepare Y, compute summary statistics, then fit.
  run:
    first_pass: get_data * get_Y * get_sumstats * fit
  output: benchmark
  exec_path: modules
  # Variables shared with the modules (referenced as ${data_file} in get_data).
  global:
    data_file: ~/Documents/GTExV8/Thyroid.Lung.FMO2.filled.rds

## DSC modules

### `setup.dsc`

Data generators.

In [2]:
%save -f modules/setup.dsc

# Modules to provide data
# Real or simulated

# Module output
# =============
# $data: full data
# $sumstats: summary statistics

# Expose the configured data file to the benchmark by symlinking the
# resolved path of ${data_file} to the module output $data.
get_data: Shell(ln -sf `realpath ${data_file}` $data)
  # FIXME: see 20171103_MNMASH_Data.ipynb for GTEx multitissue data preparation
  # and implement it more formally here
  $data: file(rds)

# Replace data['Y'] by a single matrix: the entries of data['Y'] are stacked
# as rows (numpy.vstack) and then transposed, so each entry becomes a column.
original_Y: Python(data['Y'] = numpy.vstack(data['Y'].values()).T)
  # do not simulate data, just use original
  data: $data
  $data: data

# Per-condition univariate summary statistics plus LD information.
#   $sumstats: array from mm_regression ([1,,] betahat, [2,,] standard error)
#   $ld:       squared correlations between columns of X
#   $ld_file:  signed correlation matrix of X on disk; this is the file passed
#              to FINEMAP and CAVIAR, which take correlations (r), not r^2
#   $V:        correlation matrix of the columns of Y
#   $N:        number of rows of Y
get_sumstats: regression.R + R(res = mm_regression(data$X, data$Y);
                               r_mat = cor(data$X);
                               r2 = r_mat^2;
                               V = cor(data$Y);
                               N = nrow(data$Y);
                               write.table(r_mat,ld_file,quote=FALSE,col.names=FALSE,row.names=FALSE))
  @CONF: R_libs = abind
  data: $data
  $sumstats: res
  $ld: r2
  $ld_file: file(ld)
  $V: V
  $N: N

### `fit.dsc`

Fine mapping methods.

In [3]:
%save -f modules/fit.dsc
# workhorse(s) for finemapping

# Module input
# ============
# $data: full data; or
# $sumstats: summary statistics; or / and
# $ld: LD information

# Module output
# =============
# $fitted: for diagnostics
# $posterior: for inference

# Build the initial mash model (prior covariances and mixture weights) from
# the summary statistics; the work is done in init_mnm.R.
init_mnm: init_mnm.R
  # mashr comes from `dev` branch on github
  @CONF: R_libs = mashr
  V: $V
  reg: $sumstats
  # FIXME: these quantities are to be computed separately and globally using mashr procedure
  # See http://stephenslab.github.io/gtex-eqtls/analysis/20171002_MASH_V8.html
  Sigma: empirical
  # NOTE(review): the numeric tuple below is bound to `grid` while `p` is
  # `auto`; the values sum to 1, so they look like mixture weights meant
  # for `p` -- confirm the intended order.
  (U, grid, p): (auto, (0.9,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.02), auto)
  $model: model
  $V: V

# Fit the m&m model: regression.R supplies mm_regression, fit_mnm.R runs the
# iterative updates starting from the model produced by init_mnm.
# NOTE(review): fit_mnm.R also calls susieR internals (n_in_CI, in_CI) but
# only mashr is listed in R_libs -- confirm susieR is available.
fit_mnm: regression.R + fit_mnm.R
  @CONF: R_libs = mashr
  # maxL: number of single effects (columns of alpha in fit_mnm.R)
  maxL: 5
  # maxI: number of passes over all effects
  maxI: 10
  data: $data
  model: $model
  V: $V
  $fitted: fitted_track
  $posterior: posterior

# Run susieR::susie on every column of Y (see fit_susie.R).
fit_susie: fit_susie.R
  @CONF: R_libs = susieR@stephenslab/susieR
  # maxL is passed to susie as `L`, maxI as `max_iter`.
  maxL: 5
  maxI: 50
  data: $data
  $posterior: posterior
  $fitted: fitted

# Same inputs and outputs as fit_susie, but the fit is done with varbvs
# (setup_varbvs.R prepares the inputs, fit_varbvs.R runs the model).
fit_varbvs(fit_susie): setup_varbvs.R + fit_varbvs.R
  @CONF: R_libs = varbvs@pcarbo/varbvs/varbvs-R
  # `sa` is handed to varbvs::varbvsnorm and kept fixed (update.sa = FALSE).
  sa: 1

# Run CAVIAR once per condition. The z-scores are formed as
# sumstats[1,,] / sumstats[2,,] (betahat over its standard error) and the LD
# matrix is read from the file produced by get_sumstats.
# NOTE(review): `args` presumably lists two alternative CAVIAR settings
# (-c 1 and -c 3) -- confirm against DSC's handling of comma-separated values.
fit_caviar: fit_caviar.R + \
             R(posterior = finemap_mcaviar(sumstats[1,,]/sumstats[2,,], 
                                            ld, args, prefix=cache))
  @CONF: R_libs = (dplyr, magrittr)
  sumstats: $sumstats
  ld: $ld_file
  args: -c 1, -c 3
  cache: file(CAVIAR)
  $posterior: posterior

# Run FINEMAP once per condition; inherits the remaining settings from
# fit_caviar. `N` is written to the master file as n-ind and `k` is written
# to the .k file (see write_finemap_sumstats in fit_finemap.R).
fit_finemap(fit_caviar): fit_finemap.R + \
             R(posterior = finemap_mvar(sumstats[1,,]/sumstats[2,,], 
                                        ld, N, k,
                                        args, prefix=cache))
  N: $N
  k: R(rep(1/5,5)), (0.6,0.25,0.1,0.05)
  args: --regions 1 --prior-std 0.4 --n-causal-max 5
  cache: file(FM)

# Run dap-g on the individual-level data, one condition (column of Y) at a
# time; `cache` is the file prefix for the intermediate dap-g files and
# `args` holds extra dap-g command-line options.
fit_dap: fit_dap.py + Python(posterior = dap_batch(data['X'], data['Y'], cache, args))
  data: $data
  args: -ld_control 0.25
  cache: file(DAP)
  $posterior: posterior

# fit_dap_mv(fit_dap): fit_dap.py + Python(res = dap_mv())

# fit_dap_ss(fit_dap): fit_dap.py + Python(res = dap_batch_ss())
#   data: $sumstats

# fit_dap_mv_ss(fit_dap): fit_dap.py + Python(res = dap_mv_ss())

### `evaluate.dsc`

Methods evaluation / visualization.

In [7]:
%save -f modules/evaluate.dsc
# Modules to evaluate various methods
# for finemapping-m

# Module input
# ============
# $fit: see fit.dsc
# $result: see fit.dsc

# Module output
# =============
# ? an object or plot for diagnosis

# Plot the FINEMAP posterior of every condition into one PDF.
plot_finemap: plot_finemap.R
  @CONF: R_libs = (dplyr, ggplot2, cowplot)
  result: $posterior
  # number of top rows (configurations / SNPs) shown per panel
  top_rank: 10
  $plot_file: file(pdf)

# Same inputs and outputs as plot_finemap, with method-specific plotting scripts.
plot_caviar(plot_finemap): plot_caviar.R
plot_dap(plot_finemap): plot_dap.R

## Workhorses

### `regression.R`

In [5]:
%save -f modules/regression.R
## Simple linear regression (with intercept) of y on each column of X in turn.
##
## X: numeric matrix of predictors, one regression per column.
## y: numeric response.
## Z: optional covariate matrix; when given, y is first replaced by the
##    residuals of regressing y on Z.
## return_residue: when TRUE the (possibly residualised) y is returned too.
##
## Returns a list with `betahat` (slopes) and `sebetahat` (their standard
## errors), plus `residuals` when return_residue is TRUE.
univariate_regression <- function(X, y, Z=NULL, return_residue=FALSE) {
  if (!is.null(Z)) {
    y <- .lm.fit(Z, y)$residuals
  }
  # Slope and standard error for predictor column j.
  fit_column <- function(j) {
    design <- cbind(1, X[,j])
    fit <- .lm.fit(design, y)
    # Residual variance with n - 2 degrees of freedom, times (D'D)^-1.
    coef_cov <- sum(fit$residuals^2) / (nrow(design) - 2) *
      chol2inv(chol(t(design) %*% design))
    c(coef(fit)[2], sqrt(diag(coef_cov))[2])
  }
  estimates <- do.call(rbind, lapply(1:ncol(X), fit_column))
  result <- list(betahat = estimates[,1], sebetahat = estimates[,2])
  if (return_residue) {
    result <- c(result, list(residuals = y))
  }
  return(result)
}

library(abind)
## Univariate regression of every column of Y on every column of X.
##
## X: predictor matrix; Y: response matrix (one column per condition).
## Z: optional covariates, forwarded to univariate_regression so that each
##    response is residualised on Z first (previously accepted but ignored).
##
## Returns an array `out` where out[1,,] is betahat and out[2,,] is its
## standard error, each with one row per column of X and one column per
## column of Y.
mm_regression <- function(X, Y, Z=NULL) {
  per_condition <- lapply(seq_len(ncol(Y)), function(r) {
    simplify2array(univariate_regression(X, Y[,r], Z = Z))
  })
  # Stack along a new leading dimension, then reorder the dimensions.
  stacked <- do.call(abind, c(per_condition, list(along=0)))
  return(aperm(stacked, c(3,2,1)))
}

### `setup_varbvs.R`

In [6]:
%save -f modules/setup_varbvs.R

# Prepare the inputs used by fit_varbvs.R. Expects `data` (with X and Y) and
# `maxL` to be defined by the calling DSC module.

# Predictors: stored as double and column-centered (not scaled).
X <- data$X
storage.mode(X) <- "double"
n <- nrow(X)
p <- ncol(X)
X <- scale(X,center = TRUE,scale = FALSE)
# Random starting values: alpha0 is uniform noise normalised to sum to one,
# mu0 is standard normal. The order of these draws matters for a fixed seed.
alpha0  <- runif(p)
alpha0  <- alpha0/sum(alpha0)
mu0     <- rnorm(p)
# Same prior inclusion probability maxL / p for every variable, turned into
# log-odds with varbvs' internal logit.
pp      <- rep(maxL/p, p)
logodds <- varbvs:::logit(pp)
# Responses: each column is mean-centered, then stored as double.
Y <- data$Y
for (r in 1:ncol(Y)) {
  Y[,r] <- Y[,r] - mean(Y[,r])
}
storage.mode(Y) <- "double"

### `fit_varbvs.R`

In [7]:
%save -f modules/fit_varbvs.R
# Fit varbvs to one response column, starting from alpha0 / mu0. The residual
# variance is fixed at the sample variance of the response and `sa` is fixed
# too (neither is updated).
fit_one_response <- function(r) {
  y_r <- Y[,r]
  varbvs::varbvsnorm(X, y_r, var(y_r), sa, logodds, alpha0, mu0,
                     update.order = 1:p,
                     update.sigma = FALSE, update.sa = FALSE,
                     tol = 1e-6, verbose = FALSE, maxiter = maxI)
}
fitted <- lapply(1:ncol(Y), fit_one_response)

# One column per response: posterior mean effect (alpha * mu) and
# local false discovery rate (1 - alpha).
posterior <- list(
  PosteriorMean = do.call(cbind, lapply(fitted, function(fit) fit$alpha * fit$mu)),
  lfdr = do.call(cbind, lapply(fitted, function(fit) 1 - fit$alpha)))

### `init_mnm.R`

In [8]:
%save -f modules/init_mnm.R
# Initialize model data: priors and init values
#
# Inputs (set by the DSC module): reg (summary statistics array, reg[1,,]
# betahat and reg[2,,] standard errors), V, Sigma, U, grid, p.
# Output: `model`, a list whose `fitted_g` element describes the mash prior.
#
# The option checks use identical() so that each condition is a single
# TRUE/FALSE even when the parameter is not a length-one string (for example
# a numeric vector of weights for `p`), which `==` inside if() cannot handle.

if (!identical(Sigma, 'empirical')) {
  # FIXME data$V has to be changed
  V <- diag(nrow(V))
}
mash_data <- mashr::mash_set_data(reg[1,,], Shat = reg[2,,], V = as.matrix(V))
if (identical(U, 'auto')) {
  U <- mashr::cov_canonical(mash_data)
} else {
  ## FIXME: add other methods to get U
  U <- mashr::cov_canonical(mash_data)
}
model <- list()
if (identical(p, 'auto')) {
  # Estimate the mixture weights from the data.
  model$fitted_g <- mashr::mash(mash_data, Ulist=U, outputlevel=1, usepointmass=TRUE)$fitted_g
} else {
  ## FIXME: need to use pre-fitted pi on larger data from mash procedure
  model$fitted_g <- list(pi=p, Ulist=U, grid=grid, usepointmass=TRUE)
}

### `fit_mnm.R`

In [9]:
%save -f modules/fit_mnm.R
## Run mash with a fixed prior on the univariate summary statistics of Y on X.
##
## X, Y: predictor and response matrices; V: passed to mash_set_data after
## as.matrix(); fitted_g: mash prior, kept fixed (fixg = TRUE).
## Returns the mash fit; its result contains 'PosteriorMean' 'PosteriorSD'
## 'lfdr' 'NegativeProb' 'lfsr'.
update_mash_model <- function(X, Y, V, fitted_g) {
  sumstats <- mm_regression(X, Y)
  bhat <- sumstats[1,,]
  shat <- sumstats[2,,]
  mash_input <- mashr::mash_set_data(bhat, Shat = shat, V = as.matrix(V))
  mashr::mash(mash_input, g = fitted_g, fixg = TRUE, outputlevel=3)
}

## One pass over all single effects of the m&m model.
##
## "fitted" includes p_alpha, alpha, mu and Xr (plus s, eb, lfsr, neg which
## are filled in here). For each effect the current contribution is taken out
## of Xr, mash is re-run on the remaining residual, the effect's posterior
## summaries and inclusion weights are refreshed, and the new contribution is
## put back. Returns the updated "fitted".
update_mnmash_model <- function(X, Y, V, fitted_g, fitted) {
  ## Fitted values contributed by effect l under the state `state`
  effect_fit <- function(state, l) X %*% (state$alpha[,l] * state$mu[[l]])

  n_effects <- ncol(fitted$alpha)
  for (l in 1:n_effects) {
    ## take the l-th effect out of the running fit
    fitted$Xr <- fitted$Xr - effect_fit(fitted, l)
    ## mash on what is left
    mash_fit <- update_mash_model(X, Y - fitted$Xr, V, fitted_g)
    mash_res <- mash_fit$result
    ## store the posterior summaries of this effect
    fitted$mu[[l]] <- mash_res$PosteriorMean
    fitted$s[[l]] <- mash_res$PosteriorCov
    fitted$eb[[l]] <- mash_res$elbo_base
    fitted$lfsr[[l]] <- mash_res$lfsr
    fitted$neg[[l]] <- mash_res$NegativeProb
    ## inclusion weights: prior times Bayes factor, shifted by the maximum
    ## log10 BF before exponentiating, then normalised
    log10_bf <- mashr::get_log10bf(mash_fit)
    weights <- exp((log10_bf - max(log10_bf)) * log(10)) * fitted$p_alpha
    fitted$alpha[,l] <- weights / sum(weights)
    ## put the refreshed l-th effect back
    fitted$Xr <- fitted$Xr + effect_fit(fitted, l)
  }
  return(fitted)
}

## Initialize storage for results
## p_alpha: uniform prior weight over the columns of X
## alpha:   inclusion weights, one column per effect (all zero to start)
## mu:      per-effect posterior means, one matrix per effect (all zero)
## Xr:      running fitted values, same shape as Y
p_alpha <- rep(1, ncol(data$X)) / ncol(data$X)
alpha <- matrix(0, ncol(data$X), maxL)
mu <- lapply(1:maxL, function(i) matrix(0, ncol(data$X), ncol(data$Y)))
Xr <- matrix(0, nrow(data$Y), ncol(data$Y))
fitted <- list(p_alpha=p_alpha, alpha=alpha, mu=mu, s=list(), Xr=Xr, eb=list(), lfsr=list(), neg=list())
fitted_track <- list()

## Fit m&m model
## Runs maxI passes and keeps the state after every pass for diagnostics.
for (i in 1:maxI) {
  fitted <- update_mnmash_model(data$X, data$Y, V, model$fitted_g, fitted)
  fitted_track[[i]] <- fitted
}

## Compute posterior mean and covariances
## Posterior mean: sum over effects of alpha[,l] * mu[[l]] (row-wise weights).
post_mean <- matrix(0, ncol(data$X), ncol(data$Y))
for (l in 1:maxL) {
  post_mean <- post_mean + fitted$mu[[l]] * fitted$alpha[,l]
}
## Posterior covariance per column j of X: alpha-weighted sum over effects of
## (outer product of the mean + covariance), minus the outer product of the
## combined posterior mean.
post_cov <- array(0, dim=c(ncol(data$Y), ncol(data$Y), ncol(data$X)))
for (j in 1:ncol(data$X)) {
  for (l in 1:maxL) {
    post_cov[,,j] <- post_cov[,,j] + (fitted$mu[[l]][j,] %*% t(fitted$mu[[l]][j,]) + fitted$s[[l]][,,j]) * fitted$alpha[j,l]
  }
  post_cov[,,j] <- post_cov[,,j] - post_mean[j,] %*% t(post_mean[j,])
}

## Compute lfsr
## One row per effect: alpha-weighted column sums of that effect's lfsr.
lfsr <- do.call(rbind, lapply(1:maxL, function(l) colSums(fitted$alpha[,l] * fitted$lfsr[[l]])))
## The credible-set summaries come from susieR internals applied to t(alpha).
posterior <- list(PosteriorMean=post_mean,
                  PosteriorCov=post_cov,
                  alpha = fitted$alpha,
                  lfsr=lfsr,
                  n_in_CI=susieR:::n_in_CI(t(fitted$alpha)),
                  in_CI=susieR:::in_CI(t(fitted$alpha))
                  )

### `fit_susie.R`

In [10]:
%save -f modules/fit_susie.R
# Fit susie to every column of data$Y and attach lfsr and credible-set
# summaries (computed with susieR internals) to each fit.
fitted <- list()
for (r in 1:ncol(data$Y)) {
  fit <- susieR::susie(data$X, data$Y[,r], L=maxL, max_iter=maxI)
  fit$lfsr <- susieR:::lfsr_fromfit(fit)
  fit$n_in_CI <- susieR:::n_in_CI(fit)
  fit$in_CI <- susieR:::in_CI(fit)
  fitted[[r]] <- fit
}

# Column-bind one quantity across the per-response fits.
bind_across_fits <- function(extract) {
  do.call(cbind, lapply(fitted, extract))
}
posterior <- list(
  PosteriorMean = bind_across_fits(function(fit) susieR:::coef.susie(fit)),
  lfsr = bind_across_fits(function(fit) fit$lfsr),
  alpha = bind_across_fits(function(fit) fit$alpha),
  n_in_CI = bind_across_fits(function(fit) fit$n_in_CI),
  in_CI = bind_across_fits(function(fit) fit$in_CI)
)


### `fit_dap.py`

DAP version 1 was published as Wen et al. 2016 (AJHG). William has since polished the software as `dap-g`, accompanied by another manuscript that describes an improved algorithm and support for summary statistics. This benchmark uses DAP version 2. Below is an example of the output that I parse and save.

```
Posterior expected model size: 0.500 (sd = 0.500)
LogNC = -0.30685 ( Log10NC = -0.133 )
Posterior inclusion probability

((1))              7492 6.68581e-05       0.000 1
((2))              7490 6.68581e-05       0.000 1
((3))              7484 6.68581e-05       0.000 1
((4))              7486 6.68581e-05       0.000 1
((5))              7481 6.68581e-05       0.000 1
((6))              7476 6.68581e-05       0.000 1
((7))              7479 6.68581e-05       0.000 1
((8))              7491 6.68046e-05       0.000 2
((9))              7483 6.68046e-05       0.000 2
((10))             7485 6.68046e-05       0.000 2
((11))             7488 6.68046e-05       0.000 2
((12))             7474 6.68046e-05       0.000 2
((13))             7475 6.68046e-05       0.000 2
((14))             7478 6.68046e-05       0.000 2
((15))             7465 6.68046e-05       0.000 2
((16))             7473 6.68046e-05       0.000 2
((17))             7470 6.68046e-05       0.000 2
((18))             7467 6.68046e-05       0.000 2
((19))             7461 6.68046e-05       0.000 2
((20))             7459 6.68046e-05       0.000 2
((21))             7482 6.67422e-05       0.000 -1
((22))             7489 6.67422e-05       0.000 -1
((23))             7487 6.67422e-05       0.000 -1
((24))             7477 6.67422e-05       0.000 -1
((25))             7480 6.67422e-05       0.000 -1
((26))             7463 6.67422e-05       0.000 -1
...
Independent association signal clusters

     cluster         member_snp      cluster_pip      average_r2
       {1}              7            4.680e-04          0.951                 0.951   0.037
       {2}             13            8.685e-04          0.623                 0.037   0.623

```

In [5]:
%save -f modules/fit_dap.py
import subprocess
import pandas as pd
import numpy as np

def _parse_dap_output(lines):
    """Split tokenized dap-g output lines into SNP rows and cluster rows.

    `lines` is a list of whitespace-split lines. Everything before the
    header containing ``cluster_pip`` is treated as the SNP section, where
    only ``((k))`` rows with a non-negative cluster id are kept; everything
    after it is treated as the cluster table.
    """
    pips = []
    clusters = []
    in_snp_section = True
    for tokens in lines:
        if len(tokens) == 0:
            continue
        if len(tokens) > 2 and tokens[2] == 'cluster_pip':
            in_snp_section = False
            continue
        if in_snp_section:
            # Skip headers and SNPs that are not assigned to a cluster.
            if not tokens[0].startswith('((') or int(tokens[-1]) < 0:
                continue
            pips.append([tokens[1], float(tokens[2]), float(tokens[3]), int(tokens[4])])
        else:
            clusters.append([len(clusters) + 1, float(tokens[2]), float(tokens[3])])
    return pips, clusters


def dap_single(x, y, prefix, r, args):
    """Run dap-g for one response and parse its result file.

    Writes ``{prefix}.data`` (phenotype row followed by one genotype row per
    column of `x`) and ``{prefix}.grid``, calls the ``dap-g`` executable with
    the extra options in `args`, and reads ``{prefix}.result``.

    Parameters
    ----------
    x : 2-D array, one column per SNP.
    y : array of phenotype values.
    prefix : path prefix for the files exchanged with dap-g.
    r : label written in the third column of every data row.
    args : iterable of option strings appended to the command line.

    Returns
    -------
    dict with ``'snp'`` (per-SNP table) and ``'set'`` (per-cluster table with
    the member SNPs joined by commas).
    """
    names = np.array([('geno', i+1, str(r)) for i in range(x.shape[1])])
    with open(f'{prefix}.data', 'w') as f:
        print(*(['pheno', 'pheno', str(r)] + list(y.ravel())), file=f)
        np.savetxt(f, np.hstack((names, x.T)), fmt = '%s', delimiter = ' ')
    grid = '\n'.join(['0.0000  0.1000',
                      '0.0000  0.2000',
                      '0.0000  0.4000',
                      '0.0000  0.8000',
                      '0.0000  1.6000'])
    with open(f'{prefix}.grid', 'w') as f:
        print(grid, file=f)
    cmd = ['dap-g', '-d', f'{prefix}.data', '-g', f'{prefix}.grid', '-o', f'{prefix}.result', '--all'] + ' '.join(args).split()
    subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    # Read through a context manager so the result file handle is closed.
    with open(f'{prefix}.result') as f:
        out = [line.strip().split() for line in f]
    pips, clusters = _parse_dap_output(out)
    pips = pd.DataFrame(pips, columns = ['snp', 'snp_prob', 'snp_log10bf', 'cluster'])
    clusters = pd.DataFrame(clusters, columns = ['cluster', 'cluster_prob', 'cluster_avg_r2'])
    clusters = pd.merge(clusters, pips.groupby(['cluster'])['snp'].apply(','.join).reset_index(), on = 'cluster')
    return {'snp': pips, 'set': clusters}

def dap_batch(X, Y, prefix, *args):
    """Run dap_single on every column of Y.

    Returns a dict keyed by the zero-based column index; the files and the
    label of column ``r`` use the one-based index ``r + 1``.
    """
    results = {}
    for r in range(Y.shape[1]):
        condition = r + 1
        results[r] = dap_single(X, Y[:,r], f'{prefix}_condition_{condition}', condition, args)
    return results

### `fit_finemap.R`

In [12]:
%save -f modules/fit_finemap.R
#' FINEMAP I/O
#'
#' Write the FINEMAP input files for one data set and return their paths.
#'
#' @param z z-scores, written to `<prefix>.z` with row names and no header.
#' @param LD_file path of the LD file, recorded in the master file.
#' @param n value written in the `n-ind` column of the master file.
#' @param k values written as a single row to `<prefix>.k`.
#' @param prefix common path prefix of all files.
#' @return list of paths: z, ld, snp, config, k, log, meta.
write_finemap_sumstats <- function(z, LD_file, n, k, prefix) {
  with_suffix <- function(suffix) paste0(prefix, suffix)
  cfg <- list(z = with_suffix(".z"),
              ld = LD_file,
              snp = with_suffix(".snp"),
              config = with_suffix(".config"),
              k = with_suffix(".k"),
              log = with_suffix(".log"),
              meta = with_suffix(".master"))
  write.table(z, cfg$z, quote = FALSE, col.names = FALSE)
  write.table(t(k), cfg$k, quote = FALSE, col.names = FALSE, row.names = FALSE)
  # Master file: a header line followed by one line describing this data set.
  master_header <- "z;ld;snp;config;k;log;n-ind"
  master_row <- paste(cfg$z, cfg$ld, cfg$snp, cfg$config, cfg$k, cfg$log, n,
                      sep = ";")
  writeLines(c(master_header, master_row), cfg$meta)
  return(cfg)
}

#' Run FINEMAP.
#' http://www.christianbenner.com
#'
#' Writes the input files, calls the `finemap` executable through
#' dscrutils::run_cmd and reads back its output tables.
#'
#' @return list with `snp` (ranked SNP table), `set` (configuration table)
#'   and `ncausal` (number-of-causal table parsed from the log).
## FIXME: read the finemapr implementation for data sanity check.
## Can be useful as a general data sanity checker (in previous modules)

run_finemap <- function(z, LD_file, n, k, args = "", prefix="data")
{
  cfg <- write_finemap_sumstats(z, LD_file, n, k, prefix)
  dscrutils::run_cmd(paste("finemap --sss --log", "--in-files", cfg$meta, args))

  # per-SNP table, with SNP ids forced to character
  snp_tab <- read.table(cfg$snp, header = TRUE, sep = " ")
  snp_tab$snp <- as.character(snp_tab$snp)
  snp_tab <- rank_snp(snp_tab)

  # per-configuration table
  config_tab <- read.table(cfg$config, header = TRUE, sep = " ")

  # prior / posterior on the number of causal variants
  ncausal_tab <- finemap_extract_ncausal(cfg$log)

  list(snp = snp_tab, set = config_tab, ncausal = ncausal_tab)
}

# Order SNPs by decreasing posterior probability and add `rank` (1 = highest)
# and `snp_prob_cumsum` (cumulative share of the total probability).
# Returns the columns rank, snp, snp_prob, snp_prob_cumsum, snp_log10bf.
# seq_len() keeps an empty table empty; seq(1, n()) would yield c(1, 0).
rank_snp <- function(snp) {
  snp %>%
    arrange(desc(snp_prob)) %>%
    mutate(
        rank = seq_len(n()),
        snp_prob_cumsum = cumsum(snp_prob) / sum(snp_prob)) %>%
    select(rank, snp, snp_prob, snp_prob_cumsum, snp_log10bf)
}

# Parse the "number of causal SNPs" tables from a FINEMAP log file.
#
# Every line containing "->" is read as `<k> -> <probability>` (parentheses
# are ignored). The first occurrence of each k is labelled "prior" and any
# later occurrence "post".
#
# Returns a data frame with columns ncausal_num (integer), ncausal_prob
# (double) and type (character). vapply() keeps the column types fixed, so a
# log without any "->" line gives a zero-row data frame instead of an error.
finemap_extract_ncausal <- function(logfile)
{
  lines <- grep("->", readLines(logfile), value = TRUE)
  lines <- gsub("\\(|\\)|>", "", lines)
  # Leading whitespace yields an empty first token, hence fields 2 and 4.
  splits <- strsplit(lines, "\\s+")
  ncausal_num <- vapply(splits, function(x) as.integer(x[2]), integer(1))
  ncausal_prob <- vapply(splits, function(x) as.double(x[4]), double(1))
  type <- rep("prior", length(ncausal_num))
  type[duplicated(ncausal_num)] <- "post"
  data.frame(ncausal_num = ncausal_num,
             ncausal_prob = ncausal_prob,
             type = type,
             stringsAsFactors = FALSE)
}

# Run FINEMAP on every column of `zscore` in parallel (at most 8 workers).
# Column r uses the file prefix `<prefix>_condition_<r>`; the result is a
# list with one run_finemap() output per column.
finemap_mvar <- function(zscore, LD_file, n, k, args, prefix) {
  n_conditions <- ncol(zscore)
  fit_condition <- function(r) {
    run_finemap(zscore[,r], LD_file, n, k, args,
                paste0(prefix, '_condition_', r))
  }
  parallel::mclapply(1:n_conditions, fit_condition,
                     mc.cores = min(8, n_conditions))
}

### `fit_caviar.R`

`CAVIAR` output file (`*_post`): 
- column #1 is the variant name;
- column #2 is the [posterior prob. that the variant is causal](https://github.com/fhormoz/caviar/issues/1#issuecomment-286521771);
- column #3 is the amount that this variant contributes to 95%-causal credible set.

In [13]:
%save -f modules/fit_caviar.R
#' CAVIAR I/O
#'
#' Write the z-score file for CAVIAR and return the paths of the input and
#' of the output files CAVIAR is expected to create.
#'
#' @param z z-scores, written to `<prefix>.z` with row names and no header.
#' @param prefix common path prefix.
#' @return list of paths: z, set, post, log.
write_caviar_sumstats <- function(z, prefix) {
  suffixes <- c(z = ".z", set = "_set", post = "_post", log = ".log")
  cfg <- lapply(suffixes, function(suffix) paste0(prefix, suffix))
  write.table(z, cfg$z, quote = FALSE, col.names = FALSE)
  return(cfg)
}

#' Run CAVIAR
#' https://github.com/fhormoz/caviar
#'
#' Writes the z-score file, calls the `CAVIAR` executable through
#' dscrutils::run_cmd and reads back the `_post` and `_set` files.
#'
#' @param z z-scores for one condition.
#' @param LD_file path of the LD matrix file.
#' @param args extra command-line options.
#' @param prefix path prefix for all CAVIAR files.
#' @return list with `snp` (per-SNP table sorted by decreasing `snp_prob`,
#'   with a `rank` column) and `set` (SNP ids of the reported set, ordered by
#'   that rank). Stops if one of the output files is missing.
run_caviar <- function(z, LD_file, args = "", prefix="data")
{
  cfg <- write_caviar_sumstats(z, prefix)
  cmd <- paste("CAVIAR", "-z", cfg$z, "-l", LD_file, "-o", prefix, args)
  dscrutils::run_cmd(cmd)
  if(!all(file.exists(cfg$post, cfg$set, cfg$log))) {
      stop("Cannot find one of the post, set, and log files")
  }

  # read output tables
  snp <- read.delim(cfg$post)
  stopifnot(ncol(snp) == 3)
  names(snp) <- c("snp", "snp_prob_set", "snp_prob")
  snp$snp <- as.character(snp$snp)
  # Rank the SNPs: the ordering of the set below and the CAVIAR plot both
  # rely on a `rank` column, which the raw table does not contain.
  snp <- arrange(snp, desc(snp_prob)) %>%
    mutate(rank = seq_len(n()))

  # `set` of snps
  set <- readLines(cfg$set)
  set_ordered <- left_join(data.frame(snp = set, stringsAsFactors = FALSE),
                           snp, by = "snp") %>%
    arrange(rank) %$% snp
  return(list(snp=snp, set=set_ordered))
}

# Run CAVIAR on every column of `zscore` in parallel (at most 8 workers).
# Column r uses the file prefix `<prefix>_condition_<r>`; the result is a
# list with one run_caviar() output per column.
finemap_mcaviar <- function(zscore, LD_file, args, prefix) {
  n_conditions <- ncol(zscore)
  fit_condition <- function(r) {
    run_caviar(zscore[,r], LD_file, args, paste0(prefix, '_condition_', r))
  }
  parallel::mclapply(1:n_conditions, fit_condition,
                     mc.cores = min(8, n_conditions))
}

## Visualization

### `plot_finemap.R`

In [2]:
%save -f modules/plot_finemap.R

# Three-panel summary of one FINEMAP result: number of causal variants (A),
# top configurations (B) and top SNPs (C), arranged with cowplot::plot_grid.
# label_size, top_rank and lim_prob are shared by the panels; `...` is
# forwarded to the panel functions.
plot_finemap <- function(x,
                         grid_nrow = NULL,
                         grid_ncol = NULL,
                         label_size = 2,
                         top_rank = 5,
                         lim_prob = c(0, 1.2),
                         ...)
{
  panel_ncausal <- plot_ncausal(x, lim_prob = lim_prob, ...)
  panel_set <- plot_set(x, top_rank = top_rank, label_size = label_size,
                        lim_prob = lim_prob, ...)
  panel_snp <- plot_snp(x, top_rank = top_rank, label_size = label_size,
                        lim_prob = lim_prob, ...)

  plot_grid(panel_ncausal, panel_set, panel_snp,
            labels = "AUTO", nrow = grid_nrow, ncol = grid_ncol)
}


# Bar chart of the prior and posterior probability of each number of causal
# variants, read from x$ncausal (columns ncausal_num, ncausal_prob, type).
# The rows for zero causal variants are dropped only when they carry no
# probability. The probability column is `ncausal_prob`; looking up "prob"
# returned NULL, so the zero rows used to be dropped unconditionally.
plot_ncausal <- function(x, lim_prob, ...)
{
  ptab <- x$ncausal

  zero_prob <- sum(filter(ptab, ncausal_num == 0)[["ncausal_prob"]],
                   na.rm = TRUE)
  if (zero_prob == 0) {
    ptab <- filter(ptab, ncausal_num != 0)
  }

  ptab <- mutate(ptab,
    ncausal_num = factor(ncausal_num, levels = sort(unique(ncausal_num),
                                                    decreasing = TRUE)),
    type = factor(type, levels = c("prior", "post")))

  p <- ggplot(ptab, aes(ncausal_num, ncausal_prob, fill = type)) +
    geom_hline(yintercept = 1, linetype = 3) +
    geom_bar(stat = "identity", position = "dodge") +
    coord_flip() + theme(legend.position = "top") +
    scale_fill_manual(values = c("grey50", "orange")) +
    ylim(lim_prob)

  return(p)
}

# Lollipop chart of the first `top_rank` rows of x$set (FINEMAP
# configurations): probability on the x axis, rank on a reversed y axis,
# each point labelled with the configuration, its probability and log10 BF.
plot_set <- function(x, lim_prob, label_size, top_rank, ...)
{
  top_configs <- mutate(head(x$set, top_rank),
    label = paste0(config, "\n",
      "P = ", round(config_prob, 2),
      "; ", "log10(BF) = ", round(config_log10bf, 2)))

  base_plot <- ggplot(top_configs, aes(config_prob, rank))
  base_plot +
    geom_vline(xintercept = 1, linetype = 3) +
    geom_point() +
    geom_segment(aes(xend = config_prob, yend = rank, x = 0)) +
    geom_text(aes(label = label), hjust = 0, nudge_x = 0.025, size = label_size) +
    xlim(lim_prob) +
    scale_y_continuous(limits  = c(top_rank + 0.5, 0.5), trans = "reverse")
}


# Lollipop chart of the first `top_rank` rows of x$snp: probability on the
# x axis, row position (rank) on a reversed y axis, each point labelled with
# the SNP id, its probability and log10 BF.
# seq_len() keeps the rank column valid when the table is empty; seq(1, n())
# would produce c(1, 0) and make mutate() fail.
plot_snp <- function(x, lim_prob, label_size, top_rank, ...)
{
  ptab <- x$snp

  ptab <- head(ptab, top_rank)

  ptab <- mutate(ptab,
    rank = seq_len(n()),
    label = paste0(snp, "\n",
      "P = ", round(snp_prob, 2),
      "; ", "log10(BF) = ", round(snp_log10bf, 2)))

  ggplot(ptab, aes(snp_prob, rank)) +
    geom_vline(xintercept = 1, linetype = 3) +
    geom_point() +
    geom_segment(aes(xend = snp_prob, yend = rank, x = 0)) +
    geom_text(aes(label = label), hjust = 0, nudge_x = 0.025, size = label_size) +
    xlim(lim_prob) +
    scale_y_continuous(limits  = c(top_rank + 0.5, 0.5), trans = "reverse")
}

# Write one page per element of `result` to `plot_file`.
# seq_along() makes an empty `result` produce no pages; 1:length(result)
# would try to index element 1 and fail with the device still open.
pdf(plot_file)
for (r in seq_along(result)) {
    print(plot_finemap(result[[r]], top_rank = top_rank))
}
dev.off()

### `plot_caviar.R`

In [None]:
%save -f modules/plot_caviar.R
# Plot one CAVIAR result: a single SNP panel drawn by plot_snp().
# grid_nrow and grid_ncol are accepted but not used by this function;
# `...` is forwarded to plot_snp().
plot_caviar <- function(x,
                        grid_nrow = NULL,
                        grid_ncol = NULL,
                        label_size = 2,
                        top_rank = 5,
                        lim_prob = c(0, 1.5),
                        ...)
{
  plot_snp(x,
           label_size = label_size,
           top_rank = top_rank,
           lim_prob = lim_prob,
           ...)
}

# Lollipop chart of the `top_rank` SNPs with the highest `snp_prob` in
# x$snp: probability on the x axis, rank on a reversed y axis, each point
# labelled with the SNP id, `snp_prob` and `snp_prob_set`.
# The table is sorted and ranked here because the plot maps `rank` to the
# y axis; without a `rank` column that name resolves to the base function.
plot_snp <- function(x, label_size, top_rank, lim_prob, ...)
{
  ptab <- arrange(x$snp, desc(snp_prob))

  ptab <- head(ptab, top_rank)

  ptab <- mutate(ptab,
    rank = seq_len(n()),
    label = paste0(snp, "\n",
      "P = ", round(snp_prob, 2),
      "; ", "P(set) = ", round(snp_prob_set, 2)))

  ggplot(ptab, aes(snp_prob, rank)) +
    geom_vline(xintercept = 1, linetype = 3) +
    geom_point() +
    geom_segment(aes(xend = snp_prob, yend = rank, x = 0)) +
    geom_text(aes(label = label), hjust = 0, nudge_x = 0.025, size = label_size) +
    xlim(lim_prob) +
    scale_y_continuous(limits  = c(top_rank + 0.5, 0.5), trans = "reverse")
}

# Write one page per element of `result` to `plot_file`.
# seq_along() makes an empty `result` produce no pages; 1:length(result)
# would try to index element 1 and fail with the device still open.
pdf(plot_file)
for (r in seq_along(result)) {
    print(plot_caviar(result[[r]], top_rank = top_rank))
}
dev.off()

### `plot_dap.R`

In [6]:
%save -f modules/plot_dap.R


# Two-panel summary of one DAP result: signal clusters (A) and top SNPs (B),
# arranged with cowplot::plot_grid (2 rows by 1 column by default).
# label_size, top_rank and lim_prob are shared by both panels; `...` is
# forwarded to the panel functions.
plot_dap <- function(x,
                     grid_nrow = 2,
                     grid_ncol = 1,
                     label_size = 2,
                     top_rank = 5,
                     lim_prob = c(0, 1.2),
                     ...)
{
  panel_set <- plot_set(x, top_rank = top_rank, label_size = label_size,
                        lim_prob = lim_prob, ...)
  panel_snp <- plot_snp(x, top_rank = top_rank, label_size = label_size,
                        lim_prob = lim_prob, ...)

  plot_grid(panel_set, panel_snp,
            labels = "AUTO", nrow = grid_nrow, ncol = grid_ncol)
}


# Lollipop chart of the first `top_rank` rows of x$set (DAP signal
# clusters): cluster probability on the x axis, cluster index on a reversed
# y axis, each point labelled with the member SNPs, the cluster probability
# and the average r^2. The y range adapts to the number of rows shown.
plot_set <- function(x, lim_prob, label_size, top_rank, ...)
{
  top_clusters <- mutate(head(x$set, top_rank),
    label = paste0(snp, "\n",
      "P = ", round(cluster_prob, 2),
      "; ", "avg(r^2) = ", round(cluster_avg_r2, 2)))
  y_upper <- min(top_rank, nrow(top_clusters)) + 0.5

  base_plot <- ggplot(top_clusters, aes(cluster_prob, cluster))
  base_plot +
    geom_vline(xintercept = 1, linetype = 3) +
    geom_point() +
    geom_segment(aes(xend = cluster_prob, yend = cluster, x = 0)) +
    geom_text(aes(label = label), hjust = 0, nudge_x = 0.025, size = label_size) +
    xlim(lim_prob) +
    scale_y_continuous(limits  = c(y_upper, 0.5), trans = "reverse")
}


# Lollipop chart of the first `top_rank` rows of x$snp: probability on the
# x axis, row position (rank) on a reversed y axis, each point labelled with
# the SNP id, its probability and log10 BF.
# seq_len() keeps the rank column valid when the table is empty; seq(1, n())
# would produce c(1, 0) and make mutate() fail.
plot_snp <- function(x, lim_prob, label_size, top_rank, ...)
{
  ptab <- x$snp

  ptab <- head(ptab, top_rank)

  ptab <- mutate(ptab,
    rank = seq_len(n()),
    label = paste0(snp, "\n",
      "P = ", round(snp_prob, 2),
      "; ", "log10(BF) = ", round(snp_log10bf, 2)))

  ggplot(ptab, aes(snp_prob, rank)) +
    geom_vline(xintercept = 1, linetype = 3) +
    geom_point() +
    geom_segment(aes(xend = snp_prob, yend = rank, x = 0)) +
    geom_text(aes(label = label), hjust = 0, nudge_x = 0.025, size = label_size) +
    xlim(lim_prob) +
    scale_y_continuous(limits  = c(top_rank + 0.5, 0.5), trans = "reverse")
}

# Write one page per element of `result` to `plot_file`.
# seq_along() makes an empty `result` produce no pages; 1:length(result)
# would try to index element 1 and fail with the device still open.
pdf(plot_file)
for (r in seq_along(result)) {
    print(plot_dap(result[[r]], top_rank = top_rank))
}
dev.off()