# Multivariate fine-mapping workflow

This notebook provides workflows that apply mvSuSiE to multivariate fine-mapping data analysis.

Three versions of the pipeline are implemented:
        
1. Individual level data input `X`, `Y` and `C` (for covariates).
2. Sufficient statistics input `XtX`, `XtY`, `YtY` and `n`. We assume the effects of covariates `C` have already been removed from `X` and `Y`; a preprocessing procedure that performs this removal is provided.
3. GWAS summary statistics input `z` and `R`. We assume `z` scores have been computed after removal of covariates `C`.

## Input

Several file formats are supported:

1. RDS format of a list of objects, in which case you can specify the names of objects corresponding to the quantities `X`, `Y`, `XtX`, etc.
2. `pgen`/`psam`/`pvar` bundle for genotypes and text file for phenotypes.
3. `bed`/`fam`/`bim` bundle for genotypes and text file for phenotypes.

## Output

For each analysis unit we output:

1. Analysis-ready data set in RDS format (stored in the `cache` directory, so it can be removed at any time)
2. Analysis results in RDS format
3. Default visualization plots

## Analysis examples

```
sos run mvSuSiE.ipynb complete_data_analysis \
    --analysis-units data/27_brain_non_brain_genes_v8.txt \
    --data-dir /project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready \
    --data-suffix GTEx_V8.rds \
    --name 20210409 \
    --wd /project2/compbio/GTEx_eQTL/mvSuSiE_output/cis_results \
    --prior /project2/compbio/GTEx_eQTL/mvSuSiE_output/GTEx_V8_strong_z.teem.rds \
    -c midway2.yml -q midway2
```

In [None]:
# Launch the `complete_data_preprocess` workflow in the background with nohup.
# NOTE(review): every path below belongs to one user's environment; adapt before reuse.
# NOTE(review): `-s build` presumably selects SoS's "build" signature mode -- confirm this is intended.
nohup sos run /home/hs3163/GIT/bioworkflows/multivariate-fine-mapping/mvSuSiE.ipynb complete_data_preprocess  \
    --analysis-units /home/hs3163/Project/mwe/data/rgs_lst \
    --data-dir /home/hs3163/Project/mwe/data/ \
    --data-suffix GTEx_V8.rds \
    --name 20210509 \
    --wd ~/Project/mwe/new_mvsusie \
    --container /mnt/mfs/statgen/containers/twas_latest.sif \
    --prior /home/hs3163/Project/mwe/data/GTEx_V8_strong_z.teem.rds \
    -J 200 -q csg -c ~/GIT/neuro-twas/code/csg.yml -s build &



# `sos dryrun` of the `complete_data_analysis` workflow with the same inputs.
nohup sos dryrun /home/hs3163/GIT/bioworkflows/multivariate-fine-mapping/mvSuSiE.ipynb complete_data_analysis \
    --analysis-units /home/hs3163/Project/mwe/data/rgs_lst \
    --data-dir /home/hs3163/Project/mwe/data/ \
    --data-suffix GTEx_V8.rds \
    --name 20210509 \
    --wd /mnt/mfs/statgen/neuro-twas/mvsusie_polish \
    --container /mnt/mfs/statgen/containers/twas_latest.sif \
    --prior /home/hs3163/Project/mwe/data/GTEx_V8_strong_z.teem.rds 

# `sos dryrun` of the `complete_data_preprocess` workflow with the same inputs.
nohup sos dryrun /home/hs3163/GIT/bioworkflows/multivariate-fine-mapping/mvSuSiE.ipynb complete_data_preprocess \
    --analysis-units /home/hs3163/Project/mwe/data/rgs_lst \
    --data-dir /home/hs3163/Project/mwe/data/ \
    --data-suffix GTEx_V8.rds \
    --name 20210509 \
    --wd /mnt/mfs/statgen/neuro-twas/mvsusie_polish \
    --container /mnt/mfs/statgen/containers/twas_latest.sif \
    --prior /home/hs3163/Project/mwe/data/GTEx_V8_strong_z.teem.rds 

In [1]:
[global]
import glob
# Single-column text file; each non-empty line that does not start with `#`
# names one analysis unit (the data file name, e.g. a gene name)
parameter: analysis_units = path
# Path to data directory
parameter: data_dir = path
# An identifier for your run of analysis
parameter: name = str
# data file suffix
parameter: data_suffix = str
# data file prefix
parameter: data_prefix = ""
# Path to work directory where output locates
parameter: wd = path("./output")
# Path to prior data file: an RDS file with `U` and `w` for prior matrices and weights
parameter: prior = path('.')
# Path to residual cor/cov data file
parameter: resid_cor = path('.')
# Only analyze `cis` variants -- cis = N means using N variants around the center column of X matrix
parameter: cis = 'NULL'
# Read the analysis units inside a `with` block so the file handle is closed
# as soon as the list has been built.
with open(analysis_units) as unit_file:
    regions = [x.strip() for x in unit_file if x.strip() and not x.strip().startswith('#')]
# Keep only the units whose data file `<data_dir>/<unit>.<data_suffix>` exists;
# each candidate path is built once and then tested.
genes = [f for f in (f"{data_dir:a}/{x}.{data_suffix}" for x in regions) if path(f).exists()]
# Containers that contains the necessary packages
parameter: container = 'gaow/twas'

In [None]:
[plink_merging_1]
# Path to a list of molecular phenotypes that are to be merged. Each listed
# directory is expected to hold `cache/<data_prefix>.<region>` PLINK files.
# Declared before `input:` because the input statement refers to it.
parameter: molecular_pheno_dir = path
# Variant list handed to `plink --extract`
parameter: extract_snp = path
# Value used for the task option `trunk_size`
parameter: job_size = 1
with open(molecular_pheno_dir) as pheno_list:
    molecular_pheno = [x.strip().split() for x in pheno_list if x.strip() and not x.strip().startswith('#')]
# NOTE(review): `regions` in [global] is a list of strings, so `_regions[0]` is the
# first character of the region name -- confirm whether `_regions` was intended.
input:  molecular_pheno_dir, for_each = "regions"
output: f'{wd:a}/cache/{data_prefix}.{_regions[0]}.merged.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = '4h',  mem = '60G', tags = f'{step_name}_{_output[0]:bn}'

# The step declares a single output file, so the log names are derived from it
# (the previous `_output[2]` / `_output[1]` indices do not exist).
R: expand = "$[ ]", stderr = f'{_output:n}.R.stderr', stdout = f'{_output:n}.R.stdout', container = container
    library("dplyr")
    library("tibble")
    library("plink2R")
    library("purrr")
    library("readr")
    molecular_pheno = read_delim("$[molecular_pheno_dir]", delim = "\t")
    # PLINK prefix of this region inside each phenotype directory
    molecular_pheno = molecular_pheno %>%
        mutate(dir = paste0(`#molc_pheno`, "/cache/$[data_prefix].$[_regions[0]]"))
    n = nrow(molecular_pheno)
    # For every tissue read the PLINK files and keep sample name plus column V6 of the fam table.
    # NOTE(review): column 2 is the `dir` column only when the list file has a single column.
    fam_list = map(seq_len(n), ~read_plink(molecular_pheno[[.x, 2]])$fam %>%
                                   as_tibble() %>%
                                   mutate(name = paste(V1, ":", V2, sep = "")) %>%
                                   select(name, V6))

    # Full-join all tissues on the sample name; also works when only one tissue is listed
    genos_join_phe = reduce(fam_list, function(a, b) full_join(a, b, by = "name"))
    genos_join_phe %>% readr::write_delim("$[_output[0]:n].merged.exp", delim = "\t")

    # Create merge list
    molecular_pheno[2] %>% readr::write_delim("$[_output[0]:n].list", delim = "\t", col_names = FALSE)


bash: expand = "$[ ]", stderr = f'{_output:n}.plink.stderr', stdout = f'{_output:n}.plink.stdout', container = container
    # create the merged output X
    plink --bfile '$[next(iter(molecular_pheno[0]))]/cache/$[data_prefix].$[_regions[0]]' \
          --merge-list $[_output[0]:n].list \
          --mac 1 \
          --make-bed \
          --out $[_output[0]:n] \
          --allow-no-sex \
          --extract $[extract_snp]

In [None]:
# This needs pgenlibr package
# devtools::install_github("chrchang/plink-ng", subdir="/2.0/pgenlibr")
[complete_data_preprocess]
# For RDS data: names of the list elements holding X, Y and Z
parameter: x_table=""
parameter: y_table=""
parameter: z_table=""
# For PLINK data
# Phenotype file, assuming a header
parameter: phenoFile = path('.')
parameter: phenoCols = []
parameter: covarFile = path('.')
parameter: covarCols = []
input: genes, group_by = 1
# One analysis-ready RDS per input file, named after the input's basename
# (the previous `_genes` variable is not defined in this workflow).
output: f'{wd:a}/{_input:bn}.analysis_ready.rds'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h', mem = '12G', cores = 2, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container

    # Regress the covariates Z (with an intercept) out of X and Y by least squares.
    #
    # X: numeric matrix with one row per sample.
    # Z: covariate matrix with the same rows as X; a column of ones is prepended
    #    unless its first column is already all ones.
    # Y: numeric vector or matrix of responses; a vector becomes a one-column matrix.
    #
    # Returns a list with the residualized `X`, the (matrix-shaped) input `Y`,
    # the design matrix `Z` that was used and `y_res`, the residualized Y.
    remove_covariate_effects <- function (X, Z, Y) {
      # include the intercept term
      if (any(Z[,1]!=1)) Z = cbind(1, Z)
      # make Y a matrix
      if (is.null(dim(Y))) Y = matrix(Y, length(Y), 1)
      A   <- Matrix::forceSymmetric(crossprod(Z))
      # Coefficients of Z for every column of X and Y: (Z'Z)^{-1} Z'X and (Z'Z)^{-1} Z'Y.
      # (A former line computing `SZy` from an undefined `y` was removed; its
      # result was never used and it made the function fail.)
      SZX <- as.matrix(solve(A,t(Z) %*% X))
      SZY <- as.matrix(solve(A,t(Z) %*% Y))
      X <- X - Z %*% SZX
      y_res <- Y - Z %*% SZY
      return(list(X = X,Y = Y, Z = Z, y_res = y_res))
    }
  
    # read PLINK files
    # Load the `.pvar` file that sits next to `pgen` and return its
    # chrom / id / pos / alt / ref columns.
    read_pvar <- function(pgen){
      pvar_path <- paste0(tools::file_path_sans_ext(pgen), ".pvar")
      variants <- dplyr::rename(
        data.table::fread(pvar_path, skip = "#CHROM"),
        "chrom" = "#CHROM", "pos" = "POS",
        "alt" = "ALT", "ref" = "REF", "id" = "ID"
      )
      variants[, c("chrom", "id", "pos", "alt", "ref")]
    }
    
    # Load the `.bim` file that sits next to `bed` and label its six columns.
    read_bim <- function(bed) {
      bim_path <- paste0(tools::file_path_sans_ext(bed), ".bim")
      variants <- data.table::fread(bim_path)
      colnames(variants) <- c("chrom", "id", "gpos", "pos", "a1", "a0")
      variants
    }
    
    # Load the `.psam` file that sits next to `pgen`; the first two columns
    # are renamed to FID and IID.
    read_psam <- function(pgen) {
      psam_path <- paste0(tools::file_path_sans_ext(pgen), ".psam")
      samples <- data.table::fread(psam_path, header = TRUE)
      colnames(samples)[1:2] <- c("FID", "IID")
      samples
    }
  
    # Load the header-less `.fam` file that sits next to `bed`.
    read_fam <- function(bed) {
        fam_path <- paste0(tools::file_path_sans_ext(bed), ".fam")
        data.table::fread(fam_path, header = FALSE)
    }

    # open pgen/pvar PLINK 2 data format
    # Thin wrapper: hands the file path to pgenlibr::NewPgen and returns its result.
    open_pgen <- function(pgenf){
        pgenlibr::NewPgen(pgenf)
    } 

    # open bed/bim/fam: A PLINK 1 .bed is a valid .pgen
    # The sample count passed to NewPgen is the number of rows of the matching .fam file.
    open_bed <- function(bed){
        n_samples <- nrow(read_fam(bed))
        pgenlibr::NewPgen(bed, raw_sample_ct = n_samples)
    }

    # Read genotypes from an opened pgen object through pgenlibr::ReadList.
    #
    # pgen:       object returned by open_pgen() / open_bed().
    # variantidx: indices of the variants to read; NULL (default) reads all of them.
    # meanimpute: forwarded to pgenlibr::ReadList.
    read_pgen <- function(pgen, variantidx = NULL, meanimpute = FALSE) {
      if (is.null(variantidx)) {
        # seq_len() yields an empty index for a file without variants,
        # whereas `1:0` would request variants 1 and 0.
        variantidx <- seq_len(pgenlibr::GetVariantCt(pgen))
      }

      pgenlibr::ReadList(pgen,
                         variant_subset = variantidx,
                         meanimpute = meanimpute)
    }

    # Load X (genotypes), Y (phenotypes) and optionally Z (covariates) for the
    # current input file, align their rows and center Y and Z.
    genof = ${_input:ar}
    ext = tools::file_ext(genof)
    if (ext == 'rds') {
      dat = readRDS(genof)
      X = dat$${x_table}
      Y = dat$${y_table}
      Z = dat$${z_table}
    } else if (ext == 'pgen' || ext == 'bed') {
      if (ext == 'pgen') {
        X = read_pgen(open_pgen(genof))
      } else {
        X = read_pgen(open_bed(genof))
      }
      # NOTE(review): `read_sample` is not defined in this step and still has to be written.
      Y = read_sample(${phenoFile:r}, c(${paths(phenoCols):,r}))
      # The default covarFile is a directory ('.'), which must not count as "provided"
      if (file.exists(${covarFile:r}) && !dir.exists(${covarFile:r})) {
          Z = read_sample(${covarFile:r}, c(${paths(covarCols):,r})) # return this: %>% select(-FID, -IID) %>% as.matrix
      } else {
          Z = NULL
      }
    } else {
      stop("Unsupported genotype format")
    }
    # match X and Y data (drop = FALSE keeps a one-column Y as a matrix)
    match.idx = match(rownames(X), rownames(Y))
    Y = Y[match.idx, , drop = FALSE]
    # center Y
    Y = sweep(Y, 2, colMeans(Y), '-')
    if (!is.null(Z)) {
        match.idx = match(rownames(Y), rownames(Z))
        Z = Z[match.idx, , drop = FALSE]
        Z = sweep(Z, 2, colMeans(Z), '-')
    }

    # FIXME: not working; to be completed
    # FIXME: need to remove Z from X and Y
    # https://github.com/gaow/mvarbvs/blob/master/workflow/GTEx_V8_preprocessing.ipynb
    # What if different Y have different missing? We cannot remove Z from X then ...
    # NOTE(review): nothing is written to the declared output file yet.

In [None]:
[complete_data_analysis_1]
# Names of the list elements in the input RDS holding the genotype and phenotype matrices
parameter: x_table = 'X'
parameter: y_table = 'y_res'
# remove a variant if it has more than imiss missing individual data
parameter: imiss = 0.1
parameter: maf = 0.05
parameter: max_L = 5
# `_cis_<N>` tag added to every output name when `cis` is set; empty otherwise
cis_tag = f"_cis_{cis}" if cis != "NULL" else ""
input: genes, group_by = 1
output: mvsusie = f'{wd:a}/{_input:bn}{cis_tag}_{name}.mvsusie.rds',
        susie = f'{wd:a}/{_input:bn}{cis_tag}_{name}.susie.rds',
        ss = f'{wd:a}/{_input:bn}{cis_tag}_{name}.sumstat.rds',
        vary = f'{wd:a}/{_input:bn}{cis_tag}_{name}.covY_flash.rds'
# Alternative kept for reference (36 substeps per task):
# task: trunk_workers = 1, trunk_size = 36, walltime = '36h', mem = '55G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
task: trunk_workers = 1, trunk_size = 1, walltime = '36h', mem = '55G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output[0]:nn}.stdout", stderr = f"{_output[0]:nn}.stderr", container = container
    
    ###
    # Utility functions
    ###
    # Minor allele frequency of one variant: half the mean genotype value
    # (missing values ignored), folded so that the result is at most 0.5.
    compute_maf <- function(geno){
      allele_freq <- mean(geno, na.rm = TRUE) / 2
      min(allele_freq, 1 - allele_freq)
    }

    # Fraction of entries of `geno` that are NA.
    compute_missing <- function(geno){
      n_missing <- sum(is.na(geno))
      n_missing / length(geno)
    }
    
    # Replace every NA in a genotype matrix by the mean of the observed
    # values in the same column.
    #
    # geno: matrix with variants in columns.
    # Returns the matrix with missing entries filled in; a matrix without
    # columns is returned unchanged.
    mean_impute <- function(geno){
      col_idx <- seq_len(ncol(geno))
      col_means <- vapply(col_idx, function(j) mean(geno[, j], na.rm = TRUE), numeric(1))
      # seq_len() keeps the loop empty for zero columns (`1:0` would visit columns 1 and 0)
      for (j in col_idx) {
        geno[is.na(geno[, j]), j] <- col_means[j]
      }
      return(geno)
    }

    # TRUE when `x` contains exactly one distinct value (NA counts as a value).
    is_zero_variance <- function(x) {
      n_distinct <- length(unique(x))
      n_distinct == 1
    }
  
    # Filter the columns (variants) of a genotype matrix, then mean-impute.
    #
    # Columns are removed, in this order, when their missing rate exceeds
    # `missing_rate_thresh`, when their minor allele frequency is below
    # `maf_thresh`, or when they hold a single distinct value.
    # `drop = FALSE` keeps X a matrix even if only one column survives.
    filter_X <- function(X, missing_rate_thresh, maf_thresh) {
        rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, is_zero_variance))
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        return(mean_impute(X))
    }

    # Estimate a covariance matrix for the columns of Y from a flashier fit.
    #
    # Y:           numeric matrix, one column per condition.
    # error_cache: optional file path. When given, a failure of the FLASH fit is
    #              recorded there (data and message), warnings are issued and the
    #              identity matrix is used instead; when NULL the error is re-raised.
    #
    # The estimate is converted to a correlation matrix and rescaled with the
    # per-column standard deviations of Y.
    compute_cov_flash <- function(Y, error_cache = NULL){
        covar <- diag(ncol(Y))
        tryCatch({
        fl <- flashier::flash(Y, var.type = 2, prior.family = c(flashier::prior.normal(), flashier::prior.normal.scale.mix()), backfit = TRUE, verbose.lvl=0)
        # `nrow =` is required: diag() of a length-one vector would otherwise
        # build an identity matrix of that size instead of a 1 x 1 matrix.
        resid_var <- diag(fl$residuals.sd^2, nrow = length(fl$residuals.sd))
        if(fl$n.factors==0){
          covar <- resid_var
        } else {
          fsd <- sapply(fl$fitted.g[[1]], '[[', "sd")
          covar <- resid_var + crossprod(t(fl$flash.fit$EF[[2]]) * fsd)
        }
        if (nrow(covar) == 0) {
          covar <- diag(ncol(Y))
          stop("Computed covariance matrix has zero rows")
        }
        }, error = function(e) {
          if (!is.null(error_cache)) {
            # store the message text; the condition is signalled once below
            saveRDS(list(data=Y, message=conditionMessage(e)), error_cache)
            warning("FLASH failed. Using Identity matrix instead.")
            warning(e)
          } else {
            stop(e)
          }
        })
        s <- apply(Y, 2, sd, na.rm=TRUE)
        if (length(s)>1) s = diag(s)
        else s = matrix(s,1,1)
        covar <- s%*%cov2cor(covar)%*%s
        return(covar)
    }
  
    # Diagonal covariance matrix holding the per-column variances of Y
    # (missing values ignored).
    compute_cov_diag <- function(Y){
        col_var <- apply(Y, 2, var, na.rm=TRUE)
        # `nrow =` guards the single-column case: diag(v) with a scalar v would
        # return an identity matrix of size floor(v) instead of the 1 x 1 matrix [v].
        covar <- diag(col_var, nrow = length(col_var))
        return(covar)
    }

    # Indices of a window of width k centred on n/2, clipped to 1..n.
    # A NULL k selects every index 1..n.
    get_center <- function(k,n) {
      if (is.null(k)) {
          return(1:n)
      }
      half_n <- n/2
      half_k <- k/2
      # clip both ends so the window never leaves the valid range
      lower <- max(1, floor(half_n - half_k))
      upper <- min(n, floor(half_n + half_k))
      lower:upper
    }
    
    # Positions of the column names of Y within the column names of U.
    # Returns NULL when either matrix has no column names or when the two
    # sets of names are already identical.
    get_prior_indices <- function(Y, U) {
      y_names <- colnames(Y)
      u_names <- colnames(U)
      needs_lookup <- !is.null(y_names) && !is.null(u_names) && !identical(y_names, u_names)
      if (needs_lookup) match(y_names, u_names) else NULL
    }
    
    ###
    # Core code
    ###
    # Load the data, obtain the residual covariance of Y (read from the fourth
    # output file when it already exists), then run per-condition SuSiE,
    # univariate summary statistics and finally mvSuSiE.
    dat = readRDS(${_input:r})
    y_res = dat$${y_table}
    if (file.exists(${_output[3]:r})) {
      resid_Y = readRDS(${_output[3]:r})
    } else {
      resid_Y = compute_cov_flash(y_res)
      saveRDS(resid_Y, ${_output[3]:r})
    }
    prior = readRDS(${prior:r})
    print(paste("Number of components in the mixture prior:", length(prior$U)))
    prior = mvsusieR::create_mash_prior(mixture_prior=list(weights=prior$w, matrices=prior$U), include_indices = get_prior_indices(y_res, prior$U[[1]]), max_mixture_len=-1)
    X = filter_X(dat$${x_table}, ${imiss}, ${maf})
    # drop = FALSE keeps X a matrix even when a single variant is selected
    X = X[, get_center(${cis}, ncol(X)), drop = FALSE]
    print(paste("Dimension of X matrix:", nrow(X), ncol(X)))
    print(paste("Dimension of Y matrix:", nrow(y_res), ncol(y_res)))

    # Fine-mapping with SuSiE, one condition at a time on its non-missing samples
    fitted = list()
    non_missing = list()
    for (r in seq_len(ncol(y_res))) {
        non_missing[[r]] = which(!is.na(y_res[,r]))
        X_r = X[non_missing[[r]], , drop = FALSE]
        st = proc.time()
        fitted[[r]] <- susieR::susie(X_r, y_res[non_missing[[r]],r],
                           L=${max_L},
                           max_iter=1000,
                           estimate_residual_variance=TRUE,
                           estimate_prior_variance=TRUE,
                           refine=TRUE)
        fitted[[r]]$time = proc.time() - st
        fitted[[r]]$cs_corr = susieR:::get_cs_correlation(fitted[[r]], X=X_r)
    }

    saveRDS(fitted, ${_output[1]:r})

    # GWAS Summary statistics
    univariate_res = lapply(seq_len(ncol(y_res)), function(r) susieR:::univariate_regression(X[non_missing[[r]], , drop = FALSE], y_res[non_missing[[r]], r]))
    bhat = do.call(cbind, lapply(univariate_res, function(u) u$betahat))
    sbhat = do.call(cbind, lapply(univariate_res, function(u) u$sebetahat))
    saveRDS(list(bhat=bhat, sbhat=sbhat), ${_output[2]:r})
    # release the per-condition results before the multivariate fit
    rm(bhat)
    rm(sbhat)
    rm(fitted)
    rm(non_missing)
    # Multivariate fine-mapping
    st = proc.time()
    mv_res = mvsusieR::mvsusie(X, y_res, L=${max_L}, 
                              prior_variance=prior, residual_variance=resid_Y, 
                              precompute_covariances=FALSE, compute_objective=TRUE, 
                              estimate_residual_variance=FALSE, estimate_prior_variance=TRUE, estimate_prior_method='EM',
                              max_iter = 100, n_thread=1, approximate=FALSE)
    mv_res$time = proc.time() - st
    mv_res$cs_corr = susieR:::get_cs_correlation(mv_res, X=X)
    saveRDS(mv_res, ${_output[0]:r})

In [None]:
# Convert LD store file to RDS format
[ldstore_to_rds]
# Directory searched for `<name>*.matrix` files
parameter: ld_dir = path
ld_files = glob.glob(f"{ld_dir:a}/{name}*.matrix")
input: ld_files, group_by = 1
# Written to the work directory `wd` from [global]; the previously used
# `cwd` is not defined in this workflow.
output: f"{wd:a}/{_input:bn}.ld.rds"
task: trunk_workers = 1, trunk_size = 1, walltime = '12h', mem = '20G', cores = 2, tags = f'{step_name}_{_output:bn}'
R: expand = "${ }", container = container
    # Read the matrix file with fread and store it as a plain R matrix
    ld = as.matrix(data.table::fread(${_input:r}))
    saveRDS(ld, ${_output:r})

In [None]:
[sufficient_summary_stats_preprocessing]
parameter: phenoFile = path
parameter: covarFile = path
# path to z score file
parameter: z_dir = path()
parameter: z_suffix = str
# path to LD file
parameter: ld_dir = path()
parameter: ld_suffix = str
input: genes, group_by = 1
output: suffstats = f"{wd:a}/{_input:bn}.sufficient_stats.rds", 
        sumstats =  f"{wd:a}/{_input:bn}.summary_stats.rds"
task: trunk_workers = 1, trunk_size = 1, walltime = '4h', mem = '200G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output[0]:nn}.stdout", stderr = f"{_output[0]:nn}.stderr", container = container
    # Build covariate-adjusted sufficient statistics (XtX, XtY, YtY, N) and
    # summary statistics (Z, LD) for one analysis unit.
    # FIXME: in practice we might need to
    geno_file = ${_input:nr}
    z.file = "${z_dir:a}/${_input:bn}.${z_suffix}"
    ld.file = "${ld_dir:a}/${_input:bn}.${ld_suffix}"
    library(data.table)
    library(dplyr)

    X <- fread(paste0(geno_file, '.raw.gz'),sep = "\t",header = TRUE,stringsAsFactors = FALSE)
    # the first six columns carry the sample information; the rest are genotypes
    map <- X[,1:6]
    X = X[, c('FID','IID','PAT','MAT','SEX', 'PHENOTYPE') := NULL]
    X <- as.matrix(X)

    X.info = fread(paste0(geno_file, '.pvar'),sep = "\t",header = TRUE,stringsAsFactors = FALSE)

    # Read phenotype data
    cat("Reading phenotype data.\n")
    pheno <- suppressMessages(fread(${phenoFile:r}))

    cat("Reading covariate file.\n")
    Z = suppressMessages(fread(${covarFile:r}))

    # order phenotypes and covariates like the genotype samples
    match.idx = match(map$IID, pheno$IID)
    pheno = pheno[match.idx,]
    match.idx = match(map$IID, Z$IID)
    Z = Z[match.idx,]

    Y = pheno %>% select(-FID, -IID) %>% as.matrix
    Z = Z %>% select(-FID, -IID) %>% as.matrix

    # centering
    Y = sweep(Y, 2, colMeans(Y), '-')
    Z = sweep(Z, 2, colMeans(Z), '-')

    A   <- crossprod(Z) # Z'Z
    # chol decomposition for (Z'Z)^(-1)
    R_chol = chol(solve(A)) # R'R = (Z'Z)^(-1)
    W = R_chol %*% crossprod(Z, X) # RZ'X
    S = R_chol %*% crossprod(Z, Y) # RZ'Y

    SNPnames = colnames(X)
    rm(X)
    rm(Z)

    zscores = readRDS(z.file)

    # Load LD matrix from raw genotype
    ld = readRDS(ld.file)
    XtX = sqrt(zscores$XtXD) * t(ld*sqrt(zscores$XtXD)) - crossprod(W) # W'W = X'ZR'RZ'X = X'Z(Z'Z)^{-1}Z'X
    XtX = as.matrix(XtX)
    rownames(XtX) = colnames(XtX) = SNPnames
    R = cov2cor(XtX)

    # X'Y
    ## flip sign because X flip the REF, ALT
    XtY = -as.matrix(zscores$XtY - crossprod(W, S)) # W'S = X'ZR'RZ'y = X'Z(Z'Z)^{-1}Z'y

    # YtY
    YtY = as.matrix(crossprod(Y) - crossprod(S))

    Z = as.matrix(zscores$Z)
    rownames(Z) = SNPnames

    meta = zscores$pos[,1:5]
    # all.equal() returns a character vector describing the differences when the
    # objects differ, so it has to be wrapped in isTRUE() before negation.
    if(!isTRUE(all.equal(meta, X.info, check.attributes = FALSE))){
        stop("ALLELE doesn't match.")
    }

    saveRDS(list(XtX = XtX, XtY = XtY, YtY = YtY, N = nrow(Y), meta = zscores$pos), ${_output["suffstats"]:r})
    saveRDS(list(Z = Z, LD = R, meta = zscores$pos, ld.file = ld.file), ${_output["sumstats"]:r})

In [None]:
[univariate_analysis_1]
parameter: max_L = 10
input: genes, group_by = 1
output: suff = f"{wd:a}/{_input:bnn}.susiesuff.rds", 
        rss_rem_covariates =  f"{wd:a}/{_input:bnn}.susierss_rem_covariates.rds",
        rss_notrem_covariates =  f"{wd:a}/{_input:bnn}.susierss_notrem_covariates.rds"
task: trunk_workers = 1, trunk_size = 1, walltime = '5h', mem = '55G', cores = 1, tags = f'{step_name}_{_output[0]:bnn}'
R: expand = '${ }', stdout = f"{_output[0]:nn}.stdout", stderr = f"{_output[0]:nn}.stderr", container = container
    # Per-condition SuSiE fits from (a) sufficient statistics, (b) z-scores with
    # the LD stored in the summary-statistics file and (c) z-scores with the LD
    # matrix read from `ld.file`.
    library(susieR)
    dat_suff = readRDS('${_input:nn}.sufficient_stats.rds')
    dat_rss = readRDS('${_input:nn}.summary_stats.rds')
    R = readRDS(dat_rss$ld.file)
    rownames(R) = colnames(R) = rownames(dat_rss$LD)
    n_cond = ncol(dat_suff$XtY)
    fitted_suff = vector("list", n_cond)
    fitted_rss_rem_covariates = vector("list", n_cond)
    fitted_rss_notrem_covariates = vector("list", n_cond)
    # seq_len() keeps the loop empty when there are no conditions
    for (r in seq_len(n_cond)) {
        ## sufficient stats
        st = proc.time()
        fitted_suff[[r]] <- susieR::susie_suff_stat(XtX = dat_suff$XtX, 
                                               Xty = dat_suff$XtY[,r],
                                               yty = dat_suff$YtY[r,r], n = dat_suff$N,
                                               L=${max_L},
                                               max_iter=1000,
                                               estimate_residual_variance=TRUE,
                                               estimate_prior_variance=TRUE,
                                               refine=TRUE)
        fitted_suff[[r]]$time = proc.time() - st
        fitted_suff[[r]]$cs_corr = susieR:::get_cs_correlation(fitted_suff[[r]], Xcorr=cov2cor(dat_suff$XtX))

        ## rss, LD correct for covariates
        st = proc.time()
        fitted_rss_rem_covariates[[r]] <- susieR::susie_rss(z = dat_rss$Z[,r],
                                                            R = dat_rss$LD,
                                                            L=${max_L},
                                                            max_iter=1000,
                                                            estimate_prior_variance=TRUE,
                                                            refine=TRUE)
        fitted_rss_rem_covariates[[r]]$time = proc.time() - st
        fitted_rss_rem_covariates[[r]]$cs_corr = susieR:::get_cs_correlation(fitted_rss_rem_covariates[[r]], 
                                                                             Xcorr=dat_rss$LD)

        ## rss, LD not correct for covariates
        st = proc.time()
        fitted_rss_notrem_covariates[[r]] <- susieR::susie_rss(z = dat_rss$Z[,r],R = R,
                                                               L=${max_L},max_iter=1000,
                                                               estimate_prior_variance=TRUE,
                                                               refine=TRUE)
        fitted_rss_notrem_covariates[[r]]$time = proc.time() - st
        fitted_rss_notrem_covariates[[r]]$cs_corr = susieR:::get_cs_correlation(fitted_rss_notrem_covariates[[r]], 
                                                                                Xcorr=R)
    }

    # label every fit with the condition name taken from the columns of XtY
    names(fitted_suff) = colnames(dat_suff$XtY)
    names(fitted_rss_rem_covariates) = colnames(dat_suff$XtY)
    names(fitted_rss_notrem_covariates) = colnames(dat_suff$XtY)

    saveRDS(fitted_suff, ${_output["suff"]:r})
    saveRDS(fitted_rss_rem_covariates, ${_output["rss_rem_covariates"]:r})
    saveRDS(fitted_rss_notrem_covariates, ${_output["rss_notrem_covariates"]:r})

In [None]:
[sufficient_stats_analysis_1]
parameter: max_L = 10
input: genes, group_by = 1
output: f'{wd:a}/{_input:bnn}{resid_cor:bnx}.mvsusiesuff.rds'
task: trunk_workers = 1, trunk_size = 1, walltime = '2h', mem = '55G', cores = 1, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    # Positions of the column names of Z within those of U; NULL when either
    # has no column names or when the names already agree.
    get_prior_indices <- function(Z, U) {
      z_names <- colnames(Z)
      u_names <- colnames(U)
      needs_lookup <- !is.null(z_names) && !is.null(u_names) && !identical(z_names, u_names)
      if (needs_lookup) match(z_names, u_names) else NULL
    }

    library(mvsusieR)
    dat = readRDS(${_input:r})
    V = readRDS(${resid_cor:r})
    mixture = readRDS(${prior:r})
    print(paste("Number of components in the mixture prior:", length(mixture$U)))
    prior = mvsusieR::create_mash_prior(
        mixture_prior = list(weights = mixture$w, matrices = mixture$U),
        include_indices = get_prior_indices(dat$XtY, mixture$U[[1]]),
        max_mixture_len = -1)
    # time the multivariate fit on sufficient statistics
    started = proc.time()
    mv_res = mvsusieR::mvsusie_suff_stat(dat$XtX, dat$XtY, dat$YtY, dat$N, L=${max_L},
                                         prior_variance = prior, residual_variance = V,
                                         precompute_covariances = TRUE, compute_objective = TRUE,
                                         estimate_residual_variance = FALSE, estimate_prior_variance = TRUE,
                                         estimate_prior_method = 'EM', max_iter = 1000, n_thread = 1)
    mv_res$time = proc.time() - started
    # abort instead of saving a fit that did not converge
    if (!mv_res$convergence$converged) {
        stop('Fail to converge.')
    }
    mv_res$cs_corr = susieR:::get_cs_correlation(mv_res, Xcorr = cov2cor(dat$XtX))
    saveRDS(mv_res, ${_output:r})

In [None]:
[summary_stats_analysis_1]
parameter: max_L = 10
# Which LD matrix to use: 'original' reads the file recorded as `ld.file` in
# the input, 'remove_cov' uses the `LD` element stored in the input
parameter: ld_type = 'original'
input: genes, group_by = 1
output: f'{wd:a}/{_input:bnn}.LD{ld_type}{resid_cor:bnx}.mvsusierss.rds'
task: trunk_workers = 1, trunk_size = 1, walltime = '2h', mem = '55G', cores = 1, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container
    # Positions of the column names of Z within those of U; NULL when either
    # has no column names or when the names already agree.
    get_prior_indices <- function(Z, U) {
      # make sure the prior col/rows match the colnames of the Y matrix
      z_names = colnames(Z)
      u_names = colnames(U)
      if (is.null(z_names) || is.null(u_names)) {
          return(NULL)
      } else if (identical(z_names, u_names)) {
          return(NULL)
      } else {
          return(match(z_names, u_names))
      }
    }

    library(mvsusieR)
    dat = readRDS(${_input:r})
    V = readRDS(${resid_cor:r})
    prior = readRDS(${prior:r})
    print(paste("Number of components in the mixture prior:", length(prior$U)))
    prior = mvsusieR::create_mash_prior(mixture_prior=list(weights=prior$w, matrices=prior$U), 
                                        include_indices = get_prior_indices(dat$Z, prior$U[[1]]), 
                                        max_mixture_len=-1)
    if("${ld_type}" == 'original'){
        R = readRDS(dat$ld.file)
    }else if("${ld_type}" == 'remove_cov'){
        R = dat$LD
    }else{
        # fail early with a clear message instead of a later "object 'R' not found"
        stop("Unknown ld_type '${ld_type}': expected 'original' or 'remove_cov'")
    }
    st = proc.time()
    mv_res = mvsusieR::mvsusie_rss(dat$Z, R, L=${max_L}, 
                                   prior_variance=prior, residual_variance=V, 
                                   precompute_covariances=TRUE, compute_objective=TRUE, 
                                   estimate_prior_variance=TRUE, estimate_prior_method='EM',
                                   max_iter = 1000, n_thread=1)
    mv_res$time = proc.time() - st
    # abort instead of saving a fit that did not converge
    if(mv_res$convergence$converged == FALSE){
        stop('Fail to converge.')
    }
    mv_res$cs_corr = susieR:::get_cs_correlation(mv_res, Xcorr=R)
    saveRDS(mv_res, ${_output:r})

In [None]:
[*_analysis_2]
output: f"{_input[0]:nn}.manhattan.png", f"{_input[0]:nn}.bubble_finemap.png", f"{_input[0]:nn}.bubble_original.png"
R: expand = '${ }', container = container
    # Draw the three default plots as PDF files; they are converted to PNG afterwards.
    res = readRDS(${_input[0]:r})

    # PIP plot
    pdf('${_output[0]:n}.pdf', width=8, height=4)
    susieR::susie_plot(res,y='PIP', main = 'Cross-condition Posterior Inclusion Probability', xlab = 'SNP positions', add_legend = F)
    dev.off()

    # Write one mvsusie_plot() result to `pdf_file`, sized as the plot object requests
    render_bubble = function(pdf_file, ...) {
        p = mvsusieR::mvsusie_plot(res, ...)
        pdf(pdf_file, width = p$width, height = p$height)
        print(p$plot)
        dev.off()
    }
    render_bubble('${_output[1]:n}.pdf')
    render_bubble('${_output[2]:n}.pdf', plot_z=TRUE)

bash: expand = '${ }'
    # Rasterize each PDF into the PNG declared as step output
    convert -density 150 ${_output[0]:n}.pdf ${_output[0]}
    convert -density 150 ${_output[1]:n}.pdf ${_output[1]}
    convert -density 150 ${_output[2]:n}.pdf ${_output[2]}