# MASH analysis pipeline with posterior computation


## Compute MASH posteriors

In the GTEx V6 paper we assumed one eQTL per gene and applied the model learned above to those SNPs. Under that assumption, the input data for posterior calculation will be the `dat$strong.*` matrices.
It is a fairly straightforward procedure as shown in [this vignette](https://stephenslab.github.io/mashr/articles/eQTL_outline.html).

But it is often more interesting to apply MASH to a given list of eQTLs, e.g., those from fine-mapping results. In the GTEx V8 analysis we obtain such gene-SNP pairs from DAP-G fine-mapping analysis. See [this notebook](https://stephenslab.github.io/gtex-eqtls/analysis/Independent_eQTL_Results.html) for how the input data is prepared. The workflow below takes a number of input chunks (each chunk is a list of matrices `dat$Bhat` and `dat$Shat`)
and computes the posterior for each chunk. It is therefore suited to computing posteriors for all gene-SNP pairs in parallel, provided the input data is supplied in chunks.


```
JOB_OPT="-c midway2.yml -q midway2"
DATA_DIR=/project/compbio/GTEx_eQTL/independent_eQTL
sos run workflows/mashr_flashr_workflow.ipynb posterior \
    $JOB_OPT \
    --posterior-input $DATA_DIR/DAPG_pip_gt_0.01-AllTissues/DAPG_pip_gt_0.01-AllTissues.*.rds \
                      $DATA_DIR/ConditionalAnalysis_AllTissues/ConditionalAnalysis_AllTissues.*.rds
```

In [11]:
# Apply posterior calculations
# Computes MASH posterior matrices for one input RDS file per substep.
# The section header below is commented out.
#[posterior_1]
# Text file listing the analysis units; blank lines and lines starting with '#' are skipped,
# double quotes are removed and each remaining line is split on whitespace.
parameter: analysis_units = path
regions = [x.replace("\"","").strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
# Fitted MASH model to apply (converted to an absolute path string below).
parameter: mash_model = path(f"{cwd:a}/RDS/{output_prefix}.{effect_model}.V_{vhat}.mash_model.rds")
# By default the first column of every line of analysis_units is taken as an input RDS path.
parameter: posterior_input = [path(x[0]) for x in regions]
# Optional residual-covariance files; when empty, `vhat_data` is used instead.
parameter: posterior_vhat_files = paths()
# eg, if data is saved in R list as data$strong, then
# when you specify `--data-table-name strong` it will read the data as
# readRDS('{_input:r}')$strong
parameter: data_table_name = ''
# Names of the effect-size and standard-error elements passed to mash_set_data.
# NOTE(review): handle_nan_etc and the condition exclusion below always use `bhat`/`sbhat`,
# regardless of these two parameters.
parameter: bhat_table_name = 'bhat'
parameter: shat_table_name = 'sbhat'
mash_model = f"{mash_model:a}"
## Column indices of conditions to exclude. Exclusion only happens when the first value is > 0,
## so pass 0 to keep all conditions. NOTE(review): the default here is ["1","3"], not 0.
parameter: exclude_condition = ["1","3"]

skip_if(len(posterior_input) == 0, msg = "No posterior input data to compute on. Please specify it using --posterior-input.")
fail_if(len(posterior_vhat_files) > 1 and len(posterior_vhat_files) != len(posterior_input), msg = "length of --posterior-input and --posterior-vhat-files do not agree.")
for p in posterior_input:
    fail_if(not p.is_file(), msg = f'Cannot find posterior input file ``{p}``')

depends: mash_model
# One input file per substep, one posterior RDS per input file.
input: posterior_input, group_by = 1
output: f"{cwd}/{_input:bn}.posterior.rds"
task: trunk_workers = 1, walltime = '20h', trunk_size = 1, mem = '20G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }", workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout", container = container
    library(mashr)
    # Replace NaN effects by 0 and NaN/Inf standard errors by 1E3.
    handle_nan_etc = function(x) {
      x$bhat[which(is.nan(x$bhat))] = 0
      x$sbhat[which(is.nan(x$sbhat) | is.infinite(x$sbhat))] = 1E3
      return(x)
    }
    data = readRDS("${_input}")${('$' + data_table_name) if data_table_name else ''}
    # Drop the excluded condition columns from bhat, sbhat and Z when the first index is positive.
    if(c(${",".join(exclude_condition)})[1] > 0 ){
      message(paste("Excluding condition ${exclude_condition} from the analysis"))
      data$bhat = data$bhat[,-c(${",".join(exclude_condition)})]
      data$sbhat = data$sbhat[,-c(${",".join(exclude_condition)})]
      data$Z = data$Z[,-c(${",".join(exclude_condition)})]
    }
    data <- handle_nan_etc(data)
    # Residual covariance: the shared vhat_data, or the file at this substep's index.
    vhat = readRDS("${vhat_data if len(posterior_vhat_files) == 0 else posterior_vhat_files[_index]}")
    # alpha is 1 for the 'EZ' effect model and 0 otherwise.
    mash_data = mash_set_data(data$${bhat_table_name}, Shat=data$${shat_table_name}, alpha=${1 if effect_model == 'EZ' else 0}, V=vhat, zero_Bhat_Shat_reset = 1E3)
    mash_output = mash_compute_posterior_matrices(readRDS("${mash_model}"), mash_data,output_posterior_cov=TRUE)
    # Keep the SNP identifiers next to the posterior matrices.
    mash_output$snps = data$snps
    saveRDS(mash_output, ${_output:r})

In [None]:
#[posterior_2]
# Write the absolute paths of all posterior RDS files into one list file, one path per line.
input: group_by = "all"
output:f"{cwd}/mash_output_list"
python: expand = "$[ ]", workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout", container = container
    import pandas as pd
    # Single column, written without index or header; the separator is a tab.
    # (`library(mashr)` was R code in a Python script and has been removed.)
    pd.DataFrame({"#mash_result" :  [$[_input:ar,]] }).to_csv("$[_output]",index = False ,header = False, sep = "\t")

In [11]:
# Apply posterior calculations, output_posterior_cov = T
[posterior_1]
# Compute MASH posteriors for chunks of input RDS files. Each substep processes `per_chunk`
# files, writes one `<name>.posterior.rds` per file into the cache directory and records the
# paths of the files it produced in its own list file (the step output).
#
# Text file listing the analysis units; blank lines and '#' lines are skipped, double quotes
# are removed and every remaining line is split on whitespace.
parameter: analysis_units = path
regions = [x.replace("\"","").strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
parameter: mash_model = path(f"{cwd:a}/RDS/{output_prefix}.{effect_model}.V_{vhat}.mash_model.rds")
# By default the first column of every line of analysis_units is taken as an input RDS path.
parameter: posterior_input = [path(x[0]) for x in regions]
# Optional residual-covariance files; when empty, `vhat_data` is used instead.
# NOTE(review): with chunked inputs the file is selected by chunk number (_index), not per
# input file -- confirm this matches how multiple vhat files are meant to be paired.
parameter: posterior_vhat_files = paths()
# eg, if data is saved in R list as data$strong, then
# when you specify `--data-table-name strong` it will read the data as
# readRDS('{_input:r}')$strong
# Number of input files handled by one substep (used as group_by).
parameter: per_chunk = '100'
parameter: data_table_name = ''
# NOTE(review): NaN handling and condition exclusion always act on `bhat`/`sbhat`,
# regardless of these two names.
parameter: bhat_table_name = 'bhat'
parameter: shat_table_name = 'sbhat'
mash_model = f"{mash_model:a}"
## Column indices of conditions to exclude. Exclusion only happens when the first value is > 0,
## so pass 0 to keep all conditions. NOTE(review): the default here is ["1","3"], not 0.
parameter: exclude_condition = ["1","3"]

skip_if(len(posterior_input) == 0, msg = "No posterior input data to compute on. Please specify it using --posterior-input.")
fail_if(len(posterior_vhat_files) > 1 and len(posterior_vhat_files) != len(posterior_input), msg = "length of --posterior-input and --posterior-vhat-files do not agree.")
for p in posterior_input:
    fail_if(not p.is_file(), msg = f'Cannot find posterior input file ``{p}``')

depends: mash_model
input: posterior_input, group_by = per_chunk
output: f"{cwd}/cache/mash_output_list_{output_suffix}.{_index+1}"
task: trunk_workers = 1, walltime = '20h', trunk_size = 1, mem = '20G', cores = 1, tags = f'{_output:bn}'
# Log files use the full output name: stripping the extension would drop the ".<index>" suffix
# and make every chunk write to the same stderr/stdout file.
R: expand = "${ }", workdir = cwd, stderr = f"{_output}.stderr", stdout = f"{_output}.stdout", container = container
    library(mashr)
    # Replace NaN effects by 0 and NaN/Inf standard errors by 1E3.
    handle_nan_etc = function(x) {
      x$bhat[which(is.nan(x$bhat))] = 0
      x$sbhat[which(is.nan(x$sbhat) | is.infinite(x$sbhat))] = 1E3
      return(x)
    }
    excluded <- c(${",".join(exclude_condition)})
    # The residual covariance and the fitted model are identical for every file of the chunk,
    # so they are loaded once instead of once per file.
    vhat <- readRDS("${vhat_data if len(posterior_vhat_files) == 0 else posterior_vhat_files[_index]}")
    mash_model <- readRDS("${mash_model}")
    out_dir <- "${_output:d}"
    written <- character(0)
    # Best effort per file: a failing file is reported (with its name) and skipped.
    for (f in c(${_input:r,})) tryCatch({
      data <- readRDS(f)${('$' + data_table_name) if data_table_name else ''}
      if (excluded[1] > 0) {
        message(paste("Excluding condition ${exclude_condition} from the analysis"))
        # drop = FALSE keeps one-row inputs as matrices.
        data$bhat <- data$bhat[, -excluded, drop = FALSE]
        data$sbhat <- data$sbhat[, -excluded, drop = FALSE]
        if (!is.null(data$Z)) data$Z <- data$Z[, -excluded, drop = FALSE]
      }
      data <- handle_nan_etc(data)
      mash_data = mash_set_data(data$${bhat_table_name}, Shat=data$${shat_table_name}, alpha=${1 if effect_model == 'EZ' else 0}, V=vhat, zero_Bhat_Shat_reset = 1E3)
      mash_output <- mash_compute_posterior_matrices(mash_model, mash_data, output_posterior_cov = TRUE)
      mash_output$snps <- data$snps
      # File name without directory and without the trailing ".rds" only.
      sample_name <- sub("[.]rds$", "", basename(f))
      out_file <- paste0(out_dir, "/", sample_name, ".posterior.rds")
      saveRDS(mash_output, out_file)
      written <- c(written, out_file)
    }, error = function(e) {
      message("Posterior computation failed for ", f, ": ", conditionMessage(e))
    })
    # One produced path per line, unquoted.
    writeLines(written, ${_output:r})

In [None]:
[posterior_2]
# Merge the per-chunk path lists into one two-column table: the name extracted from each
# posterior file path (text between 'cis_long_table.' and '.posterior.rds'), then the path.

input: group_by = "all"
output:f"{cwd}/mash_output_list_{output_suffix}"
bash: expand ='${ }', workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
     # Concatenate exactly this step's inputs (absolute paths) instead of globbing the
     # directory, and overwrite rather than append, so stale files cannot leak into the list.
     list_tmp=${_output:ar}.tmp
     cat ${_input:ar} > "$list_tmp"
     awk -F 'cis_long_table.' '{print $2}' "$list_tmp" | awk -F '.posterior.rds' '{print $1}' | paste - "$list_tmp" > ${_output:ar}
     rm "$list_tmp"



### Posterior results

1. The `[posterior]` step should produce a number of serialized R objects `*.batch_*.posterior.rds` (can be loaded to R via `readRDS()`) -- I chopped data to batches to take advantage of computing in multiple cluster nodes. It should be self-explanatory but please let me know otherwise.
2. Other posterior related files are:
    1. `*.batch_*.yaml`: gene-SNP pairs of interest, identified elsewhere (eg. fine-mapping analysis). 
    2. The corresponding univariate analysis summary statistics for gene-SNPs from `*.batch_*.yaml` are extracted and saved to `*.batch_*.rds`, creating input to the `[posterior]` step.
    3. Note the `*.batch_*.stdout` file documents some SNPs found in fine-mapping results but not found in the original `fastqtl` output.

## Slice Posterior 

Take all 13K genes; for genes with missing conditions, we simply drop the corresponding rows and columns in the prior model.

In [None]:
# Apply posterior calculations
# Chunked posterior computation that first removes conditions/SNPs with missing effects
# and slices the residual covariance and the prior to the remaining conditions.
# The section header below is commented out. This variant does not pre-process NaN/Inf
# values and calls mash_compute_posterior_matrices without output_posterior_cov.
#[sliceposterior_1]
# Text file listing the analysis units; blank lines and '#' lines are skipped, double quotes
# are removed and every remaining line is split on whitespace.
parameter: analysis_units = path
regions = [x.replace("\"","").strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
parameter: mash_model = path(f"{cwd:a}/{output_prefix}.{effect_model}.V_{vhat}.mash_model.rds")
# By default the first column of every line of analysis_units is taken as an input RDS path.
parameter: posterior_input = [path(x[0]) for x in regions]
# Optional residual-covariance files; when empty, `vhat_data` is used instead.
parameter: posterior_vhat_files = paths()
# eg, if data is saved in R list as data$strong, then
# when you specify `--data-table-name strong` it will read the data as
# readRDS('{_input:r}')$strong
parameter: data_table_name = ''
parameter: bhat_table_name = 'bhat'
parameter: shat_table_name = 'sbhat'
# Number of input files handled by one substep (used as group_by).
parameter: per_chunk = '1000'
mash_model = f"{mash_model:a}"
## Column indices of conditions to exclude. Exclusion only happens when the first value is > 0,
## so pass 0 to keep all conditions. NOTE(review): the default here is ["1","3"], not 0.
parameter: exclude_condition = ["1","3"]

skip_if(len(posterior_input) == 0, msg = "No posterior input data to compute on. Please specify it using --posterior-input.")
fail_if(len(posterior_vhat_files) > 1 and len(posterior_vhat_files) != len(posterior_input), msg = "length of --posterior-input and --posterior-vhat-files do not agree.")
for p in posterior_input:
    fail_if(not p.is_file(), msg = f'Cannot find posterior input file ``{p}``')

depends: mash_model
input: posterior_input, group_by = per_chunk
output: f"{cwd}/cache/mash_output_list_{_index+1}"
task: trunk_workers = 1, walltime = '20h', trunk_size = 1, mem = '20G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }", workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    library(mashr)
    library(dplyr)
    library(stringr)
    # Accumulates the paths of the posterior files written by this chunk.
    outlist = data.frame()
    # Each file is processed inside try(), so an error in one file does not stop the chunk.
    for (f in c(${_input:r,})) try({
    data = readRDS(f)${('$' + data_table_name) if data_table_name else ''}
    #data = readRDS("${_input}")${('$' + data_table_name) if data_table_name else ''}

    # Drop the excluded condition columns when the first index is positive.
    if(c(${",".join(exclude_condition)})[1] > 0 ){
      message(paste("Excluding condition ${exclude_condition} from the analysis"))
      data$bhat = data$bhat[,-c(${",".join(exclude_condition)})]
      data$sbhat = data$sbhat[,-c(${",".join(exclude_condition)})]
      data$Z = data$Z[,-c(${",".join(exclude_condition)})]
    }
    
    # Conditions (columns) and SNPs (rows) before any slicing.
    all.samples<-colnames(data$bhat)
    all.snps<-rownames(data$bhat)
    
    # The covariance and model are re-read for every file because they are sliced below.
    vhat = readRDS("${vhat_data if len(posterior_vhat_files) == 0 else posterior_vhat_files[_index]}")
    mash_model <- readRDS("${mash_model}")
    
    #remove the rows and cols containing NA
    # Columns that are entirely NA are removed first, then rows with any remaining NA.
    na.test<-data$bhat %>% as.data.frame()%>% select_if(~any(!is.na(.)))%>% na.omit%>%as.matrix
    #recording meaningful rows and cols
    samples<-colnames(na.test)
    snps<-rownames(na.test)
    
    if(length(all.snps)!=length(snps) | length(all.samples)!=length(samples)){
        #slice the data
        data$bhat<-data$bhat[snps,samples]%>%as.matrix
        colnames(data$bhat)<-samples
        data$sbhat<-data$sbhat[snps,samples]%>%as.matrix
        colnames(data$sbhat)<-samples
        data$Z<-data$Z[snps,samples]%>%as.matrix
        colnames(data$Z)<-samples
        # NOTE(review): this assigns to `data$snp`, while the output below stores `data$snps`;
        # confirm which element is intended.
        data$snp<-data$snp[data$snp%in%snps]
        vhat<-vhat[samples,samples]%>%as.matrix
        colnames(vhat)<-samples
  
        if(length(all.samples)!=length(samples)){
            ##slice the prior
            cov<-mash_model$fitted_g$Ulist
            for (d in names(cov)) {
                # Remove prior matrices named after a dropped condition, or "ED_" + that name.
                if(d %in% setdiff(all.samples,samples)){
                    cov[[d]]<-NULL
                }
                if(d %in% paste0("ED_",setdiff(all.samples,samples))){
                    cov[[d]]<-NULL
                }
                # Matrices named after a kept condition are rebuilt with a single 1 on that
                # condition's diagonal position.
                if(d %in% samples){
                    cov[[d]]<-matrix(0,length(samples),length(samples))
                    cov[[d]][which(samples==d),which(samples==d)]<-1
                }else if(d == "identity"){
                    # NOTE(review): only element [1,1] is set to 1 here; confirm this is intended
                    # rather than a full identity matrix.
                    cov[[d]]<-matrix(0,length(samples),length(samples))
                    cov[[d]][1,1]<-1  
                }else if(is.null(colnames(cov[[d]]))){
                    # Matrices without column names are cut to their first length(samples) rows/cols.
                    # NOTE(review): this is positional, independent of which conditions were dropped.
                    cov[[d]] <- cov[[d]][1:length(samples),1:length(samples)]
                } else {
                    cov[[d]] <- cov[[d]][samples,samples]
                }
                    }
                for (d in names(cov)) {
                    cov[[d]] <- cov[[d]]%>%as.matrix
                }
                #slide the prior and related file
                mash_model$fitted_g$Ulist<-cov
                diff.sam<-setdiff(all.samples,samples)
                # Remove mixture weights whose name matches a dropped condition.
                # NOTE(review): if grep() matches nothing, indexing with -integer(0) returns an
                # empty vector, i.e. all weights would be removed.
                for(s in diff.sam){mash_model$fitted_g$pi<-mash_model$fitted_g$pi[-grep(s,names(mash_model$fitted_g$pi))]}
            }
        }
    
  
  
    
    # alpha is 1 for the 'EZ' effect model and 0 otherwise.
    mash_data = mash_set_data(data$${bhat_table_name}, Shat=data$${shat_table_name}, alpha=${1 if effect_model == 'EZ' else 0}, V=vhat, zero_Bhat_Shat_reset = 1E3)
    mash_output = mash_compute_posterior_matrices(mash_model, mash_data)
    mash_output$snps = data$snps
    #saveRDS(mash_output, ${_output:r})
    # Last path component of the input file with '.rds' removed (pattern is a regex).
    samplename<-str_split(f,"/",simplify = T)%>%.[length(.)]%>%gsub('.rds','',.)
    saveRDS(mash_output, paste0("${_output:d}","/",samplename,".posterior.rds"))
    outlist<-rbind(outlist,paste0("${_output:d}","/",samplename,".posterior.rds"))
    })
    # Written with write.table's default quoting (no quote argument is given).
    write.table(outlist,${_output:r},col.names=F, row.names=F)

In [None]:
# Apply posterior calculations with slice NA and set NaN/Inf 0/1E3, output_posterior_cov = T 
[sliceposterior_1]
# Chunked posterior computation for inputs with missing conditions. For every input file:
# NaN/Inf values are replaced, conditions that are entirely NA and SNPs with any remaining NA
# are removed, and the residual covariance and the prior are sliced to the kept conditions
# before the posterior is computed. Each substep writes one `<name>.posterior.rds` per file
# and a list of the files it produced (the step output).
#
# Text file listing the analysis units; blank lines and '#' lines are skipped, double quotes
# are removed and every remaining line is split on whitespace.
parameter: analysis_units = path
regions = [x.replace("\"","").strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
parameter: mash_model = path(f"{cwd:a}/{output_prefix}.{effect_model}.V_{vhat}.mash_model.rds")
# By default the first column of every line of analysis_units is taken as an input RDS path.
parameter: posterior_input = [path(x[0]) for x in regions]
# Optional residual-covariance files; when empty, `vhat_data` is used instead.
# NOTE(review): with chunked inputs the file is selected by chunk number (_index), not per
# input file -- confirm this matches how multiple vhat files are meant to be paired.
parameter: posterior_vhat_files = paths()
# eg, if data is saved in R list as data$strong, then
# when you specify `--data-table-name strong` it will read the data as
# readRDS('{_input:r}')$strong
parameter: data_table_name = ''
# NOTE(review): NaN handling, exclusion and slicing always act on `bhat`/`sbhat`,
# regardless of these two names.
parameter: bhat_table_name = 'bhat'
parameter: shat_table_name = 'sbhat'
# Number of input files handled by one substep (used as group_by).
parameter: per_chunk = '100'
mash_model = f"{mash_model:a}"
## Column indices of conditions to exclude. Exclusion only happens when the first value is > 0,
## so pass 0 to keep all conditions. NOTE(review): the default here is ["1","3"], not 0.
parameter: exclude_condition = ["1","3"]
parameter: output_suffix = "all"

skip_if(len(posterior_input) == 0, msg = "No posterior input data to compute on. Please specify it using --posterior-input.")
fail_if(len(posterior_vhat_files) > 1 and len(posterior_vhat_files) != len(posterior_input), msg = "length of --posterior-input and --posterior-vhat-files do not agree.")
for p in posterior_input:
    fail_if(not p.is_file(), msg = f'Cannot find posterior input file ``{p}``')

depends: mash_model
input: posterior_input, group_by = per_chunk
output: f"{cwd}/cache/mash_output_list_{_index+1}"
task: trunk_workers = 1, walltime = '20h', trunk_size = 1, mem = '20G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }", workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    library(mashr)
    # Replace NaN effects by 0 and NaN/Inf standard errors by 1E3. Plain NA entries are not
    # NaN, so they survive and drive the slicing below.
    handle_nan_etc = function(x) {
      x$bhat[which(is.nan(x$bhat))] = 0
      x$sbhat[which(is.nan(x$sbhat) | is.infinite(x$sbhat))] = 1E3
      return(x)
    }

    # Restrict one prior covariance matrix `u` (named `name`) to the kept conditions.
    #   samples  - names of the kept conditions
    #   keep_idx - positions of the kept conditions among all conditions
    #   n_all    - number of conditions before slicing
    slice_cov <- function(u, name, samples, keep_idx, n_all) {
      n <- length(samples)
      if (name %in% samples) {
        # Matrix named after a kept condition: single 1 at that condition's diagonal position.
        m <- matrix(0, n, n)
        pos <- which(samples == name)
        m[pos, pos] <- 1
        return(m)
      }
      if (name == "identity") {
        # Identity over the kept conditions (previously only element [1,1] was set).
        return(diag(n))
      }
      if (is.null(colnames(u))) {
        # No dimnames: select the kept conditions by position when the matrix covers all
        # conditions; otherwise fall back to the leading n x n block.
        if (nrow(u) == n_all) {
          return(u[keep_idx, keep_idx, drop = FALSE])
        }
        return(u[seq_len(n), seq_len(n), drop = FALSE])
      }
      u[samples, samples, drop = FALSE]
    }

    excluded <- c(${",".join(exclude_condition)})
    # Read once per chunk; every file works on its own copy because slicing modifies them.
    vhat_full <- readRDS("${vhat_data if len(posterior_vhat_files) == 0 else posterior_vhat_files[_index]}")
    model_full <- readRDS("${mash_model}")
    out_dir <- "${_output:d}"
    written <- character(0)
    # Best effort per file: a failing file is reported (with its name) and skipped.
    for (f in c(${_input:r,})) tryCatch({
      data <- readRDS(f)${('$' + data_table_name) if data_table_name else ''}
      data <- handle_nan_etc(data)

      if (excluded[1] > 0) {
        message(paste("Excluding condition ${exclude_condition} from the analysis"))
        # drop = FALSE keeps one-row inputs as matrices.
        data$bhat <- data$bhat[, -excluded, drop = FALSE]
        data$sbhat <- data$sbhat[, -excluded, drop = FALSE]
        if (!is.null(data$Z)) data$Z <- data$Z[, -excluded, drop = FALSE]
      }

      vhat <- vhat_full
      mash_model <- model_full

      # Keep conditions with at least one observed effect, then SNPs that are complete
      # across the kept conditions.
      all.samples <- colnames(data$bhat)
      keep_cols <- colSums(!is.na(data$bhat)) > 0
      keep_rows <- rowSums(is.na(data$bhat[, keep_cols, drop = FALSE])) == 0
      samples <- all.samples[keep_cols]
      snps <- rownames(data$bhat)[keep_rows]

      if (!all(keep_rows) || !all(keep_cols)) {
        # Slice the data; drop = FALSE keeps the matrix shape for single rows/columns.
        data$bhat <- data$bhat[keep_rows, keep_cols, drop = FALSE]
        data$sbhat <- data$sbhat[keep_rows, keep_cols, drop = FALSE]
        if (!is.null(data$Z)) data$Z <- data$Z[keep_rows, keep_cols, drop = FALSE]
        # Filter the element that is stored in the output below (`snps`, not `snp`).
        data$snps <- data$snps[data$snps %in% snps]
        vhat <- vhat[samples, samples, drop = FALSE]

        if (!all(keep_cols)) {
          # Slice the prior to the kept conditions.
          dropped <- all.samples[!keep_cols]
          keep_idx <- which(keep_cols)
          ulist <- mash_model$fitted_g$Ulist
          # Remove prior matrices named after a dropped condition, or "ED_" + that name.
          gone <- names(ulist) %in% c(dropped, paste0("ED_", dropped))
          if (any(gone)) ulist <- ulist[!gone]
          for (d in names(ulist)) {
            ulist[[d]] <- as.matrix(slice_cov(ulist[[d]], d, samples, keep_idx, length(all.samples)))
          }
          mash_model$fitted_g$Ulist <- ulist
          # Remove mixture weights whose name contains a dropped condition. Logical indexing
          # is used because pi[-grep(...)] returns an empty vector when nothing matches.
          for (s in dropped) {
            pi_hat <- mash_model$fitted_g$pi
            hit <- grepl(s, names(pi_hat), fixed = TRUE)
            if (any(hit)) mash_model$fitted_g$pi <- pi_hat[!hit]
          }
        }
      }

      # alpha is 1 for the 'EZ' effect model and 0 otherwise.
      mash_data = mash_set_data(data$${bhat_table_name}, Shat=data$${shat_table_name}, alpha=${1 if effect_model == 'EZ' else 0}, V=vhat, zero_Bhat_Shat_reset = 1E3)
      mash_output <- mash_compute_posterior_matrices(mash_model, mash_data, output_posterior_cov = TRUE)
      mash_output$snps <- data$snps
      # File name without directory and without the trailing ".rds" only.
      sample_name <- sub("[.]rds$", "", basename(f))
      out_file <- paste0(out_dir, "/", sample_name, ".posterior.rds")
      saveRDS(mash_output, out_file)
      written <- c(written, out_file)
    }, error = function(e) {
      message("Posterior computation failed for ", f, ": ", conditionMessage(e))
    })
    # One produced path per line, unquoted.
    writeLines(written, ${_output:r})

In [None]:
[sliceposterior_2]
# Merge the per-chunk path lists into one two-column table: the name extracted from each
# posterior file path (text between 'cis_long_table.' and '.posterior.rds'), then the path.
input: group_by = "all"
output:f"{cwd}/mash_output_list_{output_suffix}"
bash: expand ='${ }', workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
     # Concatenate exactly this step's inputs (absolute paths) instead of globbing the
     # directory, and overwrite rather than append, so stale files cannot leak into the list.
     list_tmp=${_output:ar}.tmp
     cat ${_input:ar} > "$list_tmp"
     awk -F 'cis_long_table.' '{print $2}' "$list_tmp" | awk -F '.posterior.rds' '{print $1}' | paste - "$list_tmp" > ${_output:ar}
     rm "$list_tmp"


In [None]:
#[sliceposterior_2]

# For every path list, stack the posterior means and lfsr of all listed posterior files
# into two data frames (`pm`, `lfsr`) and save them as one RDS.
input: group_by = 1
output:f"{cwd}/cache/{cwd:b}_{_index+1}.posterior.rds"
R: expand ='${ }', workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    library(tidyverse)
    # Append `block` to the running table `acc`. rbind.fill pads columns missing on either
    # side; row names are rebuilt as <gene>_<second ':'-separated field of the original row
    # name> and made syntactically valid and unique.
    stack_rows <- function(acc, block, gene) {
      block <- as.data.frame(block)
      row_ids <- paste(gene, str_split(rownames(block), ":", simplify = TRUE)[, 2], sep = "_")
      labels <- c(rownames(acc), row_ids)
      acc <- plyr::rbind.fill(acc, block)
      rownames(acc) <- make.names(labels, unique = TRUE)
      acc
    }

    ps <- list(pm = NULL, lfsr = NULL)
    # First column of the list file holds the posterior RDS paths.
    samples <- read.table(c(${_input:r,}))$V1
    for (f in samples) {
      tmp <- readRDS(f)
      # Gene name: text after "norminal.cis_long_table." with ".posterior.rds" removed.
      path_parts <- stringr::str_split(string = f, pattern = "norminal.cis_long_table.", simplify = TRUE)
      gene <- gsub(".posterior.rds", "", path_parts[[2]])
      ps$pm <- stack_rows(ps$pm, tmp$PosteriorMean, gene)
      ps$lfsr <- stack_rows(ps$lfsr, tmp$lfsr, gene)
    }

    saveRDS(ps, ${_output:r})

In [None]:
#[sliceposterior_3]
# Combine all per-batch RDS lists into one list by row-binding elements of the same name.
input: group_by = "all"
output: f"{cwd}/{cwd:b}.posterior.rds"
task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '100G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }", container = container,stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', volumes = [f'{cwd:ad}:{cwd:ad}']
    # Fold one RDS file into the accumulated list: while the accumulator is empty the file's
    # content is taken as is; afterwards each named element is appended with rbind.
    absorb_file <- function(acc, rds_path) {
      piece <- readRDS(rds_path)
      if (length(acc) == 0) {
        return(piece)
      }
      for (key in names(piece)) {
        acc[[key]] <- rbind(acc[[key]], piece[[key]])
      }
      acc
    }
    dat <- Reduce(absorb_file, c(${_input:r,}), list())

    saveRDS(dat, ${_output:r})