# Multivariate fine-mapping with missing data examples

This notebook applies mvSuSiE on some GTEx genes using two approaches to handle missing data.

## Input

We focus on the 27 genes identified in the MASH paper that seem to have different directions of effect in brain vs non-brain tissues. It is interesting to examine these examples with multivariate fine-mapping.

## Analysis

### Compute prior

```
sos run analysis/gtex-v8/20200906_mvSuSiE_GTEx.ipynb factor_analysis
sos run analysis/gtex-v8/20200906_mvSuSiE_GTEx.ipynb mixture_model
```

### Analyze the data

```
sos run analysis/gtex-v8/20200906_mvSuSiE_GTEx.ipynb submit \
    --data-dir /project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready \
    -c midway2.yml -q midway2
```

## Results

### Mixture analysis results

In [None]:
%preview -s png /project2/compbio/GTEx_eQTL/mvSuSiE_prior/FastQTLSumstats.mvSuSiE_prior.ed_bovy.pdf

In [None]:
%preview -s png /project2/compbio/GTEx_eQTL/mvSuSiE_prior/FastQTLSumstats.mvSuSiE_prior.ed.pdf -s png

In [None]:
%preview -s png /project2/compbio/GTEx_eQTL/mvSuSiE_prior/FastQTLSumstats.mvSuSiE_prior.teem.pdf -s png

In [None]:
[global]
# Directory that holds one analysis-ready file per gene
parameter: data_dir = path('/project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready')
# Directory that receives the workflow outputs
parameter: wd = path('/project2/compbio/GTEx_eQTL/mvSuSiE_output')
# Base name shared by the prior-related output files
parameter: name = "GTEx_V8_strong_z"
# Text file listing one analysis unit (gene) per line; blank lines and lines starting with '#' are skipped
parameter: analysis_units = path('data/27_brain_non_brain_genes_v8.txt')
# File name suffix appended to each gene name to locate its data file
parameter: suffix = 'GTEx_V8.rds'
# Read the gene list inside a context manager so the file handle is closed right away.
with open(analysis_units) as units_file:
    regions = [x for x in (line.strip() for line in units_file) if x and not x.startswith('#')]
# Build each candidate data file path once and keep only those that exist on disk.
genes = [f for f in (f"{data_dir:a}/{x}.{suffix}" for x in regions) if path(f).exists()]

In [None]:
[factor_analysis_1]
# Prepare the inputs of the factor analyses from the summary statistics file:
# the correlation of null z-scores, the strong-signal z-scores, and their scaled cross-product.
parameter: sumstat = path("/project2/compbio/GTEx_eQTL/mashr_flashr_workflow_output/FastQTLSumStats.mash.rds")
input: sumstat
output: f"{wd:a}/{name}.rds"
R: expand = True
    dat = readRDS({_input:r})
    # Read the fields explicitly rather than attach()-ing the list: attach leaves it on the
    # search path and lets same-named global variables silently mask its elements.
    required = c("random.b", "random.s", "random.test.b", "random.test.s", "strong.b", "strong.s")
    absent = setdiff(required, names(dat))
    if (length(absent) > 0) stop("Input is missing required fields: ", paste(absent, collapse = ", "))
    # z-scores of the two random subsets, stacked by row
    random.tmp = rbind(dat[["random.b"]] / dat[["random.s"]], dat[["random.test.b"]] / dat[["random.test.s"]])
    # Rows whose largest absolute z-score is below 2 are treated as null
    null.id = which(apply(abs(random.tmp), 1, max) < 2)
    null.z = random.tmp[null.id, , drop = FALSE]
    null.cor = cor(null.z)
    z = dat[["strong.b"]] / dat[["strong.s"]]
    XtX = t(as.matrix(z)) %*% as.matrix(z) / nrow(z)
    res = list(null.cor = null.cor, strong.z = z, XtX = XtX)
    saveRDS(res, {_output:r})

In [None]:
[factor_analysis_2]
# Submit one batch job that runs the flash, flash_nonneg, pca and canonical workflows
# of mixture_prior.ipynb one after another, writing their output to a single log file.
# NOTE(review): the directives below use #SBATCH syntax while the interpreter is 'qsub' --
# confirm that this matches the scheduler available on the cluster.
script: interpreter= 'qsub', expand = True
    #!/bin/bash

    #SBATCH --time=36:00:00
    #SBATCH --partition=broadwl
    #SBATCH --nodes=1
    #SBATCH --ntasks-per-node=1
    #SBATCH --cpus-per-task=1
    #SBATCH --mem-per-cpu=2000
    #SBATCH --job-name={step_name}
    #SBATCH --mail-type=BEGIN,END,FAIL

    module load R
    # The first command truncates the log (&>); the following ones append to it (&>>).
    sos run ~/GIT/bioworkflows/multivariate-fine-mapping/mixture_prior.ipynb flash --name {name} --cwd {wd} -s build &> {wd}/{name}_factor_analysis.log
    sos run ~/GIT/bioworkflows/multivariate-fine-mapping/mixture_prior.ipynb flash_nonneg --name {name} --cwd {wd} -s build &>> {wd}/{name}_factor_analysis.log
    sos run ~/GIT/bioworkflows/multivariate-fine-mapping/mixture_prior.ipynb pca --name {name} --cwd {wd} -s build &>> {wd}/{name}_factor_analysis.log
    sos run ~/GIT/bioworkflows/multivariate-fine-mapping/mixture_prior.ipynb canonical --name {name} --cwd {wd} -s build &>> {wd}/{name}_factor_analysis.log

In [None]:
[mixture_model]
def get_cmd(m):
    """Return the three shell snippets (ud, ud with teem, ed) for model name `m`.

    Each snippet runs one fitting workflow of mixture_prior.ipynb, logging to a
    method-specific file under `wd`, and then runs `plot_U` on the resulting model file.
    """
    notebook = '~/GIT/bioworkflows/multivariate-fine-mapping/mixture_prior.ipynb'
    cluster_opts = '-c ~/GIT/mvarbvs/midway2.yml -q midway2 -s build'

    def render(fit_args, log_tag, model_tag):
        # The snippet text keeps its own fixed indentation, independent of this code's nesting.
        return f'''
    sos run {notebook} {fit_args} --name {m} --cwd {wd} \
        {cluster_opts} &> {wd}/{log_tag}_{m}.log
    sos run {notebook} plot_U --name "" --cwd {wd} --model-data {wd}/{m}.{model_tag}.rds --remove-label \
        {cluster_opts}
    '''

    # (workflow arguments, log file prefix, model file tag)
    variants = [
        ('ud', 'ed', 'ed'),
        ('ud --ud-method teem', 'teem', 'teem'),
        ('ed', 'bovy', 'ed_bovy'),
    ]
    return [render(*variant) for variant in variants]
# Build the list of command snippets for the configured model name; for_each below
# then runs the script once per snippet.
cmds = get_cmd(name)
input: for_each = 'cmds'
# The current snippet is expanded into the last line of the script.
# NOTE(review): the directives use #SBATCH syntax while the interpreter is 'qsub' --
# confirm that this matches the scheduler available on the cluster.
script: interpreter= 'qsub', expand = True
#!/bin/bash
  
#SBATCH --time=36:00:00
#SBATCH --partition=broadwl
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=2000
#SBATCH --job-name={step_name}
#SBATCH --mail-type=BEGIN,END,FAIL

module load R
{_cmds}

In [None]:
[default_1]
# Only analyze `cis` variants -- cis = N selects a window of columns around the center of the X matrix;
# the value is pasted into the R script as-is, and the default 'NULL' keeps every column.
parameter: cis = 'NULL'
# Pasted into the R script as the `approximate` argument of mvsusieR::mvsusie
parameter: approximate = 'F'
# RDS file whose `w` and `U` elements are used as the mixture weights and matrices of the prior
parameter: prior = path('/project2/compbio/GTEx_eQTL/mvSuSiE_prior/FastQTLSumstats.mvSuSiE_prior.ed_bovy.rds')
# One substep per gene data file
input: genes, group_by = 1
# The output name carries a "_<cis>" tag only when cis is not 'NULL'
output: f'{wd:a}/{_input:bn}_mvSuSiE_cis{("_%s" % cis) if cis != "NULL" else ""}.rds'
#task: trunk_workers = [4] * 10, trunk_size = 1, walltime = '12h', mem = '8G', cores = 1, tags = f'{step_name}_{_output:bn}'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h', mem = '12G', cores = 2, tags = f'{step_name}_{_output:bn}'
# stdout and stderr of the R script are written next to the output file
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr"
    ###Functions to compute MAF and missing genotype rate
    compute_maf <- function(geno){
      # Half the mean of the non-missing values, folded so the result is the smaller of f and 1 - f.
      # NOTE(review): presumably geno holds 0/1/2 allele counts -- confirm against the input data.
      allele_freq <- mean(geno, na.rm = TRUE) / 2
      min(allele_freq, 1 - allele_freq)
    }

    compute_missing <- function(geno){
      # Fraction of entries in geno that are NA
      mean(is.na(geno))
    }
    
    mean_impute <- function(geno){
      # Replace every NA in a column by the mean of that column's non-missing values.
      #   geno : two-dimensional genotype table (columns are imputed independently)
      # Returns geno with the NAs filled in. A table with zero columns is returned unchanged
      # (seq_len/seq_along give an empty loop there, whereas 1:length() would index column 1).
      col_means <- vapply(seq_len(ncol(geno)),
                          function(j) mean(geno[, j], na.rm = TRUE),
                          numeric(1))
      for (j in seq_along(col_means)) {
        geno[is.na(geno[, j]), j] <- col_means[j]
      }
      geno
    }

    is_zero_variance <- function(x) {
      # TRUE when the non-missing values of x are all identical (or there are none).
      # NAs are dropped first: otherwise NA counts as a distinct value, and a column that is
      # constant apart from missing entries would be kept and turn constant once mean-imputed.
      observed <- x[!is.na(x)]
      length(unique(observed)) <= 1
    }
  
    filter_X <- function(X, missing_rate_thresh, maf_thresh) {
        # Drop unusable columns of X, then mean-impute what is left.
        #   X                   : genotype matrix, one column per variant
        #   missing_rate_thresh : columns with a missing rate above this value are removed
        #   maf_thresh          : columns with a minor allele frequency below this value are removed
        # Columns flagged by is_zero_variance are removed last. Returns the imputed matrix.
        # Every subset uses drop = FALSE so X stays a matrix even when one column survives;
        # without it X collapses to a vector and the next apply() call fails.
        rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, is_zero_variance))
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        mean_impute(X)
    }

    compute_cov_flash <- function(Y, error_cache = NULL){
        # Covariance of the columns of Y built from a FLASH fit and rescaled so that its
        # diagonal matches the empirical column variances of Y.
        #   Y           : numeric matrix; the result has one row and column per column of Y
        #   error_cache : optional file path. When given, a failure of the fit is recorded
        #                 there (data and error message) and the identity matrix is used as
        #                 the correlation structure; when NULL the error is re-thrown.
        covar <- diag(ncol(Y))
        tryCatch({
          fl <- flashier::flash(Y, var.type = 2, prior.family = c(flashier::prior.normal(), flashier::prior.normal.scale.mix()), backfit = TRUE, verbose.lvl=0)
          if (fl$n.factors == 0) {
            covar <- diag(fl$residuals.sd^2)
          } else {
            fsd <- sapply(fl$fitted.g[[1]], '[[', "sd")
            covar <- diag(fl$residuals.sd^2) + crossprod(t(fl$flash.fit$EF[[2]]) * fsd)
          }
          if (nrow(covar) == 0) {
            # Restore the identity fallback before signalling the problem
            covar <- diag(ncol(Y))
            stop("Computed covariance matrix has zero rows")
          }
        }, error = function(e) {
          if (is.null(error_cache)) stop(e)
          # Report the message text only. Passing the condition object itself to warning()
          # re-signals the original error, which an enclosing error handler would catch and
          # thereby skip the identity fallback.
          msg <- conditionMessage(e)
          saveRDS(list(data = Y, message = msg), error_cache)
          warning("FLASH failed. Using Identity matrix instead.")
          warning(msg)
        })
        s <- apply(Y, 2, sd, na.rm = TRUE)
        # diag() on a single number would build an identity matrix, so the one-column case is explicit
        s <- if (length(s) > 1) diag(s) else matrix(s, 1, 1)
        s %*% cov2cor(covar) %*% s
    }
  
    compute_cov_diag <- function(Y){
        # Diagonal covariance matrix holding the column variances of Y (NAs ignored).
        # With a single column the variance is placed in a 1 x 1 matrix explicitly:
        # diag() applied to one number builds an identity matrix of that size instead.
        col_var <- apply(Y, 2, var, na.rm = TRUE)
        if (length(col_var) > 1) diag(col_var) else matrix(col_var, 1, 1)
    }

    get_center <- function(k,n) {
      ## Indices of a window of width k centred on n/2, clipped to the range 1..n.
      ## A NULL k selects every index.
      ## The window runs from floor(n/2 - k/2) to floor(n/2 + k/2), so it can hold
      ## k + 1 indices (e.g. k = 4, n = 10 gives 3:7).
      if (is.null(k)) return(1:n)
      half <- k / 2
      mid <- n / 2
      lower <- max(1, floor(mid - half))
      upper <- min(n, floor(mid + half))
      lower:upper
    }
    # Load the gene data and turn the stored mixture (weights w, matrices U) into an mvSuSiE prior
    dat = readRDS(${_input:r})
    prior = readRDS(${prior:r})
    prior = mvsusieR::create_mash_prior(mixture_prior=list(weights=prior$w, matrices=prior$U))
    # Residual covariance of the responses
    resid_Y = compute_cov_flash(dat$y_res)
    # Keep variants with missing rate <= 0.1 and MAF >= 0.05, then mean-impute
    X = filter_X(dat$X, 0.1, 0.05)
    # drop = FALSE keeps X a matrix even when the window selects a single column
    X = X[, get_center(${cis}, ncol(X)), drop = FALSE]
    st = proc.time()
    res = mvsusieR::mvsusie(X, dat$y_res, prior_variance=prior, residual_variance=resid_Y, precompute_covariances=TRUE, compute_univariate_zscore=TRUE, max_iter = 500, n_thread=2, approximate=${approximate})
    # Record the elapsed fitting time alongside the fit
    res$time = proc.time() - st
    saveRDS(res,${_output:r})

In [None]:
[default_2]
# Plot each fit three ways (PIP plot and two mvSuSiE bubble plots) as PDF files,
# then convert every PDF to the PNG file declared as output.
output: f"{_input:n}.manhattan.png", f"{_input:n}.bubble_finemap.png", f"{_input:n}.bubble_original.png"
R: expand = '${ }'
    fit = readRDS(${_input:r})

    # Posterior inclusion probabilities across conditions
    pdf('${_output[0]:n}.pdf', width=8, height=4)
    susieR::susie_plot(fit, y='PIP', main = 'Cross-condition Posterior Inclusion Probability', xlab = 'SNP positions', add_legend = FALSE)
    dev.off()

    # Write an mvsusie_plot result to a PDF sized by the width/height it reports
    save_bubble = function(p, pdf_path) {
      force(p)
      pdf(pdf_path, width = p$width, height = p$height)
      print(p$plot)
      dev.off()
    }
    save_bubble(mvsusieR::mvsusie_plot(fit), '${_output[1]:n}.pdf')
    save_bubble(mvsusieR::mvsusie_plot(fit, plot_z=TRUE), '${_output[2]:n}.pdf')

bash: expand = '${ }'
    convert -density 150 ${_output[0]:n}.pdf ${_output[0]}
    convert -density 150 ${_output[1]:n}.pdf ${_output[1]}
    convert -density 150 ${_output[2]:n}.pdf ${_output[2]}

In [None]:
[submit]
# Submit a batch job that runs this notebook (no workflow name is given to `sos run`)
# on the 27-gene list, sending all output to gtex_v8.log.
# NOTE(review): the directives use #SBATCH syntax while the interpreter is 'qsub' --
# confirm that this matches the scheduler available on the cluster.
# NOTE(review): the data directory and gene list are hard-coded in the command below,
# so values given to this step on the command line are not forwarded -- confirm this is intended.
script: interpreter= 'qsub', expand = True
#!/bin/bash
  
#SBATCH --time=36:00:00
#SBATCH --partition=broadwl
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=2000
#SBATCH --job-name={step_name}
#SBATCH --mail-type=BEGIN,END,FAIL

module load R
sos run analysis/gtex-v8/20200906_mvSuSiE_GTEx.ipynb \
    --data-dir /project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready \
    --analysis_units data/27_brain_non_brain_genes_v8.txt\
    -c midway2.yml -q midway2 \
    -s build -e ignore &> gtex_v8.log