# Association analysis for GTEx V8 data

We use this pipeline to perform association analysis for GTEx V8 data, including fine-mapping and prediction applications.

## Authors

Gao (fine-mapping) and Fabio (prediction) at UChicago.

## Pipeline interface

In [2]:
sos run GTEx_V8_Association.ipynb -h

usage: sos run GTEx_V8_Association.ipynb
               [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  mnm
  fqtl

Global Workflow Options:
  --output-dir output (as path)
  --data-dir /project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready (as path)
  --gene-id-file  path(f"{data_dir}/multitissue_genes_list.txt")

  --missing-rate-cutoff 0.05 (as float)
  --maf-cutoff 0.01 (as float)
  --mash-model /project2/compbio/GTEx_eQTL/mashr_flashr_workflow_output/FastQTLSumStats.mash.FL_PC3.mash_model_est_v.rds (as path)
  --prop-test 0.0 (as float)
                        If test_prop is set > 0 then prediction will also be
                        performed and evaluated

Sections
  mnm_0, fqtl

A test command is (on a local computer):

```
cat /project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready/multitissue_genes_list.txt | grep -v "#" | head -1 > test.txt
sos run GTEx_V8_Association.ipynb mnm \
    --gene-id-file test.txt \
    --prop-test 0.2 \
    --output-dir ./output
```

To submit jobs for, say, the first 1000 genes in the list:

```
cat /project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready/multitissue_genes_list.txt | grep -v "#" | head -1000 > 1000_genes.txt
sos run GTEx_V8_Association.ipynb mnm \
    --gene-id-file 1000_genes.txt \
    --prop-test 0.2 \
    --output-dir /project2/compbio/GTEx_eQTL/cis_eqtl_mnm_v8 \
    -c midway2.yml -q midway2
```

To run the `fqtl` pipeline, which uses a different job template (configured as a separate queue):

```
cat /project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready/multitissue_genes_list.txt | grep -v "#" | head -1000 > 1000_genes.txt
sos run GTEx_V8_Association.ipynb fqtl \
    --gene-id-file 1000_genes.txt \
    --prop-test 0.2 \
    --output-dir /project2/compbio/MnM-TWAS/Output/cis_eqtl_fqtl_v8 \
    -c midway2.yml -q midway2_rmkl
```

In [3]:
[global]
# Directory that receives all result files
parameter: output_dir = path('./output')
# Directory holding the per-gene `<gene>.Multi_Tissues.rds` input files
parameter: data_dir = path('/project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready')
# Text file listing the genes to analyze, one ID per line
parameter: gene_id_file = path(f"{data_dir}/multitissue_genes_list.txt")
# Variants with a missing-genotype rate above this value are dropped
parameter: missing_rate_cutoff = 0.05
# Variants with a minor allele frequency below this value are dropped
parameter: maf_cutoff = 0.01
# Fitted mash model (RDS) that supplies the prior for the M&M analysis
parameter: mash_model = path('/project2/compbio/GTEx_eQTL/mashr_flashr_workflow_output/FastQTLSumStats.mash.FL_PC3.mash_model_est_v.rds')
# If prop_test is set > 0 then prediction will also be performed and evaluated
parameter: prop_test = 0.0
# specify seed
parameter: seed = 1
# Blank lines and '#' comment lines are skipped: an empty gene ID would
# otherwise expand to the non-existent input `{data_dir}/.Multi_Tissues.rds`.
genes = [x.strip() for x in open(gene_id_file).readlines() if x.strip() and not x.strip().startswith('#')]
fail_if(len(genes) == 0, msg = 'No gene to analyze!')

## Utilities

Here are some predefined functions from Fabio that are used elsewhere in the workflow. To keep changes to his code minimal, they are saved as a utility file that other workflows can load as needed.

In [4]:
[mnm_0,fqtl_0]
# Write the R helper functions below to `.sos/utils.R`. The `mnm_1` and
# `fqtl_1` steps load that file through `input = '.sos/utils.R'` on their R
# actions. The script body is interpolated with the sigil given in `expand`,
# so the R code below must not contain that sequence by accident.
report: expand = "${ }", output = '.sos/utils.R'

    ###Function needed for fqtl prediction
    # Predict responses for new samples from a matrix of effect sizes.
    #
    # coeffs: numeric matrix of effects, one row per predictor and one column
    #         per response.
    # newx:   numeric matrix of predictors (samples x predictors), or a single
    #         sample given as a plain vector.
    # Returns the samples x responses matrix `newx %*% coeffs`. Column names
    # are dropped, because the column-by-column loop this replaces never set
    # them. A zero-column `coeffs` gives a zero-column result; iterating over
    # `1:ncol(coeffs)` would instead index a column that does not exist.
    predict_fqtl <- function(coeffs, newx){
        res <- newx %*% coeffs
        colnames(res) <- NULL
        return(res)
    }

    ###Functions to compute MAF and missing genotype rate
    # Minor allele frequency of a single variant.
    #
    # geno: numeric genotype vector; the division by two treats each entry as
    #       an allele count out of two. NA entries are ignored.
    # Returns the smaller of the allele frequency and its complement.
    compute_maf <- function(geno){
      allele_freq <- mean(geno, na.rm = TRUE) / 2
      minor_freq <- min(allele_freq, 1 - allele_freq)
      minor_freq
    }

    # Fraction of the entries of `geno` that are NA.
    compute_missing <- function(geno){
      n_missing <- sum(is.na(geno))
      n_missing / length(geno)
    }
    
    # Replace missing genotypes by the mean of their column.
    #
    # geno: numeric matrix (samples x variants) that may contain NA.
    # Returns `geno` with every NA replaced by the mean of the observed values
    # in the same column. A column with no observed value has mean NaN and is
    # therefore filled with NaN.
    mean_impute <- function(geno){
      col_means <- apply(geno, 2, function(x) mean(x, na.rm = TRUE))
      # seq_along() rather than 1:length(): with zero columns `1:0` counts down
      # (1, 0) and indexes a column that does not exist.
      for (j in seq_along(col_means)) {
        geno[, j][which(is.na(geno[, j]))] <- col_means[j]
      }
      return(geno)
    }

    # TRUE when `x` shows no variation among its observed (non-NA) values.
    #
    # Missing values are ignored when counting distinct values. filter_X()
    # mean-imputes after this check, so a column such as c(1, 1, NA) becomes
    # constant; counting NA as a value of its own would let it through.
    # A vector made only of NA is still reported as zero-variance and an empty
    # vector as not zero-variance, as before.
    is_zero_variance <- function(x) {
      observed <- unique(x[!is.na(x)])
      return(length(x) > 0 && length(observed) <= 1)
    }

    ###Function to sample test set individuals
    # Draw the individuals that are held out as the test set.
    #
    # Y:         matrix with one row per individual.
    # test_prop: fraction of the rows to hold out.
    # Returns round(nrow(Y) * test_prop) distinct row indices sampled without
    # replacement; the draw depends on the current RNG state.
    create_test_set <- function(Y, test_prop){
        n_total <- nrow(Y)
        n_test <- round(n_total * test_prop)
        sample(x = 1:n_total, size = n_test, replace = FALSE)
    }

    ###Function to calculate the covariance matrix of Y via diag
    # Diagonal covariance matrix holding the per-column variances of Y.
    #
    # Y:    numeric matrix (samples x conditions); NA entries are ignored when
    #       the variances are computed.
    # miss: optional row indices to leave out (the test set). NULL or a
    #       zero-length vector means every row is used.
    # Returns a square matrix with the column variances on the diagonal.
    compute_cov_diag <- function(Y, miss=NULL){
        # A zero-length `miss` must not be negated: Y[-integer(0), ] selects no
        # rows at all instead of all of them.
        if (length(miss) > 0) {
            Y <- Y[-miss, , drop = FALSE]
        }
        variances <- apply(Y, 2, var, na.rm = TRUE)
        # nrow is explicit so that a single variance gives a 1 x 1 matrix;
        # diag(v) for a length-one v builds an identity matrix of size v.
        covar <- diag(variances, nrow = length(variances))

        return(covar)
    }

    ###Function to calculate the covariance matrix of Y via flash
    # Covariance matrix of Y assembled from a flashier factor model.
    #
    # Y:    numeric matrix (samples x conditions).
    # miss: optional row indices to leave out (the test set). NULL or a
    #       zero-length vector means every row is used.
    # Returns diag(residual variances) when flash finds no factor; otherwise
    # that diagonal plus the cross-product of the second-mode expected factors
    # scaled by the fitted prior standard deviations.
    # NOTE(review): the fields read from the fit (`residuals.sd`, `fitted.g`,
    # `flash.fit$EF`) belong to the flashier object layout - confirm them
    # against the installed flashier version.
    compute_cov_flash <- function(Y, miss=NULL){
        # A zero-length `miss` must not be negated: Y[-integer(0), ] selects no
        # rows at all instead of all of them.
        if (length(miss) > 0) {
            Y <- Y[-miss, , drop = FALSE]
        }
        fl <- flashier::flash(Y, var.type = 2, prior.family = c(flashier::prior.normal(), flashier::prior.normal.scale.mix()), backfit = TRUE, verbose.lvl=0)

        # nrow is explicit so a single residual variance still yields a 1 x 1
        # matrix rather than an identity matrix of that size.
        resid_var <- fl$residuals.sd^2
        covar <- diag(resid_var, nrow = length(resid_var))
        if(fl$n.factors != 0){
          fsd <- sapply(fl$fitted.g[[1]], '[[', "sd")
          covar <- covar + crossprod(t(fl$flash.fit$EF[[2]]) * fsd)
        }

        return(covar)
    }

    ### mashr null correlation code
    # Estimate the null correlation (or covariance) among conditions from the
    # z-scores of effects that look null in every condition.
    #
    # bhat, sbhat: matrices (effects x conditions) of estimates and their
    #              standard errors.
    # z_thresh:    an effect counts as null when its largest |z| over the
    #              conditions is below this value.
    # est_cor:     TRUE returns a correlation matrix, FALSE the covariance.
    # Returns a conditions x conditions matrix. When fewer null effects than
    # conditions are available, a message is printed and the identity matrix
    # is returned instead.
    estimate_null_correlation_simple = function(bhat, sbhat, z_thresh=2, est_cor = TRUE){
      z <- bhat / sbhat
      # 0/0 and x/0 carry no information; treat them as z = 0
      z[which(is.nan(z) | is.infinite(z))] <- 0
      max_absz <- apply(abs(z), 1, max)
      nullish <- which(max_absz < z_thresh)
      if (length(nullish) < ncol(z)) {
        print("not enough null data to estimate null correlation; return diag instead")
        return(diag(ncol(z)))
      }
      # drop = FALSE keeps a matrix when only one row or one condition is
      # selected; cov() rejects a plain vector.
      nullish_z <- z[nullish, , drop = FALSE]
      Vhat <- cov(nullish_z)
      if (est_cor) {
        Vhat <- cov2cor(Vhat)
      }
      return(Vhat)
    }

    ###Functions to do prediction
    # Fit fqtl on the training individuals and predict the held-out ones.
    #
    # X:    genotype matrix (individuals x variants).
    # Y:    phenotype matrix (individuals x conditions).
    # miss: row indices of the held-out (test) individuals; may be empty, in
    #       which case every individual is used for fitting and the returned
    #       predictions have zero rows.
    # K:    value passed to fqtl as the `k` option.
    # Returns a list with `fit` (the fqtl fit), `ymiss` (observed phenotypes of
    # the test individuals) and `ymiss_pred` (their predictions on the original
    # scale of Y).
    # NOTE(review): X and Y are centered and scaled using all rows, test rows
    # included, before the split - confirm this is intended.
    fqtl_fit_pred <- function(X, Y, miss, K=1){
        ###Scale X and Y
        X_scaled <- scale(X, center=TRUE, scale=TRUE)
        Y_scaled <- scale(Y, center=TRUE, scale=TRUE)

        # Select training rows by positive index. Negating an empty `miss`
        # (`[-miss, ]`) would select no rows and fit the model on empty data.
        train <- setdiff(seq_len(nrow(Y)), miss)

        ###Fit fqtl
        opt.gtex <- list(vbiter = 5000, gammax = 1e4, tol = 1e-8,
                         rate = 1e-2, decay = -1e-2,
                         pi.ub = -1/2, pi.lb = -2, tau = -4, do.hyper = TRUE,
                         jitter = 0.1, svd.init = TRUE, out.residual = FALSE,
                         print.interv = 100, k = K, mf.right.nn = FALSE)

        out <- fqtl::fit.fqtl(Y_scaled[train, , drop = FALSE], X_scaled[train, , drop = FALSE], factored=TRUE, nthread = 1, options = opt.gtex)
        tis_effect <- out$mean.right$theta
        snp_effect <- out$mean.left$theta
        # variants x conditions effects: product of the two factor matrices
        combined_effect <- tcrossprod(snp_effect, tis_effect)

        ###Predict phenotypes in the test set
        pred_scaled <- predict_fqtl(combined_effect, X_scaled[miss, , drop = FALSE])
        # Undo the column-wise scaling and centering applied to Y
        pred <- pred_scaled * attr(Y_scaled, 'scaled:scale')[col(pred_scaled)] + attr(Y_scaled, 'scaled:center')[col(pred_scaled)]

        return(list(fit=out, ymiss=Y[miss, ], ymiss_pred=pred))
    }

    ### Filter X matrix
    # Filter the columns (variants) of a genotype matrix, then mean-impute.
    #
    # X:                   genotype matrix (samples x variants), NA = missing.
    # missing_rate_thresh: drop columns whose missing rate exceeds this value.
    # maf_thresh:          drop columns whose minor allele frequency is below
    #                      this value.
    # Columns flagged by is_zero_variance() are dropped as well. Returns the
    # remaining columns with NA replaced by column means.
    filter_X <- function(X, missing_rate_thresh, maf_thresh) {
        # drop = FALSE throughout: when a single column survives, `X[, -rm_col]`
        # would collapse to a vector and the next apply() call would fail.
        rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
        if (length(rm_col)) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
        if (length(rm_col)) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, is_zero_variance))
        if (length(rm_col)) X <- X[, -rm_col, drop = FALSE]
        return(mean_impute(X))
    }

    # Names of the 49 tissues (conditions) known to this workflow. The order
    # matters: the M&M step turns the column names of y_res into positions in
    # this vector with match().
    # NOTE(review): presumably the order matches the conditions of the mash
    # model - confirm before editing.
    TISSUE_LIST <- c(
      "Adipose_Subcutaneous",
      "Adipose_Visceral_Omentum",
      "Adrenal_Gland",
      "Artery_Aorta",
      "Artery_Coronary",
      "Artery_Tibial",
      "Brain_Amygdala",
      "Brain_Anterior_cingulate_cortex_BA24",
      "Brain_Caudate_basal_ganglia",
      "Brain_Cerebellar_Hemisphere",
      "Brain_Cerebellum",
      "Brain_Cortex",
      "Brain_Frontal_Cortex_BA9",
      "Brain_Hippocampus",
      "Brain_Hypothalamus",
      "Brain_Nucleus_accumbens_basal_ganglia",
      "Brain_Putamen_basal_ganglia",
      "Brain_Spinal_cord_cervical_c-1",
      "Brain_Substantia_nigra",
      "Breast_Mammary_Tissue",
      "Cells_Cultured_fibroblasts",
      "Cells_EBV-transformed_lymphocytes",
      "Colon_Sigmoid",
      "Colon_Transverse",
      "Esophagus_Gastroesophageal_Junction",
      "Esophagus_Mucosa",
      "Esophagus_Muscularis",
      "Heart_Atrial_Appendage",
      "Heart_Left_Ventricle",
      "Kidney_Cortex",
      "Liver",
      "Lung",
      "Minor_Salivary_Gland",
      "Muscle_Skeletal",
      "Nerve_Tibial",
      "Ovary",
      "Pancreas",
      "Pituitary",
      "Prostate",
      "Skin_Not_Sun_Exposed_Suprapubic",
      "Skin_Sun_Exposed_Lower_leg",
      "Small_Intestine_Terminal_Ileum",
      "Spleen",
      "Stomach",
      "Testis",
      "Thyroid",
      "Uterus",
      "Vagina",
      "Whole_Blood"
    )

## M&M analysis

In [5]:
# Fit M&M
[mnm_1]
depends: R_library('mvsusieR'), R_library('flashier'), R_library('susieR'), mash_model
# maximum number of mixture components to use
parameter: max_mixtures = 20
# maximum number of single effects to capture
parameter: max_effects = 10
# covariance matrix computed by "flash", "diag" or "znull" method
parameter: cov_method = 'flash'
input: [f'{data_dir}/{gene}.Multi_Tissues.rds' for gene in genes], group_by = 1
output: f'{output_dir}/{_input:bn}.mnm_{cov_method}.rds'
# each job uses 10 nodes, each node 4 cores in parallel each core using 2G memory; and jobs are created in batches of 40.
task: trunk_workers = [4] * 10, trunk_size = 40, walltime = '30m', mem = '8G', cores = 1, tags = f'{step_name}_{_output:bn}'

R: expand = '${ }', input = '.sos/utils.R'
    set.seed(${seed})
    # Pull the two inputs out of the RDS list explicitly; attach() would put
    # every element of the list on the search path instead.
    dat <- readRDS(${_input:r})
    if (is.null(dat[["X"]]) || is.null(dat[["y_res"]])) stop("Input RDS must contain 'X' and 'y_res'")
    y_res <- dat[["y_res"]]
    # setup data
    X <- filter_X(dat[["X"]], ${missing_rate_cutoff}, ${maf_cutoff})
    prop_test <- ${prop_test}
    cov_method <- "${cov_method}"

    # Univariate regression of every column of Y on X; returns the effect
    # estimates and standard errors as variants x conditions matrices.
    get_sumstats <- function(X, Y) {
      fits <- lapply(seq_len(ncol(Y)), function(i) susieR:::univariate_regression(X, Y[, i]))
      list(bhat = do.call(cbind, lapply(fits, function(f) f$betahat)),
           shat = do.call(cbind, lapply(fits, function(f) f$sebetahat)))
    }

    # Residual covariance of Y for the selected cov_method. `sumstats` is only
    # read by the "znull" method, which has no compute_cov_* helper.
    get_residual_cov <- function(Y, sumstats) {
      if (cov_method == "znull") {
        vhat <- estimate_null_correlation_simple(sumstats$bhat, sumstats$shat)
        s_y <- apply(Y, 2, function(x) sd(x, na.rm = TRUE))
        d_y <- diag(s_y, nrow = length(s_y))
        d_y %*% vhat %*% d_y
      } else {
        compute_cov_${cov_method}(Y)
      }
    }

    # One mvsusie fit with the settings shared by fine-mapping and prediction
    fit_mvsusie <- function(X, Y, cov_Y) {
      mvsusieR::mvsusie(X, Y, L=${max_effects}, prior_variance=m_init, residual_variance=cov_Y, compute_objective=FALSE, estimate_residual_variance=FALSE, estimate_prior_variance=TRUE, estimate_prior_method="simple")
    }

    # Get original effects results
    # to keep for a comparison, not to be used in M&M
    sumstats <- get_sumstats(X, y_res)
    bhat <- sumstats$bhat
    shat <- sumstats$shat
    # Get mash model
    mash <- readRDS(${mash_model:r})
    res <- list()
    # For fine-mapping
    cov_Y <- get_residual_cov(y_res, sumstats)
    if (mash$fitted_g$usepointmass) {
      prior_weights <- mash$fitted_g$pi[-1]
      null_weight <- mash$fitted_g$pi[1]
    } else {
      prior_weights <- mash$fitted_g$pi
      null_weight <- 0
    }
    # NOTE(review): a column name of y_res missing from TISSUE_LIST gives NA
    # here - confirm how MashInitializer treats that.
    include_cond <- match(colnames(y_res), TISSUE_LIST)
    m_init <- mvsusieR:::MashInitializer$new(Ulist=mash$fitted_g$Ulist, grid=mash$fitted_g$grid, prior_weights=prior_weights, null_weight=null_weight, top_mixtures=${max_mixtures}, include_conditions=include_cond)
    res$fitted <- fit_mvsusie(X, y_res, cov_Y)
    res$fitted$bhat <- bhat
    res$fitted$shat <- shat
    # For prediction
    if (prop_test > 0) {
        miss <- create_test_set(y_res, prop_test)
        if (length(miss) > 0) {
            # Positive indexing of the training rows; the residual covariance
            # (and, for "znull", the summary statistics behind it) is computed
            # from the training rows only.
            train <- setdiff(seq_len(nrow(y_res)), miss)
            X_train <- X[train, , drop = FALSE]
            y_train <- y_res[train, , drop = FALSE]
            train_sumstats <- if (cov_method == "znull") get_sumstats(X_train, y_train) else NULL
            cov_Y_train <- get_residual_cov(y_train, train_sumstats)
            m <- fit_mvsusie(X_train, y_train, cov_Y_train)
            res$ymiss_pred <- mvsusieR::predict.mvsusieR(m, X[miss, ])
            res$ymiss <- y_res[miss, ]
        } else {
            # round(n * prop_test) can be zero; `X[-miss, ]` would then select
            # no rows, so prediction is skipped.
            message("prop_test is too small to hold out any individual; prediction skipped")
        }
    }
    saveRDS(res, ${_output:r})

In [None]:
# Make some M&M plots
[mnm_2]
depends: executable('convert')
output: f'{_input:n}.png'
# Task settings mirror the fitting step (10 x 4 workers, batches of 40), with a shorter walltime and 2G of memory per task.
task: trunk_workers = [4] * 10, trunk_size = 40, walltime = '5m', mem = '2G', cores = 1, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }'
    # Draw three PDFs from the fitted model stored in the input RDS
    fit <- readRDS(${_input:r})$fitted

    # 1. posterior inclusion probabilities
    pdf("${_output:n}.dot.pdf", width=12, height=6)
    susieR::susie_plot(fit, y='PIP', main = 'Cross-condition Posterior Inclusion Probability', xlab = 'SNP positions', add_legend = TRUE)
    dev.off()

    # 2. bubble plot of the fitted effects
    bubble <- mvsusieR::mvsusie_plot(fit)
    pdf("${_output:n}.bubble.pdf", width=20, height=15)
    print(bubble$plot)
    dev.off()

    # 3. the same plot drawn from the original summary statistics
    zscores <- mvsusieR::mvsusie_plot(fit, original_sumstat=TRUE)
    pdf("${_output:nn}.z_scores.pdf", width=20, height=15)
    print(zscores$plot)
    dev.off()

# Stack the three PDFs vertically into the step's PNG output
bash: expand = True
  convert -append {_output:n}.dot.pdf {_output:n}.bubble.pdf {_output:nn}.z_scores.pdf {_output}

## fqtl analysis

In [6]:
[fqtl_1]
depends: R_library('fqtl')
input: [f'{data_dir}/{gene}.Multi_Tissues.rds' for gene in genes], group_by = 1
output: f'{output_dir}/{_input:bn}.fqtl.rds'
# One single-core, 8G task per gene; tasks are grouped with trunk_size = 8 and trunk_workers = 4 (see the SoS task documentation for their exact meaning).
task: trunk_workers = 4, trunk_size = 8, walltime = '150m', mem = '8G', cores = 1, tags = f'{step_name}_{_output:bn}'

R: expand = '${ }', input = '.sos/utils.R'
    set.seed(${seed})
    # Pull the two inputs out of the RDS list explicitly; attach() would put
    # every element of the list on the search path instead.
    dat <- readRDS(${_input:r})
    if (is.null(dat[["X"]]) || is.null(dat[["y_res"]])) stop("Input RDS must contain 'X' and 'y_res'")
    y_res <- dat[["y_res"]]
    # setup data
    X <- filter_X(dat[["X"]], ${missing_rate_cutoff}, ${maf_cutoff})
    prop_test <- ${prop_test}
    # Held-out individuals (empty when prop_test is 0)
    miss <- create_test_set(y_res, prop_test)
    # Number of factors: at most the number of training individuals and at
    # most the number of conditions
    K <- min(nrow(y_res) - length(miss), ncol(y_res))
    res <- fqtl_fit_pred(X, y_res, miss, K=K)
    saveRDS(res, ${_output:r})