# Fine-mapping with SuSiE model
This notebook conducts fine-mapping with complete data. Unlike the susie_RSS module, this module performs the analysis on one theme at a time.

## Input

1. A region list documenting the regions to be analysed (tab-delimited, with a `gene_ID` column)
2. A list of paths to where the per-gene phenotype data are stored
3. A list of paths to where the per-gene genotype data are stored

By default, inputs 2 and 3 are the outputs of the data_preprocessing module.

### Example genotype list

region        dir
ENSG00000000457 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000000457.bed
ENSG00000000460 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000000460.bed
ENSG00000000938 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000000938.bed
ENSG00000000971 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000000971.bed
ENSG00000001036 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000001036.bed
ENSG00000001084 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000001084.bed
ENSG00000001167 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000001167.bed
ENSG00000001460 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000001460.bed


### Example phenotype list


region dir
ENSG00000000457 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000000457.mol_phe.bed.gz
ENSG00000000460 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000000460.mol_phe.bed.gz
ENSG00000000938 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000000938.mol_phe.bed.gz
ENSG00000000971 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000000971.mol_phe.bed.gz
ENSG00000001036 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000001036.mol_phe.bed.gz
ENSG00000001084 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000001084.mol_phe.bed.gz
ENSG00000001167 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000001167.mol_phe.bed.gz
ENSG00000001460 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000001460.mol_phe.bed.gz


## Output

For each analysis unit we output:

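1. Analysis results in RDS format: A mvsusie Model
2. A VCF file with the selected SNPs, whose sample column carries the fields
        ES:PIP:CS
   (effect size estimate, posterior inclusion probability, and credible set index, where 0 means the variant is not in any credible set)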

## examples



In [1]:
[global]
import glob
import pandas as pd
# Input
# Tab-delimited table of the regions to analyse; only its "gene_ID" column is read.
parameter: region_list = path
# Tab-delimited table of per-region phenotype files; must contain a "region" column (the merge key).
parameter: phenotype_list = path
# Tab-delimited table of per-region genotype files; must contain a "region" column (the merge key).
parameter: genotype_list = path
## An identifier for your run of analysis
parameter: name = str
# Regions requested for this run.
gene_ID = pd.read_csv(region_list, sep = "\t")["gene_ID"].values.tolist()
phenotype = pd.read_csv(phenotype_list, sep = "\t")
genotype = pd.read_csv(genotype_list, sep = "\t")
# Inner join on "region": only regions present in BOTH lists are kept.
input_inv = phenotype.merge(genotype, on = "region")
# Restrict the joined table to the requested regions.
input_inv = input_inv.query(f'region in {gene_ID}')
# One list per region, consumed by the steps below as _input_inv.
# NOTE(review): assuming each input list has exactly two columns (region, path),
# every entry is [region, phenotype path, genotype path] -- confirm against the input files.
input_inv = input_inv.values.tolist()
## Path to work directory where output locates
parameter: wd = path("./output")
## Containers that contains the necessary packages
parameter: container = "/mnt/mfs/statgen/containers/twas_latest.sif"
## Only 1 LD for each gene is required for each analysis

## Univariate SuSiE

In [1]:
[uni_susie_1]
# Fit a univariate SuSiE model for every entry of input_inv and cache the fit as an RDS file.
# Value passed as the L argument of susieR::susie in the script below
parameter: max_L = 10
# remove a variant if it has more than imiss missing individual data
parameter: imiss = 0.1
# Minor allele frequency threshold intended for variant filtering
# NOTE(review): confirm that imiss and maf are actually consumed by the R script of this step.
parameter: maf = 0.05
# One substep per entry of input_inv; the current entry is exposed as _input_inv.
input: for_each = "input_inv"
# _input_inv[0] (the region name) makes the cached file name unique per region.
output: fit = f"{wd:a}/cache/{name}.{_input_inv[0]}.unisusie.fit.rds"
task: trunk_workers = 1, trunk_size = 1, walltime = '2h', mem = '55G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:nn}.stdout", stderr = f"{_output:nn}.stderr", container = container
    ## Define function
    compute_maf <- function(geno){
      # Half of the mean of `geno` (missing values ignored) is taken as the
      # allele frequency; the smaller of that frequency and its complement
      # is returned.
      allele_freq <- mean(geno, na.rm = TRUE) / 2
      min(allele_freq, 1 - allele_freq)
    }

    compute_missing <- function(geno){
      # Fraction of entries of `geno` that are NA.
      n_missing <- sum(is.na(geno))
      n_missing / length(geno)
    }
    
    mean_impute <- function(geno){
      # Replace every NA in a matrix by the mean of the non-missing values of
      # its column and return the completed matrix.
      #
      # geno: numeric matrix (rows and columns are kept as they are).
      #
      # seq_along() keeps the loop from running on a matrix without columns,
      # where 1:length(...) would iterate over c(1, 0) and fail with a
      # "subscript out of bounds" error.
      col_means <- colMeans(geno, na.rm = TRUE)
      for (j in seq_along(col_means)) {
        missing_rows <- is.na(geno[, j])
        geno[missing_rows, j] <- col_means[j]
      }
      geno
    }

    is_zero_variance <- function(x) {
      # TRUE when `x` holds exactly one distinct value (NA counts as a value
      # of its own), FALSE otherwise.
      n_distinct_values <- length(unique(x))
      n_distinct_values == 1
    }
  
    filter_X <- function(X, missing_rate_thresh, maf_thresh) {
        # Filter the columns of a matrix X, then mean-impute what is left.
        #
        # Columns are removed, in this order, when
        #   1. their missing rate is above `missing_rate_thresh`,
        #   2. their minor allele frequency is below `maf_thresh`,
        #   3. they hold a single distinct value.
        # Returns the remaining columns with NAs replaced by column means.
        #
        # drop = FALSE keeps X a matrix even when a single column survives a
        # filter; without it X collapses to a vector and the next apply() fails.
        rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, is_zero_variance))
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        return(mean_impute(X))
    }
  
    compute_cov_flash <- function(Y, error_cache = NULL){
        # Estimate a covariance matrix for the columns of Y from a FLASH fit,
        # rescaled so that its diagonal matches the column variances of Y.
        #
        # Y:           numeric matrix.
        # error_cache: optional file path. When given, a FLASH failure is not
        #              fatal: Y and the error message are saved there as RDS,
        #              two warnings are raised and the identity matrix is used
        #              in place of the FLASH estimate. When NULL, the error is
        #              re-thrown.
        # Returns a ncol(Y) x ncol(Y) matrix.

        # Start from the identity so a usable matrix exists if FLASH fails.
        covar <- diag(ncol(Y))
        tryCatch({
        fl <- flashier::flash(Y, var.type = 2, prior.family = c(flashier::prior.normal(), flashier::prior.normal.scale.mix()), backfit = TRUE, verbose.lvl=0)
        if(fl$n.factors==0){
          covar <- diag(fl$residuals.sd^2)
        } else {
          fsd <- sapply(fl$fitted.g[[1]], '[[', "sd")
          covar <- diag(fl$residuals.sd^2) + crossprod(t(fl$flash.fit$EF[[2]]) * fsd)
        }
        if (nrow(covar) == 0) {
          # Reset to the identity before signalling, so the fallback below
          # works on a valid matrix.
          covar <- diag(ncol(Y))
          stop("Computed covariance matrix has zero rows")
        }
        }, error = function(e) {
          if (is.null(error_cache)) {
            stop(e)
          }
          # Use the message text rather than re-signalling the error object:
          # warning(e) would signal a condition of class "error", which an
          # outer tryCatch(error = ) would catch and so abort the fallback.
          msg <- conditionMessage(e)
          saveRDS(list(data=Y, message=msg), error_cache)
          warning("FLASH failed. Using Identity matrix instead.")
          warning(msg)
        })
        # Rescale: turn covar into a correlation matrix, then apply the
        # empirical column standard deviations of Y on both sides.
        s <- apply(Y, 2, sd, na.rm=TRUE)
        # diag() of a length-one vector would build an identity matrix of that
        # size, hence the explicit 1 x 1 case.
        if (length(s)>1) s = diag(s)
        else s = matrix(s,1,1)
        covar <- s%*%cov2cor(covar)%*%s
        return(covar)
    }
    ## Load Library
    library("susieR")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("stringr")
    ###
    # Core code
    ###
    # Input
    # Phenotype: the first four columns of the table are dropped and the rest
    # is transposed, so rows of y_res correspond to the remaining columns of
    # the file.
    y_res <- read_delim( "${_input_inv[1]}" ,"\t")
    stopifnot(ncol(y_res) >= 5)
    y_res <- y_res[,5:ncol(y_res)]%>%t()

    # Genotype: apply the missing-rate (imiss) and MAF (maf) filters declared
    # as parameters of this step, drop constant columns, then mean-impute.
    # NOTE(review): rows of X and y_res are paired by position only -- confirm
    # that both files list the samples in the same order.
    X <- read_plink("${_input_inv[2].replace(".bed","")}")$bed
    X <- filter_X(X, ${imiss}, ${maf})

    print(paste("Dimension of X matrix:", nrow(X), ncol(X)))
    print(paste("Dimension of Y matrix:", nrow(y_res), ncol(y_res)))
    non_missing <- list()
    fitted <- list()
    # Fine-mapping with SuSiE
    # Only the first column of y_res is analysed; results are still kept in a
    # list so the saved object is indexed by column.
    for (r in 1:1) {
        non_missing[[r]] <- which(!is.na(y_res[,r]))
        # drop = FALSE keeps a matrix even if a single variant is left
        X_r <- X[non_missing[[r]], , drop = FALSE]
        y_r <- y_res[non_missing[[r]], r]
        st <- proc.time()
        fitted[[r]] <- susieR::susie(X_r, y_r,
                           L=${max_L},
                           max_iter=1000,
                           estimate_residual_variance=TRUE,
                           estimate_prior_variance=TRUE,
                           refine=TRUE)
        fitted[[r]]$time <- proc.time() - st
        # Extra fields stored alongside the fit for the downstream VCF step
        fitted[[r]]$cs_corr <- susieR:::get_cs_correlation(fitted[[r]], X=X_r)
        fitted[[r]]$cs_snps <- names(fitted[[r]]$X_column_scale_factors[unlist(fitted[[r]]$sets$cs)])
        fitted[[r]]$variable_name <- names(fitted[[r]]$X_column_scale_factors)
        fitted[[r]]$coef <- coef.susie(fitted[[r]])
    }
    saveRDS(fitted, ${_output[0]:r})

In [None]:
[uni_susie_2]
# Convert the cached SuSiE fit of each region into a bgzipped, indexed VCF file.
# group_with pairs every input group with the matching entry of input_inv.
input: group_with = "input_inv"
output: fit = f"{wd:a}/{name}.{_input_inv[0]}.unisusie.vcf.bgz"
task: trunk_workers = 1, trunk_size = 1, walltime = '2h', mem = '55G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
# Run inside the same container as uni_susie_1 so both steps see the same R packages.
R: expand = '${ }', stdout = f"{_output:nn}.stdout", stderr = f"{_output:nn}.stderr", container = container
   ## Define create_vcf function
           create_vcf = function (chrom, pos, nea, ea, snp = NULL, ea_af = NULL, effect = NULL,
        se = NULL, pval = NULL, name = NULL,cs = NULL, pip = NULL)
    {
        # Build a single-sample VCF object from per-variant vectors.
        #
        # chrom, pos: chromosome and position of each variant (same length).
        # nea, ea:    non-effect (REF) and effect (ALT) alleles.
        # snp:        optional variant IDs; defaults to "chrom:pos".
        # ea_af, effect, se, pval, cs, pip:
        #             optional per-variant values stored in the sample column
        #             as AF, ES, SE, LP (= -log10(pval)), CS and PIP.
        # name:       sample (study) name.
        # Returns the sorted VCF object.
        stopifnot(length(chrom) == length(pos))
        # Fall back to "chrom:pos" IDs only when the caller supplied none; a
        # caller-supplied `snp` is no longer overwritten.
        if (is.null(snp)) {
            snp <- paste0(chrom, ":", pos)
        }
        nsnp <- length(chrom)
        gen <- list()
        ## Set up data content for each sample column
        if (!is.null(ea_af))
            gen[["AF"]] <- matrix(ea_af, nsnp)
        if (!is.null(effect))
            gen[["ES"]] <- matrix(effect, nsnp)
        if (!is.null(se))
            gen[["SE"]] <- matrix(se, nsnp)
        if (!is.null(pval))
            gen[["LP"]] <- matrix(-log10(pval), nsnp)
        if (!is.null(cs))
            gen[["CS"]] <- matrix(cs, nsnp)
        if (!is.null(pip))
            gen[["PIP"]] <- matrix(pip, nsnp)
        gen <- S4Vectors::SimpleList(gen)

        ## Set up variant info for the fixed columns; each range spans the
        ## longer of the two alleles
        gr <- GenomicRanges::GRanges(chrom, IRanges::IRanges(start = pos,
            end = pos + pmax(nchar(nea), nchar(ea)) - 1, names = snp))
        coldata <- S4Vectors::DataFrame(Studies = name, row.names = name)
        ## Set up header information
        hdr <- VariantAnnotation::VCFHeader(header = IRanges::DataFrameList(fileformat = S4Vectors::DataFrame(Value = "VCFv4.2",
            row.names = "fileformat")), sample = name)
        VariantAnnotation::geno(hdr) <- S4Vectors::DataFrame(Number = c("A",
            "A", "A", "A", "A", "A"), Type = c("Float", "Float",
            "Float", "Float", "Float", "Float"), Description = c("Effect size estimate relative to the alternative allele",
            "Standard error of effect size estimate", "-log10 p-value for effect estimate",
            "Alternate allele frequency in the association study",
            "The CS this variate are captured, 0 indicates not in any cs", "The posterior inclusion probability to a CS"),
            row.names = c("ES", "SE", "LP", "AF", "CS", "PIP"))
        ## Keep only the header entries of the fields that are actually present
        VariantAnnotation::geno(hdr) <- subset(VariantAnnotation::geno(hdr),
            rownames(VariantAnnotation::geno(hdr)) %in% names(gen))
        ## Assemble the VCF
        vcf <- VariantAnnotation::VCF(rowRanges = gr, colData = coldata,
            exptData = list(header = hdr), geno = gen)
        VariantAnnotation::alt(vcf) <- Biostrings::DNAStringSetList(as.list(ea))
        VariantAnnotation::ref(vcf) <- Biostrings::DNAStringSet(nea)
        ## Add fixed values
        VariantAnnotation::fixed(vcf)$FILTER <- "PASS"
        return(sort(vcf))
    }
    library("susieR")
    library("dplyr")
    library("tibble")
    library("purrr")
    library("readr")
    library("tidyr")
    
    # Load the SuSiE fit cached by the previous step (first element of the
    # saved list).
    res <- readRDS(${_input:r})[[1]]
    snps_index <- unname(which(res$pip >= 0))
    snps <- res$variable_name[snps_index]

    # Credible set of every variant: position of the set within res$sets$cs,
    # 0 when the variant is in no set. Looking the variants up set by set
    # handles sets with several variants; matching the list with %in%
    # compares each set as one deparsed string and therefore only ever
    # matched single-variant sets. If a variant is listed in several sets,
    # the last one is reported.
    cs_membership <- rep(0, length(res$pip))
    for (k in seq_along(res$sets$cs)) {
        cs_membership[res$sets$cs[[k]]] <- k
    }

    # Variant names are parsed as "<chr>:<pos>_<alt>_<ref>" with plain string
    # splitting: alleles stay character (a "T" allele is not read as TRUE)
    # and no table parser is run per variant.
    snp_fields <- strsplit(snps, ":", fixed = TRUE)
    pos_alt_ref <- vapply(snp_fields, `[`, character(1), 2L)
    allele_fields <- strsplit(pos_alt_ref, "_", fixed = TRUE)
    output_snps <- tibble(
        snps = snps,
        snps_index = snps_index,
        cs = cs_membership[snps_index],
        pip = unname(res$pip[snps_index]),
        chr = vapply(snp_fields, `[`, character(1), 1L),
        pos_alt_ref = pos_alt_ref,
        pos = as.numeric(vapply(allele_fields, `[`, character(1), 1L)),
        alt = vapply(allele_fields, `[`, character(1), 2L),
        ref = vapply(allele_fields, `[`, character(1), 3L))
    if (anyNA(output_snps$chr) || anyNA(output_snps$pos) ||
        anyNA(output_snps$alt) || anyNA(output_snps$ref)) {
        stop("Variant names must have the form <chr>:<pos>_<alt>_<ref>")
    }

    # res$coef starts with the intercept, hence the +1 offset.
    effect_mtr <- res$coef[output_snps$snps_index+1]%>%as.matrix
    colnames(effect_mtr) <- "${name}"
    rownames(effect_mtr) <- output_snps$snps
    # One-column matrices shaped like effect_mtr; built directly so that an
    # empty variant list does not send a 1:0 loop out of bounds.
    cs_mtr <- matrix(output_snps$cs, ncol = 1, dimnames = dimnames(effect_mtr))
    pip_mtr <- matrix(output_snps$pip, ncol = 1, dimnames = dimnames(effect_mtr))

    output_vcf <- create_vcf(
            chrom = output_snps$chr,
            pos = output_snps$pos,
            ea = output_snps$alt,
            nea = output_snps$ref,
            effect = effect_mtr ,
            pip = pip_mtr,
            cs = cs_mtr,
            name = colnames(effect_mtr)
              )

    VariantAnnotation::writeVcf(output_vcf,${_output:nr},index = TRUE)

In [None]:
[uni_susie_3]
input: group_by = "all"
output: f'{wd}/{name}.unisusie.output_list.txt'
python: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    pd.DataFrame({"output_vcf" : [$[_input:ar,]]}).to_csv("$[_output]",index = False ,header = False, sep = "t")