## Example

In [None]:
# Example invocation of the `padding` workflow defined in Padding.ipynb:
#   --qtl_file   every *.rds file in the QTL fine-mapping cache directory (list produced by `ls`)
#   --gwas_path  directory that holds the GWAS fine-mapping result files
#   --job_size   overrides the workflow's `job_size` parameter
# -J, -c and -q are options of `sos run` itself (see `sos run -h`).
sos run ~/codes/xqtl-pipeline/code/intact_pipeline/Padding.ipynb padding \
    --qtl_file `ls /mnt/vast/hpc/csg/molecular_phenotype_calling/eqtl/output/susie_per_gene_tad/cache/*rds`  \
    --gwas_path /mnt/mfs/hgrcgrid/homes/hs3393/ADHD/ADHD_finemap/ -J 50 -c ~/test/csg.yml -q csg --job_size 10

In [None]:
[global]
# Working directory; the [padding] step writes its outputs below this directory
parameter: cwd = path("output")
# Dataset / tissue label (not referenced by the [padding] step in this file)
parameter: tissue = ''
# QTL data type; used as the output sub-directory and file-name prefix of the QTL result
parameter: QTL = 'eQTL'
# Label used as the output sub-directory and file-name prefix of the GWAS result
parameter: GWAS = 'GWAS'
# Container image handed to the R action; the empty default passes no container
parameter: container = ''
# Entrypoint handed to the R action. When `container` ends with '.sif' this is
# 'micromamba run -n <name>', where <name> is the image file name with its last four
# characters ('.sif') removed; otherwise it is an empty string.
# NOTE(review): the default is wrapped in {...}, which is a set literal in Python syntax --
# confirm that SoS turns this into the intended plain string.
parameter: entrypoint={('micromamba run -n' + ' ' + container.split('/')[-1][:-4]) if container.endswith('.sif') else f''}
# Passed to the task as `trunk_size`
parameter: job_size = 1
# Wall-clock time requested for each task
parameter: walltime = "5h"
# Memory requested for each task
parameter: mem = "8G"
# Passed to the task as `cores`
parameter: numThreads = 1


In [None]:
[padding]
# QTL fine-mapping result files (.rds); every file is handled as its own substep
parameter: qtl_file = paths
# Directory that is scanned for GWAS fine-mapping result files ending in .rds
parameter: gwas_path = path
# group_by = 1: each substep receives exactly one QTL file as _input
input: qtl_file, group_by = 1
# Two outputs per QTL file, both named after the input's base name:
#   _output[0] -> <cwd>/<QTL>/<QTL>.<input>.newpadding.rds    (QTL object)
#   _output[1] -> <cwd>/<GWAS>/<GWAS>.<input>.newpadding.rds  (GWAS object)
output:  f'{cwd:a}/{QTL}/{QTL}.{_input:bn}.newpadding.rds',f'{cwd:a}/{GWAS}/{GWAS}.{_input:bn}.newpadding.rds'
# Resource requests come from the [global] parameters; stdout/stderr of the R script are
# written next to the first output file
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output[0]:nn}.stdout", stderr = f"{_output[0]:nn}.stderr", container = container, entrypoint = entrypoint
    suppressPackageStartupMessages(library(tidyverse))
    # List every GWAS fine-mapping result (*.rds) under gwas_path with its full path; the
    # chromosome, start and end of each LD block are parsed from these file names further down.
    # TRUE is spelled out because the T alias is an ordinary variable that can be reassigned.
    gwas_finemap_result = list.files("${gwas_path}", full.names = TRUE, pattern = "\\.rds$")
    gwas_file_tb = tibble(file_path = gwas_finemap_result)

    # Extract the LD-block tag "chr<N>_<start>_<end>" from a GWAS result file name.
    #   filename: a single file name or path
    #   returns:  the first substring matching the tag, or NA_character_ when the name
    #             contains no such tag, so callers always receive a length-one result
    match_pattern = function(filename){
        pattern = "chr[0-9XY]+_(\\d+_\\d+)"
        result = regmatches(filename, regexpr(pattern, filename))
        # regmatches() silently drops non-matching inputs and would hand back character(0)
        if (length(result) == 0) {
            return(NA_character_)
        }
        result
    }
      
    # Build one row per GWAS file with the chr / start / end of its LD block.
    # map_chr() keeps chr_pos an atomic character column (map() would create a list column),
    # and files whose names carry no LD-block tag are dropped explicitly.
    LD_block_position = gwas_file_tb %>%
      mutate(chr_pos = map_chr(file_path, function(f) {
        hit = match_pattern(f)
        if (length(hit) == 0) NA_character_ else hit[1]
      })) %>%
      filter(!is.na(chr_pos)) %>%
      separate(chr_pos, into = c("chr", "start", "end")) %>%
      relocate(file_path, .after = end) %>%
      mutate(start = as.numeric(start), end = as.numeric(end)) %>%
      arrange(chr, start)

    # Read the QTL fine-mapping result of this substep.
    # NOTE(review): the element name dlpfc_eqtl is hard-coded; confirm it matches the current
    # fine-mapping output.
    qtl_file = readRDS("${_input}")$dlpfc_eqtl

    # The first and last variant names give the chromosome, start and end of the QTL region
    # (this presumes the variants are stored in position order -- TODO confirm).
    # The chromosome pattern accepts X/Y so that it agrees with the pattern used for GWAS files.
    first_variant = qtl_file$variable_name[1]
    last_variant = qtl_file$variable_name[length(qtl_file$variable_name)]
    qtl_chr = regmatches(first_variant, regexpr("chr[0-9XY]+", first_variant))
    qtl_start = as.numeric(sub(".*:(\\d+)_.*", "\\1", first_variant))
    qtl_end = as.numeric(sub(".*:(\\d+)_.*", "\\1", last_variant))

    # Keep the LD blocks on the same chromosome that overlap the QTL region: blocks covering
    # the region start, blocks lying inside the region, and blocks covering the region end.
    related_LD = LD_block_position %>% filter(chr == qtl_chr) %>% filter((start <= qtl_start & end >= qtl_start) |
                                            (start >= qtl_start & end <= qtl_end) |
                                            (start <= qtl_end & end >= qtl_end))

    # Load the lbf matrix of every related GWAS block, labelling its columns with the block's
    # variant names so that the blocks can be stacked by name.
    lbf_mtx = lapply(related_LD$file_path, function(block_file) {
        rds = readRDS(block_file)
        block_lbf = as.data.frame(rds$lbf_variable)
        colnames(block_lbf) = rds$variants
        block_lbf
    })

    # Stack the blocks; a variant missing from a block gets NA there, which is padded with 0
    lbf_whole_mtx = bind_rows(lbf_mtx) %>% replace(is.na(.), 0)

    # Variants shared by GWAS and QTL.
    # Known problem: QTL names may be written chr:9999 while GWAS names use chr_9999, so ":" is
    # replaced by "_" on the QTL side before comparing. Once the naming is unified (and the
    # dlpfc_eqtl wrapper is gone) the str_replace_all() call can be removed.
    qtl_variant_id = str_replace_all(qtl_file$variable_name, ":", "_")
    shared_variant = intersect(qtl_variant_id, colnames(lbf_whole_mtx))

    # Keep only the shared variants in the GWAS matrix; drop = FALSE keeps the two-dimensional
    # shape even when a single variant is shared.
    GWAS_lbf_matrix = lbf_whole_mtx[, shared_variant, drop = FALSE]

    # Label the QTL lbf columns with the same normalised variant names
    colnames(qtl_file$lbf_variable) = qtl_variant_id

    # SuSiE records credible sets by column index, so remember which indices are removed
    rm_index = which(!(colnames(qtl_file$lbf_variable) %in% shared_variant))

    # For the QTL data, too, keep only the shared variants (same column order as the GWAS matrix)
    qtl_lbf_mtx = qtl_file$lbf_variable[, shared_variant, drop = FALSE]

    # Turn one vector of log Bayes factors into posterior weights that sum to one.
    #   lbf:           numeric vector of log Bayes factors, one per variant
    #   prior_weights: prior weight per variant; NULL means the same weight 1/length(lbf) for all
    #   returns:       numeric vector of the same length as lbf
    lbf_to_alpha_vector = function(lbf, prior_weights = NULL) {
      if (is.null(prior_weights)) {
        prior_weights = 1/length(lbf)
      }
      # exp(lbf - max(lbf)) is proportional to the Bayes factor; subtracting the maximum
      # first keeps exp() from overflowing
      scaled_bf = exp(lbf - max(lbf)) * prior_weights
      # normalise the prior-weighted factors
      scaled_bf / sum(scaled_bf)
    }

    # Apply lbf_to_alpha_vector() to every row of an lbf matrix or data frame.
    #   lbf:     rows x variants table of log Bayes factors
    #   returns: matrix with the same rows x variants layout, each row normalised separately
    lbf_to_alpha = function(lbf) {
        lbf = as.matrix(lbf)
        alpha = apply(lbf, 1, lbf_to_alpha_vector)
        # With a single variant column every row yields one value and apply() simplifies the
        # result to a plain vector; rebuild the variants x rows layout so that the transpose
        # below returns rows x 1 instead of 1 x rows.
        if (is.null(dim(alpha))) {
            alpha = matrix(alpha, nrow = 1, dimnames = list(colnames(lbf), rownames(lbf)))
        }
        t(alpha)
    }

    # Convert an lbf table into one inclusion probability per variant.
    #   lbf:     rows x variants table of log Bayes factors
    #   returns: plain numeric vector with one value per column of lbf
    lbf_to_pip = function(lbf) {
        # 1 - alpha is the weight NOT assigned to a variant within one row
        not_selected = 1 - lbf_to_alpha(lbf)
        # multiply down each column, then take the complement of that product
        prob_excluded = apply(not_selected, 2, prod)
        as.vector(1 - prob_excluded)
    }


    # Recover the original QTL variant IDs of the shared variants by looking them up, instead
    # of turning the first "_" back into ":". The lookup returns names that really exist in
    # the QTL object whatever separator they were written with.
    qtl_variant_id = str_replace_all(qtl_file$variable_name, ":", "_")
    shared_variable_name = qtl_file$variable_name[match(shared_variant, qtl_variant_id)]

    # Use the shared-variant lbf matrices to compute alpha and pip for both traits
    GWAS_alpha = lbf_to_alpha(GWAS_lbf_matrix)
    GWAS_pip = lbf_to_pip(GWAS_lbf_matrix)

    qtl_alpha = lbf_to_alpha(qtl_lbf_mtx)
    qtl_pip = lbf_to_pip(qtl_lbf_mtx)

    colnames(qtl_alpha) = shared_variable_name
    colnames(GWAS_alpha) = shared_variable_name
    names(qtl_pip) = shared_variable_name
    names(GWAS_pip) = shared_variable_name

    # For now, a variant is removed from a credible set when it is not shared by both traits.
    ## note: this part may be changed!! now is just a rough strategy
    # The surviving indices are translated to positions in the shared-variant list, because
    # pip, alpha and variable_name of the new object are indexed by that list and no longer
    # by the original QTL variant list.
    cs_number = length(qtl_file$sets$cs)
    if (cs_number == 0) {
        new_cs = NA
    } else {
        new_cs = lapply(qtl_file$sets$cs, function(old_index) {
            new_index = match(qtl_variant_id[old_index], shared_variant)
            new_index[!is.na(new_index)]
        })
    }

    # The output can be: GWAS + qtl lbf matrix; alpha; pip
    # drop = FALSE keeps mu a matrix when only one variant or one effect row remains
    qtl_mu = qtl_file$mu[, shared_variable_name, drop = FALSE]
    # Name a local copy of the scale factors so they can be picked by variant name
    scale_factors = qtl_file$X_column_scale_factors
    names(scale_factors) = qtl_file$variable_name
    qtl_coef = colSums(qtl_alpha * qtl_mu) / scale_factors[shared_variable_name]

    new_qtl_obj = list(pip = qtl_pip,
                       coef = qtl_coef,
                       mu = qtl_mu,
                       alpha = qtl_alpha,
                       sets = list(cs = new_cs),
                       variable_name = shared_variable_name
    )
    # Carry the credible-set labels of the input over to the new object
    names(new_qtl_obj$sets$cs) = names(qtl_file$sets$cs)

    new_gwas_obj = list(pip = GWAS_pip,
                        alpha = GWAS_alpha,
                        variable_name = shared_variable_name
    )
    saveRDS(new_qtl_obj,"${_output[0]}")
    saveRDS(new_gwas_obj,"${_output[1]}")
