# Summary statistics in VCF format

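This workflow uses a function modified from `create_vcf` in the mrcieu/gwasvcf package to convert MASH output matrices from RDS format into VCF files. The effect size (ES) is set to the coefficient (the posterior mean) and the standard error (SE) to the posterior SD, or to 1 when no posterior SD is available; the two are stored together in the FORMAT field as ES:SE.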

Input:
A collection of gene-level RDS files. Each file holds the MASH output matrices, with column names = studies and row names = SNPs; SNP IDs must be in the form chr:pos_alt_ref.
A list of the aforementioned MASH output files.

output:
A collection of gene-level VCF outputs: one VCF file and its corresponding index per gene.
A list of the aforementioned VCF files.

Required R packages:
   dplyr
   stringr
   readr
   purrr
   VariantAnnotation
   

The output format of this workflow follows the specification at https://github.com/MRCIEU/gwas-vcf-specification, with each study represented as a sample column.

## Sample output

```
##fileformat=VCFv4.2
##fileDate=20220103
##FORMAT=<ID=ES,Number=A,Type=Float,Description="Effect size estimate relative to the alternative allele">
##FORMAT=<ID=SE,Number=A,Type=Float,Description="Standard error of effect size estimate">
##contig=<ID=10>
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	AC	DLPFC	PCC
10	104559	10:104559	G	A	.	PASS	.	ES:SE	-0.0526809:0.0305599	0.0535111:0.0251836	-0.0526809:0.0305599
10	107045	10:107045	T	C	.	PASS	.	ES:SE	-0.0306955:0.0129309	-0.168938:0.0440613	-0.0306955:0.0129309
10	112455	10:112455	G	A	.	PASS	.	ES:SE	-0.0533215:0.0290866	-0.159363:0.071302	-0.0533215:0.0290866
10	116387	10:116387	C	A	.	PASS	.	ES:SE	0.0328615:0.0226517	0.267323:0.0797036	0.0328615:0.0226517
```


In [None]:
[global]
import glob
# Text file listing the analysis units, one per line. Each non-empty line
# that does not start with '#' is split on whitespace into data filenames.
parameter: analysis_units = path
# Path to data directory
# NOTE(review): not referenced by the steps visible in this file - confirm it is still needed
parameter: data_dir = "/"
# Data file suffix
# NOTE(review): not referenced by the steps visible in this file - confirm it is still needed
parameter: data_suffix = ""
# Path to the work directory where output is written
parameter: wd = path("./output")
# An identifier for your run of analysis
parameter: name = ""

# Parse the unit list: skip blank lines and lines starting with '#', remove
# double quotes, then split each remaining line on whitespace. The result is
# a list with one list of tokens per retained line.
regions = [x.replace("\"", "" ).strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
# Alias consumed as the input of the rds_to_vcf_1 step
genes = regions
# Container image that provides the necessary packages
parameter: container = 'gaow/twas'


In [None]:
[rds_to_vcf_1]
# Convert the MASH RDS files listed in `genes` into bgzipped VCF files.
# Inputs are processed one at a time (group_by = 1); each output is written to
# <wd>/mash_vcf/ and named after the input file's base name.
input: genes, group_by = 1
output: vcf = f'{wd:a}/mash_vcf/{_input:bn}.vcf.bgz'
task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '10G', cores = 1, tags = f'{_output:bn}'
R: expand = '$[ ]', stdout = f"{_output[0]:nn}.stdout", stderr = f"{_output[0]:nn}.stderr"
    library("dplyr")
    library("stringr")
    library("readr")
    library("purrr")
    ## Define a wrapper, adapted from the gwasvcf package, that builds the VCF object needed here.
    
        # Build a GWAS-VCF object from per-variant summary statistics.
        # Adapted from create_vcf() in the mrcieu/gwasvcf package.
        #
        # Args:
        #   chrom, pos: chromosome and position of each variant (equal length).
        #   nea, ea:    non-effect allele (written as REF) and effect allele
        #               (written as ALT) of each variant.
        #   snp:        optional variant IDs; defaults to "<chrom>:<pos>".
        #   ea_af, effect, se, pval, n, ncase:
        #               optional statistics. Each one supplied is reshaped into
        #               a matrix with one row per variant and stored as the
        #               FORMAT field AF, ES, SE, LP (-log10 of pval), SS or NC.
        #   name:       sample (study) names used for the sample columns.
        #
        # Returns: the VCF object with FILTER set to "PASS", sorted by sort().
        create_vcf <- function(chrom, pos, nea, ea, snp = NULL, ea_af = NULL,
                               effect = NULL, se = NULL, pval = NULL, n = NULL,
                               ncase = NULL, name = NULL) {
          stopifnot(length(chrom) == length(pos))

          # Only synthesise IDs when the caller did not supply any; previously
          # a supplied `snp` was always overwritten by this default.
          if (is.null(snp)) {
            snp <- paste0(chrom, ":", pos)
          }
          nsnp <- length(chrom)

          ## Set up the data content for each sample column
          gen <- list()
          if (!is.null(ea_af)) {
            gen[["AF"]] <- matrix(ea_af, nsnp)
          }
          if (!is.null(effect)) {
            gen[["ES"]] <- matrix(effect, nsnp)
          }
          if (!is.null(se)) {
            gen[["SE"]] <- matrix(se, nsnp)
          }
          if (!is.null(pval)) {
            gen[["LP"]] <- matrix(-log10(pval), nsnp)
          }
          if (!is.null(n)) {
            gen[["SS"]] <- matrix(n, nsnp)
          }
          if (!is.null(ncase)) {
            gen[["NC"]] <- matrix(ncase, nsnp)
          }
          gen <- S4Vectors::SimpleList(gen)

          ## Set up variant info for the fixed columns. Each range spans the
          ## longer of the two alleles so that indels are covered.
          allele_width <- pmax(nchar(nea), nchar(ea))
          gr <- GenomicRanges::GRanges(
            chrom,
            IRanges::IRanges(start = pos, end = pos + allele_width - 1, names = snp)
          )

          ## Set up meta information
          coldata <- S4Vectors::DataFrame(Studies = name, row.names = name)
          hdr <- VariantAnnotation::VCFHeader(
            header = IRanges::DataFrameList(
              fileformat = S4Vectors::DataFrame(Value = "VCFv4.2", row.names = "fileformat")
            ),
            sample = name
          )
          VariantAnnotation::geno(hdr) <- S4Vectors::DataFrame(
            Number = rep("A", 6),
            Type = rep("Float", 6),
            Description = c(
              "Effect size estimate relative to the alternative allele",
              "Standard error of effect size estimate",
              "-log10 p-value for effect estimate",
              "Alternate allele frequency in the association study",
              "Sample size used to estimate genetic effect",
              "Number of cases used to estimate genetic effect"
            ),
            row.names = c("ES", "SE", "LP", "AF", "SS", "NC")
          )
          ## Keep only the header entries for fields that are actually present
          VariantAnnotation::geno(hdr) <- subset(
            VariantAnnotation::geno(hdr),
            rownames(VariantAnnotation::geno(hdr)) %in% names(gen)
          )

          ## Assemble the VCF values
          vcf <- VariantAnnotation::VCF(
            rowRanges = gr, colData = coldata,
            exptData = list(header = hdr), geno = gen
          )
          VariantAnnotation::alt(vcf) <- Biostrings::DNAStringSetList(as.list(ea))
          VariantAnnotation::ref(vcf) <- Biostrings::DNAStringSet(nea)

          ## Write fixed values
          VariantAnnotation::fixed(vcf)$FILTER <- "PASS"
          sort(vcf)
        }
  
        ## Load the MASH result for this analysis unit (path interpolated by SoS)
        mash_res <- readRDS($[_input:r])
        input_effect <- mash_res$PosteriorMean

        ## Use SE = 1 for every entry when the result carries no posterior SD
        input_se <- mash_res$PosteriorSD
        if (is.null(input_se)) {
          input_se <- matrix(1, nrow = nrow(input_effect), ncol = ncol(input_effect))
        }

        ## Variant IDs: prefer the `snps` element, otherwise fall back to the
        ## row names of the effect matrix
        snps <- mash_res$snps
        if (is.null(snps)) {
          snps <- rownames(input_effect)
        }
        snps <- as.character(snps)
        stopifnot(length(snps) == nrow(input_effect))

        ## Parse "chr:pos_alt_ref" for all variants in one vectorised pass.
        ## Capture groups: chromosome (optional "chr" prefix removed), position,
        ## alt allele, ref allele. Trailing extra fields are ignored.
        parts <- str_match(
          snps,
          "^(?:chr)?([^:]+):([0-9]+)_([^_:]+)_([^_:]+)(?:[_:].*)?$"
        )
        ## Integer positions avoid scientific notation (e.g. "1e+05") when the
        ## position is pasted into the variant ID
        pos <- suppressWarnings(as.integer(parts[, 3]))
        malformed <- is.na(parts[, 1]) | is.na(pos)
        if (any(malformed)) {
          stop(
            "Variant IDs must be in the form chr:pos_alt_ref; could not parse: ",
            paste(head(snps[malformed], 5), collapse = ", "),
            call. = FALSE
          )
        }

        ## Chromosomes are kept as character so that non-numeric names such as
        ## X, Y or MT are not turned into NA
        vcf <- create_vcf(
          chrom = parts[, 2],
          pos = pos,
          ea = parts[, 4],
          nea = parts[, 5],
          effect = input_effect,
          se = input_se,
          name = colnames(input_effect)
        )

        ## Write the bgzipped VCF together with its index
        VariantAnnotation::writeVcf(vcf,$[_output:nr],index = TRUE)
        

In [None]:
[rds_to_vcf_2]
# Write a text file listing the VCF files found next to the first input file.
# All inputs are handled as a single group; the list is saved as
# vcf_output_list.txt in the directory of the first input.
# NOTE(review): inputs are presumably the outputs of rds_to_vcf_1 - confirm.
input:  group_by = "all"
output: vcf_list = f'{_input[0]:d}/vcf_output_list.txt'
bash: expand = '${ }', stdout = f"{_output[0]:nn}.stdout", stderr = f"{_output[0]:nn}.stderr"
        # Move into the directory that holds the first input file
        cd ${_input[0]:d}
        # List every *.vcf.bgz file present in that directory (file names only,
        # and not restricted to the inputs of this run)
        ls *.vcf.bgz > ${_output}