# Summary statistics formatting
This notebook takes two or more collections of summary statistics (sumstat) RDS files and produces a collection of merged RDS files that can serve as the input of both MASH and MVSuSiE analyses.

Each input sumstat RDS file must be a list containing a `bhat` table and an `sbhat` table, and the row names of each table must be SNP names in the form `chr:pos_alt_ref`.

Allele flip issues will also be detected and resolved in the process of merging.


In [None]:
[global]
import glob
# Path to the working directory where output files are written
parameter: wd = path("./output")
# Container that provides the necessary software packages
parameter: container = 'gaow/twas'

## Merge univariate association summary statistics to RDS format

In [None]:
[merge_and_alleleQC_1]
# Tab-delimited list of the studies (themes) to be merged, as output by the partitioning step
parameter: theme_list = path
# Name of the column in the theme list that holds the path of each theme
parameter: theme_col = "#Theme"
# File listing the analysis units, one per line. It can be generated by `ls *.rds | sed 's/\.rds//g' > analysis_units.txt`. The original note says the per-theme files shall be named <unit>.rds; NOTE(review): the paths built in this step do not append '.rds' -- confirm the expected naming.
parameter: analysis_units = path
# Parse the analysis unit file: skip blank lines and lines starting with '#', split the rest on whitespace.
# Only the first field of each line is used below (as the unit name).
regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]
# One substep per analysis unit; every substep receives the same theme list as its input
input:  theme_list, for_each = "regions"
output: f'{wd:a}/RDS/{_regions[0]}'

task: trunk_workers = 1, trunk_size = 20, walltime = '4h',  mem = '6G', tags = f'{step_name}_{_output:bn}'  

R: expand = "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    library("dplyr")
    library("tibble")
    library("purrr")
    library("readr")
    library("tidyr")
    
    ## Define function
    allele.qc  = function(a1,a2,ref1,ref2) {
        # Compare the alleles of a study (a1, a2) against reference alleles (ref1, ref2).
        #
        # Args:
        #   a1, a2:     allele vectors of the study being checked.
        #   ref1, ref2: allele vectors of the reference, same length as a1 / a2.
        # Returns:
        #   A list with two logical vectors:
        #     keep - FALSE for strand-ambiguous pairs (A/T, T/A, C/G, G/C) and for
        #            alleles that are not a single A, C, G or T; TRUE otherwise.
        #     flip - TRUE where (a1, a2) equals (ref2, ref1), either directly or
        #            after complementing the reference alleles.
        #   Comparisons involving NA propagate NA as usual in R.

        # Compare alleles case-insensitively
        a1 = toupper(a1)
        a2 = toupper(a2)
        ref1 = toupper(ref1)
        ref2 = toupper(ref2)

        # Complement single-base alleles; every other value (including NA) is left as is
        base_complement = c(A = "T", T = "A", G = "C", C = "G")
        complement = function(allele) {
            is_base = allele %in% names(base_complement)
            allele[is_base] = base_complement[allele[is_base]]
            allele
        }
        ref1_complement = complement(ref1)
        ref2_complement = complement(ref2)

        # Strand-ambiguous pairs cannot be oriented reliably
        is_ambiguous = (a1 == "A" & a2 == "T") |
                       (a1 == "T" & a2 == "A") |
                       (a1 == "C" & a2 == "G") |
                       (a1 == "G" & a2 == "C")
        keep = !is_ambiguous

        # Anything that is not a single A/C/G/T base is rejected as well
        a1_not_base = !(a1 == "A" | a1 == "T" | a1 == "G" | a1 == "C")
        a2_not_base = !(a2 == "A" | a2 == "T" | a2 == "G" | a2 == "C")
        keep[a1_not_base] = FALSE
        keep[a2_not_base] = FALSE

        # Alleles are swapped relative to the reference, on either strand
        swapped_same_strand = a1 == ref2 & a2 == ref1
        swapped_other_strand = a1 == ref2_complement & a2 == ref1_complement
        flip = swapped_same_strand | swapped_other_strand

        return(list(keep = keep, flip = flip))
    }
    
     allele.process = function(sumstat,name,n,stat){
        # Harmonise a newly joined study against the reference alleles of the merged table.
        #
        # Args:
        #   sumstat: joined table with the join key V1, the reference alleles in
        #            V2.x / V3.x, the joined study's alleles in V2.y / V3.y and the
        #            SNP identifiers rowname.x / rowname.y.
        #   name:    character vector of study names; name[n] labels the joined column.
        #   n:       index of the study that has just been joined.
        #   stat:    name(s) of the column(s) whose sign is negated for flipped SNPs.
        # Returns:
        #   sumstat with flipped rows sign-corrected, rows failing allele QC removed,
        #   the ".y" bookkeeping columns dropped, and columns 1, 2, 4, 5 and n + 4
        #   renamed to "rowname", "bhat", "V2", "V3" and name[n].
        qc = allele.qc(sumstat$V2.y,sumstat$V3.y,sumstat$V2.x,sumstat$V3.x)
        # which() returns the TRUE positions only, discarding FALSE and NA alike.
        # NOTE(review): rows whose QC result is NA (e.g. SNPs absent from the joined
        # study after a full join) are therefore dropped below -- confirm this is intended.
        flip_rows = which(qc$flip)
        keep_rows = which(qc$keep)
        stat_cols = which(colnames(sumstat) %in% stat)
        # Flip the statistic for mismatching alleles and adopt the reference alleles
        sumstat[flip_rows, stat_cols] = -1 * sumstat[flip_rows, stat_cols]
        sumstat$V2.y[flip_rows] = sumstat$V2.x[flip_rows]
        sumstat$V3.y[flip_rows] = sumstat$V3.x[flip_rows]
        # Remove strand ambiguous SNPs
        sumstat = sumstat[keep_rows, ]
        # Merge the tags: rebuild the joined study's SNP name from its (harmonised)
        # alleles and use it, and its alleles, wherever the reference side is missing
        sumstat = sumstat %>%
            mutate(rowname.y = paste(V1, V2.y, V3.y, sep = "_"),
                   rowname.x = coalesce(rowname.x, rowname.y),
                   V2.x = coalesce(V2.x, V2.y),
                   V3.x = coalesce(V3.x, V3.y)) %>%
            select(-rowname.y, -V2.y, -V3.y)
        # Clean name (positional: the joined study's statistic sits in column n + 4)
        colnames(sumstat)[c(1,2,4,5,n+4)] = c("rowname","bhat","V2","V3",name[n])
        return(sumstat)
        }
    

      ## Start
    
    
    # Read the theme list; each row gives the path of one study (theme) to merge
    Theme = read_delim('$[_input]', delim = "\t")
    n = nrow(Theme)
    if (n < 2) {
        stop("At least two themes are required for merging, but the theme list contains ", n)
    }
    theme_paths = pull(Theme, `$[theme_col]`)

    # Study names are the last component of each theme path. basename() keeps the
    # text verbatim (no type conversion of names such as "001" or "T").
    name = basename(theme_paths)

    # Sumstat file of this analysis unit within each theme, read once per theme
    unit_files = paste0(theme_paths, "/$[_regions[0]]")
    unit_sumstats = map(unit_files, readRDS)

    # Turn one sumstat table into a data frame with the SNP name in `rowname`, the
    # statistic in column 2 (named stat_name) and the "_"-separated fields of the
    # SNP name in V1, V2, V3 (all kept as character).
    format_stat_table = function(stat_table, stat_name) {
        stat_table %>%
            as.data.frame %>%
            rownames_to_column %>%
            rename(!!stat_name := 2) %>%
            mutate(snps = map(rowname, ~read.table(text = .x, sep = "_", colClass = "character") %>% as_tibble)) %>%
            unnest(snps)
    }
    bhat_tables = map(unit_sumstats, ~format_stat_table(.x$bhat, "bhat"))
    sbhat_tables = map(unit_sumstats, ~format_stat_table(.x$sbhat, "sbhat"))

    # Join the studies one at a time on V1, harmonising alleles after every join
    genos_join_bhat = bhat_tables[[1]]
    genos_join_sbhat = sbhat_tables[[1]]
    for (j in 2:n) {
        genos_join_bhat = full_join(genos_join_bhat, bhat_tables[[j]], by = "V1") %>%
            allele.process(name, j, "bhat.y")
        genos_join_sbhat = full_join(genos_join_sbhat, sbhat_tables[[j]], by = "V1") %>%
            allele.process(name, j, "sbhat.y")
    }

    # Z is computed element-wise below, so both tables must list the same SNPs in the same order
    if (!identical(genos_join_bhat$rowname, genos_join_sbhat$rowname)) {
        stop("bhat and sbhat tables are not aligned on the same SNPs after merging")
    }

    # Column 2 still carries the placeholder name for the first study
    colnames(genos_join_bhat)[2] = name[1]
    colnames(genos_join_sbhat)[2] = name[1]

    snps = genos_join_bhat$rowname
    genos_join_bhat = genos_join_bhat %>% select(-rowname, -V1, -V2, -V3) %>% as.matrix
    # Standard errors may have had their sign changed during allele flipping; abs() restores them
    genos_join_sbhat = genos_join_sbhat %>% select(-rowname, -V1, -V2, -V3) %>% as.matrix %>% abs

    # save the rds file
    saveRDS(file = "$[_output]", list(bhat=genos_join_bhat, sbhat=genos_join_sbhat,Z = genos_join_bhat/genos_join_sbhat ,snps = snps))

In [None]:
[merge_and_alleleQC_2]
# Collect the merged files of all analysis units into a single list file
input: group_by = "all"
output: f'{wd}/merged_analysis_unit.txt'
python: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    # Write one absolute input path per line, without index or header.
    # The separator must be a real tab: with the letter "t" as delimiter every
    # path containing a "t" would be wrapped in quotes by the CSV writer.
    pd.DataFrame({"#analysis_unit" : [$[_input:ar,]]}).to_csv("$[_output]",index = False ,header = False, sep = "\t")