# Summary statistics formatting
This notebook takes one or more collections of summary statistics (sumstat) text files and produces a collection of merged per-gene RDS files that can serve as the input of both MASH and MVSuSiE analyses.

## Input
1. A sumstat list with columns "#chr", theme1, theme2, theme3, ...; each cell not under "#chr" is the path to one sumstat file (the list is generated by the yml generator).
2. region_list: a table with columns chr, start, end, gene_ID, used to partition the merged sumstats by region.
## Output
1. 23 merged sumstat files in txt format, one for each chromosome.
2. Merged sumstat files in rds format, one for each gene.
3. Two index files listing the files produced in 1 and 2, respectively.

In [None]:
[global]
import glob
# pandas is required below to parse the sumstat list; it was previously used without being imported
import pandas as pd
# Path to work directory where output locates
parameter: wd = path("./output")
# Containers that contains the necessary packages
parameter: container = 'gaow/twas'

# Columns: "#chr", theme1, theme2, theme3
parameter: sumstat_list_path = path

# One row per chromosome: first column is the chromosome label, remaining columns are file paths
sumstat_list = pd.read_csv(sumstat_list_path, sep = "\t")
# Row-wise list form ([chr, path_theme1, path_theme2, ...]) used by `for_each` in merge_1
sumstat_inv = sumstat_list.values.tolist()
# Theme names are every column header after the first (chromosome) column
theme = sumstat_list.columns[1:].tolist()
# Prefix shared by all output file names
name = "_".join(theme)

# Column names expected inside each individual sumstat file
parameter: chrom = "#chrom"
parameter: pos = "pos"
parameter: variant_id = "variant_id"
parameter: beta = "beta"
parameter: se = "se"


## Merge univariate association summary statistic to RDS format

In [None]:
[merge_1]
# For one chromosome (one row of the sumstat list), inner-join the per-theme summary
# statistics on (chrom, pos, variant_id) and write a single tab-delimited table whose
# effect columns are named "<beta>_<theme>" and "<se>_<theme>".
input:  for_each = "sumstat_inv"
output: f'{wd:a}/merge/{name}.{_sumstat_inv[0]}.merged.txt'
task: trunk_workers = 1, trunk_size = 20, walltime = '4h',  mem = '6G', tags = f'{step_name}_{_output:bn}'  
R: expand = "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    library("dplyr")
    library("tibble")
    library("purrr")
    library("readr")
    library("tidyr")
    ## Start
    # Join with '","' so that R receives one string per theme / path rather than
    # a single comma-separated string.
    Theme = c("$['","'.join(theme)]")
    dir = c("$['","'.join(map(str, _sumstat_inv[1:]))]")
    # Columns that identify a variant; these are the join keys and must be given as strings
    key_cols = c("$[chrom]", "$[pos]", "$[variant_id]")
    # Read one sumstat file, keep key + effect columns, and tag the effect columns with
    # the theme name up front. Renaming before the join avoids relying on the .x/.y
    # suffixes of inner_join, which collide once more than three tables are merged.
    read_theme = function(path, theme_name) {
        dat = read_delim(path, delim = "\t")
        dat = dat[, c(key_cols, "$[beta]", "$[se]")]
        names(dat) = c(key_cols, paste0("$[beta]_", theme_name), paste0("$[se]_", theme_name))
        dat
    }
    tb = tibble(Theme = Theme, dir = dir) %>% mutate(data = map2(dir, Theme, read_theme))
    # Only variants present in every theme are retained
    data = tb$data %>% reduce(inner_join, by = key_cols)
    # Write tab-delimited: the partition step reads this file with delim = "\t"
    data %>% write_delim("$[_output]", delim = "\t")
[merge_2]
# Write an index of the per-chromosome merged files: the chromosome column of the
# original sumstat list plus a `dir` column holding the path of the merged file.
input: group_by = "all"
output: f'{wd:a}/merge/{name}_sumstat_list_per_chrom'
import pandas as pd
# Keep only the first (chromosome) column and pair it with the merged file paths.
# NOTE(review): this assumes _input preserves the row order of sumstat_list (one
# merge_1 substep per row, collected in order) - confirm if substeps can be reordered.
df = sumstat_list.iloc[:, [0]].assign(
dir = [str(x) for x in _input]
)
df.to_csv(_output, sep = "\t", index = False)


In [None]:
[partition_1]
# For every region (gene) in region_list, extract the merged summary statistics that fall
# within [start - windows, end + windows] on the region's chromosome and save them as an
# RDS list with elements bhat, sbhat, Z (variants x themes matrices) and snps.
parameter: region_list = path
# Each non-empty line not starting with '#' is parsed as: chr, start, end, gene_ID.
# A header line must therefore be prefixed with '#', otherwise it is treated as a region.
with open(region_list) as region_file:
    regions = [x.strip().split() for x in region_file if x.strip() and not x.strip().startswith('#')]
# Number of base pairs added on each side of the region
parameter: windows = 5000000
input: output_from("merge_2"), for_each = "regions"
output: f'{wd}/merge/RDS/{name}_{_regions[3]}.rds'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '10G', tags = f'{step_name}_{_output:bn}'
R: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout',container = container
    library("dplyr")
    library("tibble")
    library("readr")
    library("purrr")
    library("tidyr")
    # Index of merged files: first column is the chromosome label, `dir` is the file path
    sumstat_list = read_delim("$[_input]", delim = "\t")
    # Compare as character so that both numeric ("1") and string ("chr1", "X") labels work
    chr_match = which(as.character(sumstat_list[[1]]) == "$[_regions[0]]")
    if (length(chr_match) == 0) stop("No merged summary statistics listed for chromosome $[_regions[0]]")
    sumstat_path = as.character(sumstat_list[["dir"]][chr_match[1]])
    # Region boundaries extended by the window on both sides
    region_start = $[_regions[1]] - $[windows]
    region_end = $[_regions[2]] + $[windows]
    sumstat_ftr = read_delim(sumstat_path, delim = "\t") %>%
      filter(`$[pos]` >= region_start, `$[pos]` <= region_end)
    output = list()
    # Effect columns are named "<beta>_<theme>" / "<se>_<theme>"; strip the prefix so that
    # bhat and sbhat share identical (theme) column names.
    output$bhat = as.matrix(sumstat_ftr %>% select(starts_with("$[beta]_")) %>% rename_all(~sub("$[beta]_", "", ., fixed = TRUE)))
    rownames(output$bhat) = as.character(sumstat_ftr[["$[variant_id]"]])
    output$sbhat = as.matrix(sumstat_ftr %>% select(starts_with("$[se]_")) %>% rename_all(~sub("$[se]_", "", ., fixed = TRUE)))
    rownames(output$sbhat) = as.character(sumstat_ftr[["$[variant_id]"]])
    output$Z = output$bhat / output$sbhat
    ### remove all the NA,nan,Inf sumstat
    # is.finite() is FALSE for NA, NaN and +/-Inf; a variant is dropped if any theme is non-finite
    keep_index = which(rowSums(!is.finite(output$Z)) == 0)
    # drop = FALSE keeps the matrix shape (and rownames) even when a single row or column remains
    output$bhat = output$bhat[keep_index, , drop = FALSE]
    output$sbhat = output$sbhat[keep_index, , drop = FALSE]
    output$Z = output$Z[keep_index, , drop = FALSE]
    output$snps = rownames(output$Z)
    output %>% saveRDS("$[_output]")

In [None]:
[partition_2]
# Write an index of the per-gene RDS files produced by partition_1: one file name per line.
input: group_by = "all"
output: f'{wd}/merge/RDS/{name}.sumstat_list'
python: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    # $[_input:br,] expands to a comma-separated list of quoted base names of all inputs.
    # The separator must be a tab ("\t"); using the letter "t" would make pandas quote
    # every file name that contains a "t".
    pd.DataFrame({"analysis_unit" : [$[_input:br,]]}).to_csv("$[_output]",index = False ,header = False, sep = "\t")