# Association result processing
This notebook analyzes the results of an association scan based on merged VCF files.
See https://github.com/cumc/xqtl-pipeline/issues/215 for details.
Input: 
   - list of vcf

Output: 
   - list of vcf with adjusted pvalue
   - a table summarizing the number of significant findings after multiple testing correction
   - a table of the adjusted best nominal p-value for each gene.

In [None]:
sos run pipeline/assoc_result_processing.ipynb genome \
    --vcf  `ls output/data_intergration/TensorQTL/*merged.vcf.gz` \
    --padjust-method "bonferroni"  

In [None]:
[global]
# List of VCF files to analyze; ideally one per chromosome, e.g. from `ls *.merged.vcf.gz` in the output folder of the sumstat merger
parameter: vcf = paths
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Value passed as the `container` option of the R actions in the steps below; defaults to an empty string
parameter: container = ""

In [1]:
[per_chrom]
# Method handed to p.adjust() for the per-gene multiple testing correction of SNP p-values, e.g. "bonferroni" or "fdr"
parameter: padjust_method = "fdr"
input: for_each = "vcf"
output: rds = f'{_vcf:nn}.rds', processed_vcf = f'{_vcf:nn}.p_adjusted.vcf.bgz'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container
   library("dplyr")
   # Read VCF
   data = VariantAnnotation::readVcf("$[_vcf]")
   # Column names of the P matrix; every across() below operates on exactly these columns
   p_cols = colnames(VariantAnnotation::geno(data)$P)
   # Smallest non-missing value, or NA when every value is missing, so that a
   # single missing p-value does not turn the whole gene-level minimum into NA
   min_or_na = function(x) if (all(is.na(x))) NA_real_ else min(x, na.rm = TRUE)
   # Extract p-val: one row per VCF record, GENE taken from the INFO field,
   # ID is the record name with everything up to the first ":" removed
   pval = VariantAnnotation::geno(data)$P %>%
     as_tibble(rownames = "ID") %>%
     mutate(GENE = VariantAnnotation::info(data)$GENE, ID = sub("[^:]*:", "", ID))
   # Adjust p-values within each gene with the requested p.adjust() method
   pval_adj = pval %>%
     group_by(GENE) %>%
     mutate(across(all_of(p_cols), ~ p.adjust(.x, "$[padjust_method]")))
   ## Get the number of sig association (adjusted p < 0.05) per P column
   n_asso = (pval_adj %>% ungroup %>% select(-ID, -GENE) < 0.05) %>%
     colSums(na.rm = TRUE) %>% t %>% as_tibble
   ## Get the number of sig snps: a SNP counts once per column if its smallest
   ## adjusted p-value over all genes is < 0.05 (missing values are replaced by 9 first)
   n_snp = (pval_adj %>%
              ungroup %>%
              group_by(ID) %>%
              summarize(across(all_of(p_cols), ~ min(coalesce(.x, 9)))) %>%
              select(-ID) < 0.05) %>%
     colSums(na.rm = TRUE) %>% t %>% as_tibble
   ## Get the top snps for each gene
   # Smallest adjusted p-value per gene and column
   pval_adj_gene = pval_adj %>%
     group_by(GENE) %>%
     summarize(across(all_of(p_cols), min_or_na))
   # IDs of the SNP(s) reaching that minimum. The positions returned by which()
   # are relative to the current gene, so they must index the gene's own ID
   # vector and not the full pval_adj table.
   pval_adj_gene_snps = pval_adj %>%
     group_by(GENE) %>%
     summarize(across(all_of(p_cols), ~ list(tibble(ID = ID[which(.x == min_or_na(.x))]))))
   output = list("n_asso" = n_asso, "n_snp" = n_snp, "pval_adj_gene" = pval_adj_gene, "pval_adj_gene_snps" = pval_adj_gene_snps)
   saveRDS(output, "$[_output[0]]")
   # Edit the VCF and output it
   # Declare the new P_ADJ FORMAT field in the header
   VariantAnnotation::geno(VariantAnnotation::header(data))["P_ADJ",] = list("1","Float","P value adjusted by $[padjust_method] among each genes")
   # Create P_ADJ as a copy of P, then fill it with the adjusted values
   # (the grouped mutate() above keeps the rows in their original order)
   VariantAnnotation::geno(data)$P_ADJ = VariantAnnotation::geno(data)$P
   VariantAnnotation::geno(data)$P_ADJ = pval_adj %>% ungroup %>% select(-ID, -GENE) %>% as.matrix
   # Record the per-column counts in the header, joined by ":"
   VariantAnnotation::meta(VariantAnnotation::header(data))$n_sig_asso = paste0(n_asso, collapse = ":")
   VariantAnnotation::meta(VariantAnnotation::header(data))$n_sig_snp = paste0(n_snp, collapse = ":")
   VariantAnnotation::writeVcf(data, "$[_output[1]:n]", index = TRUE)

In [None]:
[genome]
input: output_from("per_chrom")["rds"], group_by = "all"
output: f'{_input[0]:nn}.n_sig.txt',f'{_input[0]:nn}.gene_p_adj.txt'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container
   library("dplyr")
   library("readr")
   library("tidyr")
   library("purrr")
   # Read the rds summaries produced by per_chrom, one list-column entry per input file
   input_ls = list($[_input:r,])
   data = tibble(input_ls) %>%
     mutate(rds = map(input_ls, ~ read_rds(.x)),
            n_assoc = map(rds, ~ .x$n_asso),
            n_snp = map(rds, ~ .x$n_snp),
            pval_adj_gene = map(rds, ~ .x$pval_adj_gene))
   # Total number of significant association, snps, and genes
   # Sum the per-file counts column by column
   n_assoc = data %>% select(n_assoc) %>% unnest("n_assoc") %>% colSums(na.rm = TRUE)
   n_snp = data %>% select(n_snp) %>% unnest("n_snp") %>% colSums(na.rm = TRUE)
   # Stack the per-gene minimum p-values of all files and apply an FDR
   # correction across genes to every column except GENE.
   # The columns are selected with -GENE because n_assoc is a plain named
   # vector: colnames(n_assoc) is NULL and would select no column at all.
   pval_adj_gene_adj = data %>%
     select(pval_adj_gene) %>%
     unnest("pval_adj_gene") %>%
     mutate(across(-GENE, ~ p.adjust(.x, "fdr")))
   # Number of genes with an FDR-adjusted p-value < 0.05 per column
   n_gene = (pval_adj_gene_adj %>% select(-GENE) < 0.05) %>% colSums(na.rm = TRUE)
   # One row per column of the p-value tables, labelled "tissue"
   n_sum = cbind(n_assoc, n_snp, n_gene) %>% as_tibble(rownames = "tissue")
   n_sum %>% write_delim("$[_output[0]]", delim = "\t")
   pval_adj_gene_adj %>% write_delim("$[_output[1]]", delim = "\t")