# PTWAS Implementation in R

In [None]:
[global]
# Workflow-wide parameters shared by every step below.
# Workdir
parameter: cwd = path("output")
# susie_table is the table of eQTL fine mapped results, which has two columns for gene and susie_fils
parameter: susie_table = ""
# This vcf is derived from the conversion of the susie rds for each gene, with the relevant information noted in the INFO column.
#parameter: out_vcf = ""
# file_table is the table of GWAS fine mapped results, which has two columns for LD blocks and susie_fils
parameter: file_table = ""
# out_file is a temporary file in the environment
parameter: out_file = ""
# the prefix of fastENLOC output 
parameter: out_pre = ""
# the zipped file of out_vcf, which is derived from the conversion of the susie rds for each gene
# NOTE(review): this parameter is unconditionally overwritten further down in
# this section, so a value supplied by the user has no effect.
parameter: eqtl_vcf = ""
# dataset 
parameter: tissue = ''
# QTL data type
parameter: QTL = 'eQTL'
parameter: container = ''
# For a '.sif' container, run commands through micromamba in an environment
# named after the image file (basename without the '.sif' suffix).
parameter: entrypoint={('micromamba run -n' + ' ' + container.split('/')[-1][:-4]) if container.endswith('.sif') else f''}
# Resources requested for each task
parameter: job_size = 1
parameter: walltime = "5h"
parameter: mem = "8G"
parameter: numThreads = 1
eqtl_vcf = file_target(f"{cwd:a}/{QTL}.susie_to_DAPG.vcf.gz")
import os
# exist_ok=True replaces the separate existence check, so this cannot fail if
# another process creates the directory between the check and the call.
os.makedirs(f'{cwd}/cache/', exist_ok=True)

## SuSiE weight 

TODO: clarify what "weight" means here — is it `mu`, i.e. the posterior mean?

In [None]:
[susie_get_weight]
# NOTE(review): the R action below has no script body in this notebook, and
# this step declares the same output file as [ptwas_scan_1]
# (cache/input_dataframe.txt). It looks like an unfinished placeholder --
# confirm before running both steps in one workflow.
input: ptwas_weights, gwas_path, region_list
output: f'{cwd}/cache/input_dataframe.txt'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand= "${ }", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout'


## PTWAS scan
This portion contains code for running the PTWAS scan as implemented in GAMBIT. 



### Input

- eQTL Weights
    File containing the eQTL weights (format still to be decided); tentatively, column 1 is the SNP ID and column 2 is the weight.
- GWAS Z-Scores
    File that contains GWAS z-scores (or what makes up the z-scores). Column 1 is SNP, column 2 can be z-scores.
- LD reference
- region list


### Output

Same output as GAMBIT

In [None]:
[ptwas_scan_1]
# First scan step: the R script defines the allele-matching, burden-test and
# p-value-combination helpers, then merges the eQTL weights, the GWAS summary
# statistics and the region list into one table written to
# cache/input_dataframe.txt.
input: ptwas_weights, gwas_path, region_list
output: f'{cwd}/cache/input_dataframe.txt'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand= "${ }", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout'
    # tidyverse supplies read_delim/write_delim, mutate/arrange and the %>% pipe
    library(tidyverse)
    # harmonicmeanp supplies pLandau(), used by pval_HMP()
    library(harmonicmeanp)
    # Compute a sign modifier for every GWAS variant so that eQTL weights can be
    # expressed relative to the GWAS effect allele.
    #
    # Args:
    #   gwas_ids:   character vector of GWAS variant IDs, "chrom:pos:ref:alt".
    #   weight_ids: character vector of weight variant IDs in the same format.
    #
    # Returns:
    #   Numeric vector with exactly one entry per element of gwas_ids:
    #     1  alleles agree (directly or after strand flip),
    #    -1  REF/ALT are swapped (directly or after strand flip),
    #     0  alleles are incompatible, or no weight variant shares the position.
    #
    # Only the position field is compared (not the chromosome), and the first
    # weight variant found at a matching position decides the result.
    handle_weights <- function(gwas_ids, weight_ids) {
        # Parse the weight IDs once; they do not change between GWAS variants.
        weight_fields <- strsplit(weight_ids, ':')
        POSs <- as.double(vapply(weight_fields, '[[', character(1), 2))
        REFs <- vapply(weight_fields, '[[', character(1), 3)
        ALTs <- vapply(weight_fields, '[[', character(1), 4)

        vapply(
            strsplit(gwas_ids, ':'),
            function(x) {
                gwas_pos <- as.double(x[2])
                # seq_along() is safe when there are no weight variants at all.
                for (i in seq_along(REFs)) {
                    if (gwas_pos == POSs[i]) {
                        if (x[3] == REFs[i] && x[4] == ALTs[i]) {
                            return(1)
                        } else if (x[3] == ALTs[i] && x[4] == REFs[i]) {
                            return(-1)
                        } else if (x[3] == aflip(REFs[i]) && x[4] == aflip(ALTs[i])) {
                            return(1)
                        } else if (x[3] == aflip(ALTs[i]) && x[4] == aflip(REFs[i])) {
                            return(-1)
                        } else {
                            return(0)
                        }
                    }
                }
                # No weight variant at this position: return 0 explicitly.
                # Falling off the loop would yield NULL, which would be dropped
                # from the result and misalign modifiers with the weights.
                0
            },
            numeric(1))
    }

    # Append a human-readable dump of one burden test to a debug file.
    #
    # Args:
    #   gwas_ids:  character vector of variant IDs, "chrom:pos:ref:alt".
    #   modifiers: per-variant sign modifiers.
    #   weights:   per-variant weights as passed by the caller.
    #   zscores:   per-variant GWAS z-scores.
    #   stat:      test score to report.
    #   denom:     test variance to report.
    #   output:    path of the file that everything is appended to.
    debug_print <- function(gwas_ids, modifiers, weights, zscores, stat, denom, output) {
        id_fields <- strsplit(gwas_ids, ':')
        field <- function(k) sapply(id_fields, '[[', k)
        # Every text section is appended to the same file with the same options.
        append_text <- function(text) {
            cat(text, file = output, sep = "\n", append = TRUE)
        }

        # One row per variant, ordered by position.
        snpdf <- arrange(
            data.frame(
                CHROM = field(1),
                POS = as.double(field(2)),
                VARIANT = gsub(":", "_", gwas_ids),
                REF = field(3),
                ALT = field(4),
                MODIFIERS = modifiers,
                WEIGHTS = weights,
                ZSCORES = zscores),
            POS)

        append_text("\n\nSNP info:\n\n")
        write_delim(
            snpdf,
            output,
            append = TRUE,
            col_names = TRUE,
            quote = "none")

        append_text("\n\nmodifiers:\n\n")
        append_text(paste(snpdf$MODIFIERS, collapse = " "))
        append_text("\n\nweights:\n\n")
        append_text(paste(snpdf$WEIGHTS, collapse = " "))
        append_text("\n\nz-stats:\n\n")
        append_text(paste(snpdf$ZSCORES, collapse = " "))
        append_text(paste0("\n\ntest score = ", stat, "\n\n"))
        append_text(paste0("\n\ntest variance = ", denom, "\n\n"))
    }

    # Weighted burden test for one group of variants.
    #
    # Args:
    #   weight_ids: variant IDs of the eQTL weights.
    #   gwas_ids:   variant IDs of the GWAS records (same length as zscores).
    #   weights:    eQTL weights.
    #   zscores:    GWAS z-scores.
    #
    # Returns:
    #   Chi-square (1 df) upper-tail p-value of the combined z-score.
    #
    # Side effect: appends a dump of the inputs to
    # paste0(output_base, "ptwas-scan.debug"); output_base is not defined in
    # this function and must exist in the calling environment.
    burden <- function(weight_ids, gwas_ids, weights, zscores) {
        modifiers <- handle_weights(gwas_ids, weight_ids)
        # Flip or zero each weight according to its allele-matching modifier.
        signed_weights <- modifiers * weights

        stat <- sum(signed_weights * zscores)
        denom <- sum(signed_weights^2)

        zscore <- if (length(zscores) == 1) {
            # Single-variant case: use the GWAS z-score itself, signed by the weight.
            if (signed_weights[[1]] < 0) -zscores[[1]] else zscores[[1]]
        } else {
            stat / sqrt(denom)
        }

        pval <- pchisq(zscore^2, df = 1, lower.tail = FALSE)

        debug_file <- paste0(output_base, "ptwas-scan.debug")
        debug_print(gwas_ids, modifiers, signed_weights, zscores, stat, denom, debug_file)

        pval
    }

    # Combine p-values with the harmonic mean p-value (HMP) procedure.
    #
    # Duplicated p-values are collapsed first. The harmonic mean of the
    # remaining values is then converted to a p-value through the upper tail
    # of a Landau distribution, see
    # https://search.r-project.org/CRAN/refmans/harmonicmeanp/html/pLandau.html
    #
    # Args:
    #   pvals: numeric vector of p-values.
    # Returns:
    #   The combined p-value.
    pval_HMP <- function(pvals) {
        distinct_p <- unique(pvals)
        n_tests <- length(distinct_p)
        harmonic_mean <- n_tests / sum(distinct_p^-1)

        # Landau location offset and scale constants.
        landau_offset <- 0.874367040387922
        landau_scale <- 1.5707963267949

        pLandau(
            1 / harmonic_mean,
            mu = log(n_tests) + landau_offset,
            sigma = landau_scale,
            lower.tail = FALSE)
    }

    # Standard Cauchy distribution function.
    #
    # NOTE: this definition masks stats::pcauchy inside the script.
    #
    # Args:
    #   x:          numeric vector of quantiles.
    #   lower.tail: if TRUE (default) return P(X <= x), otherwise P(X > x).
    pcauchy <- function(x, lower.tail = TRUE) {
        if (lower.tail) {
            0.5 + atan(x)/pi
        } else {
            # Computed directly rather than as 1 - p to keep small tails accurate.
            0.5 - atan(x)/pi
        }
    }

    # Standard Cauchy quantile function (lower tail).
    #
    # NOTE: this definition masks stats::qcauchy inside the script.
    qcauchy <- function(q) {
        return(tan(pi*(q - 0.5)))
    }

    # Combine p-values with the aggregated Cauchy association test (ACAT),
    # using equal weights.
    #
    # Each p-value is mapped to a standard Cauchy quantile, the quantiles are
    # averaged, and the average is mapped back to a probability. The quantile
    # transform is lower-tail, so the back-transform must be lower-tail too:
    # small input p-values then give a small combined p-value.
    #
    # Args:
    #   pvals: numeric vector of p-values.
    # Returns:
    #   The combined p-value; the single input unchanged when length(pvals) is
    #   1; NA_real_ when pvals is empty.
    pval_ACAT <- function(pvals) {
        if (length(pvals) == 0) {
            return(NA_real_)
        }
        if (length(pvals) == 1) {
            # R vectors are 1-indexed; pvals[0] would be an empty vector.
            return(pvals[1])
        }

        stat <- sum(qcauchy(pvals))

        return(pcauchy(stat/length(pvals)))
    }


    # Gene-level p-value across tissues.
    #
    # Args:
    #   pvals:       numeric vector of per-tissue p-values (duplicates allowed;
    #                one distinct value per tissue is expected).
    #   comb_method: "HMP" selects pval_HMP(); any other value selects pval_ACAT().
    #   naive:       if TRUE, return the minimum p-value multiplied by the number
    #                of distinct p-values, capped at 1, instead of the combined
    #                p-value.
    # Returns:
    #   A single p-value.
    globalPvalue <- function(pvals, comb_method = "HMP", naive=FALSE) {
        smallest <- min(pvals)
        n_distinct <- length(unique(pvals))

        # The combined p-value is always computed, even when naive = TRUE.
        combined <- if (comb_method == "HMP") {
            pval_HMP(pvals)
        } else {
            pval_ACAT(pvals)
        }
        corrected_min <- min(n_distinct * smallest, 1.0)

        if (naive) corrected_min else combined
    }

            
    # Build an allele-orientation-independent key for each variant ID.
    #
    # For an ID "chrom:pos:ref:alt" the key lists all four equivalent allele
    # encodings (as given, strand-flipped, swapped, swapped and strand-flipped),
    # ordered by their first allele, so that any of those encodings of the same
    # variant produces the same key.
    #
    # Args:
    #   variant: character vector of variant IDs, "chrom:pos:ref:alt".
    # Returns:
    #   Character vector of keys
    #   "chrom:pos:<first alleles joined by |>:<second alleles joined by |>".
    generate_index <- function(variant) {
        build_key <- function(fields) {
            ref <- fields[3]
            alt <- fields[4]
            ref_flipped <- aflip(ref)
            alt_flipped <- aflip(alt)

            # Parallel vectors: element k of each describes one encoding.
            first_alleles <- c(ref, ref_flipped, alt, alt_flipped)
            second_alleles <- c(alt, alt_flipped, ref, ref_flipped)
            by_first <- order(first_alleles)

            paste(
                c(
                    fields[1],
                    fields[2],
                    paste(first_alleles[by_first], collapse = "|"),
                    paste(second_alleles[by_first], collapse = "|")
                ),
                collapse = ":"
            )
        }

        unlist(lapply(strsplit(variant, ':'), build_key))
    }
            
    # Return the complementary base of a single-nucleotide allele.
    #
    # Args:
    #   allele: a single string.
    # Returns:
    #   "T", "G", "A" or "C" for "A", "C", "T" or "G" respectively, and the
    #   empty string for anything else.
    aflip <- function(allele) {
        if (allele == "A") return("T")
        if (allele == "C") return("G")
        if (allele == "T") return("A")
        if (allele == "G") return("C")
        ""
    }

    # Load the eQTL weights and attach the orientation-independent variant key.
    ptwas_weights <- read_delim(
        "${ptwas_weights}",
        delim = "\t",
        show_col_types = FALSE) %>% 
        mutate(
            uber_id = generate_index(variant))
    # Load the GWAS summary statistics; "##" header lines are skipped, ZSCORE is
    # renamed to Z and SNP IDs are converted from "_" to ":" separators before
    # the key is built.
    gwas <- read_delim(
        "${gwas_path}", delim = "\t", comment = "##", show_col_types = FALSE) %>%
        rename(Z = ZSCORE) %>%
        mutate(
            SNP_ID = gsub("_", ":", SNP_ID),
            uber_id = generate_index(SNP_ID))

    # NOTE(review): the step's input statement names this variable region_list,
    # while the path interpolated here is regionlist_path -- confirm which
    # workflow variable is intended.
    regionlist <- read_delim(
        "${regionlist_path}",
        delim = "\t",
        show_col_types = FALSE)
    colnames(regionlist) <- c("#CHR", "start", "end", "gene_id", "gene_name")

    # Inner joins: keep only variants present in both weights and GWAS, and only
    # genes present in the region list.
    input_dataframe <- merge(
        regionlist,
        merge(ptwas_weights, gwas, by = c("uber_id"), all = FALSE),
        by = c("gene_id"), all = FALSE)

    # Write tab-separated output: the next step reads this file with a tab
    # separator, whereas write.table() separates fields with a space by default.
    write.table(
        input_dataframe,
        "${_output}",
        sep = "\t",
        col.names = TRUE,
        row.names = FALSE,
        quote = FALSE)
    



In [None]:
[ptwas_scan_2]
# Second scan step: run the per-gene burden tests one chromosome at a time and
# write one result table per chromosome.
import pandas as pd
# Extract unique values, remove 'chr', convert to int, sort, and then add 'chr' back
# NOTE(review): the int() conversion only supports numeric chromosome labels.
input_df = file_target(f'{cwd}/cache/input_dataframe.txt')
input_dataframe = pd.read_csv(input_df, sep='\t')
chroms = ["chr" + str(x) for x in sorted([int(chrom.replace("chr", "")) for chrom in input_dataframe["#CHR.x"].unique()])]

input: chroms, group_by = 1
output: f'{cwd}/{tissue}.{_input}.ptwas.output'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand= "${ }", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout'
    # This action runs as its own R script, so the packages providing
    # read_delim/write_delim, the dplyr verbs and %>% must be loaded here.
    library(tidyverse)

    # NOTE(review): generate_index(), burden(), globalPvalue() and the
    # input_dataframe object are used below but are not defined anywhere in
    # this script -- they need to be sourced/loaded before this step can run.

    # Read the LD reference VCF for the current chromosome.
    LDRef <- read_delim(
        paste0("${vcf_path_prefix}", tolower("${_input}"), "${vcf_path_suffix}"),
        delim = "\t",
        comment = "##",
        show_col_types = FALSE)

    # Build "chrom:pos:ref:alt" IDs and the matching orientation-independent key.
    LDRef$FORM_ID <- paste0(tolower("${_input}"), ':', LDRef$POS, ':', LDRef$REF, ':', LDRef$ALT)
    LDRef <- LDRef %>% 
        mutate(uber_id = generate_index(FORM_ID))

    # Keep variants present in the LD reference, compute one burden p-value per
    # gene/tissue pair, then combine the tissue p-values per gene.
    results <- input_dataframe %>%
        filter(uber_id %in% LDRef$uber_id) %>%
        mutate(weight = as.double(weight)) %>%
        group_by(gene_id, tissue) %>%
        mutate(nsnps = length(variant), burden_pval = burden(variant, SNP_ID, weight, Z)) %>%
        ungroup() %>%
        group_by(gene_id) %>%
        mutate(
            global_pval = globalPvalue(burden_pval, comb_method = "HMP", naive=FALSE),
            naive_pval = globalPvalue(burden_pval, comb_method = "HMP", naive=TRUE),
            min_pval = min(burden_pval)) %>%
        ungroup()
    
    # One row per gene/tissue pair. The output path uses the workflow's
    # dollar-brace interpolation; without the dollar sign the results went to a
    # file literally named "{_output}".
    write_delim(
        results %>%
            subset(select=c("gene_id", "tissue", "nsnps", "burden_pval", "global_pval", "naive_pval", "min_pval")) %>%
            distinct(gene_id, tissue, .keep_all = TRUE),
        "${_output}",
        delim = "\t",
        append = FALSE,
        quote = "none")
    

## PTWAS validation 
The procedures for the PTWAS estimation and model validation that were originally implemented in Perl are now reimplemented in this R script. The unit of analysis is a single gene-trait pair. This notebook tracks how I generate the units of analysis and carry out the validation procedure.



### Input

- Susie objects (rds files)
- 


### Output



In [None]:
[ptwas_est]