# Processing cTWAS Input Files
This pipeline performs tasks including the formatting and compiling of inputs such as TWAS weights, GWAS summary statistics, and LD matrices, which are essential for running ctwas-multigroup as part of the pecotmr pipeline integration. 

### Resource-Intensive Steps
- Formatting LD files 
- Compiling and formatting weight files. 

### Input
- Summary table: Produced by the twas_sparse pipeline, this meta file summarizes gene-context imputability, the best method identified in the computation, and the selected variants.
- LD meta file: We assume that the LD matrices are located in the same directory as the LD meta file, with both the ld_meta_file and the chrX folders organized at the same hierarchical level. 
- GWAS Summary Statistics: This file is post-quality control (QC) and harmonized against LD, including columns for `chrom`, `A1`, `A2`, `variant_id`, `z`. 
    - The `variant_id` represents the harmonized ref and alt alleles after QC, formatted as {int(chr):pos:A2:A1}, where A2 is the reference allele and A1 is the alternate allele.
    - Allele QC and harmonization are performed by Haochen Sun. 

### Output
- z_snp (dataframe): A dataframe containing GWAS summary statistics with header of `id`,`A1`,`A2`,`z`. 
- weights (list): A list of weights for each gene-context pair.
- region_info (dataframe): A meta file for formatted LD files.

### Example Command

In [None]:
sos run xqtl-pipeline/code/pecotmr_integration/ctwas_input_processing.ipynb \
    --cwd /mnt/vast/hpc/csg/cl4215/mrmash/workflow/ctwas_test \
    --ld_meta_data /mnt/vast/hpc/csg/data_public/20240409_ADSP_LD_matrix/ld_test.tsv \
    --gwas_sumstat /mnt/vast/hpc/csg/cl4215/mrmash/workflow/genes_300/ctwas/gwas_wg.tsv \
    --summary_table /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/pipeline/sparse/twas_sparse/TADB_enhanced_cis.coding.summary_table.tsv \
    -s build  

In [None]:
[global]
# Modules used by the parameter defaults and the setup code below. `re` has to
# be imported before the `entrypoint` default is evaluated, because that
# expression calls `re.sub` whenever a container is supplied.
import os
import re
import pandas as pd

# Parameter definitions
# Working directory; all outputs of this workflow are written beneath it
parameter: cwd = path("output/")
# Container image passed to the R actions (empty string: no container)
parameter: container = ""
# Command prefix derived from the container file name with its
# `_apptainer:latest` / `_docker:latest` / `.sif` suffix removed
parameter: entrypoint = ('micromamba run -a "" -n' + ' ' + re.sub(r'(_apptainer:latest|_docker:latest|\.sif)$', '', container.split('/')[-1])) if container else ""
parameter: customized_association_windows = path()
# LD meta file; the LD matrices are looked up relative to its directory
parameter: ld_meta_data = path()
# Summary table with `study` and `gene` columns
parameter: summary_table = path()
# GWAS summary statistics with `variant_id` and `z` columns
parameter: gwas_sumstat = path()
# Optional weights file forwarded to get_ctwas_input (unset: NULL is passed)
parameter: weight_input = path()
# Resource settings shared by every `task:` statement in this workflow
parameter: job_size = 10
parameter: walltime = "15h"
parameter: mem = "35G"
parameter: numThreads = 2
ld_outdir = f"{cwd}/LD"
temp_dir = f"{cwd}/temp"
# Ensure the LD and temporary output directories exist. `exist_ok=True` keeps
# this safe when several substeps evaluate this section at the same time.
os.makedirs(ld_outdir, exist_ok=True)
os.makedirs(temp_dir, exist_ok=True)

In [None]:
[default]
# Entry point: first run the three formatting workflows (LD, GWAS, weights),
# then run Variants_Update, which reads the files those workflows write.
sos_run(['format_ld', 'format_gwas', 'format_weight'])
sos_run(['Variants_Update'])

In [None]:
[format_ld_1]
# Build one region label per row of the LD meta file, shaped chrXX_start_end
ld_meta = pd.read_csv(ld_meta_data, sep="\t")
region = [f"{rec['#chrom']}_{rec['start']}_{rec['end']}" for rec in ld_meta.to_dict(orient='records')]
input: ld_meta_data, for_each='region', group_by=1
task: trunk_workers = 5, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads
R: expand='${ }', stdout = f"{ld_outdir}/region_info_{_region}.stdout", stderr = f"{ld_outdir}/region_info_{_region}.stderr", container = container, entrypoint = entrypoint    
    library(pecotmr)
    cat("Formatting LD region: ${_region} ...\n")
    # The matrix is expected at <LD meta dir>/chrXX/chrXX_start_end.cor.xz
    ld_root <- dirname("${ld_meta_data}")
    chrom_dir <- gsub("_.*$", "", "${_region}")
    region <- paste0(ld_root, "/", chrom_dir, "/${_region}.cor.xz")
    format_ctwas_ld(region, "${ld_outdir}")
    # Delete any existing consolidated region table so that it is regenerated
    # every time the pipeline processes LD regions.
    meta_file_path <- paste0('${ld_outdir}', "/LD_region_info.txt")
    if (file.exists(meta_file_path)) file.remove(meta_file_path)

In [None]:
[format_ld_2]
# Consolidate the processed LD information into a single tab-separated file
depends: dynamic(f"{ld_outdir}/region_info_*.stdout")  # wait for the stdout files written by the format_ld_1 tasks
input: group_by = 'all'
output: f"{ld_outdir}/LD_region_info.txt"
R: expand='${ }', container=container, entrypoint=entrypoint
    library(pecotmr)
    # Let pecotmr summarise everything found under the LD output directory
    ld_region_table <- get_dir_region_info('${ld_outdir}')
    write.table(
      ld_region_table,
      file = ${_output:r},
      sep = "\t",
      quote = FALSE,
      row.names = FALSE,
      col.names = TRUE
    )

In [None]:
[format_weight_1]
# Read the summary table and schedule one substep per study/gene pair
summary_df = pd.read_csv(summary_table, sep='\t')
# Create an identifier by concatenating the 'study' and 'gene' columns
summary_df['study_gene'] = summary_df['study'] + ':' + summary_df['gene']
# Keep each pair once (first-seen order): the R code below already selects every
# row of a pair, so repeated pairs would only re-create the same output file.
study_gene_indices = summary_df['study_gene'].drop_duplicates().tolist()

input: for_each='study_gene_indices'
output: f"{temp_dir}/output_{_study_gene_indices}.rds"
task: trunk_workers = 4, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads
R: expand='${ }', stdout = f"{temp_dir}/weight_processing_{_index}.stdout", stderr = f"{temp_dir}/weight_processing_{_index}.stderr", container = container, entrypoint = entrypoint  
    library(pecotmr)
    # Recover the analysis unit from the "study:gene" label of this substep
    unit <- strsplit('${_study_gene_indices}', ':')[[1]]
    study <- unit[1]
    gene <- unit[2]
    summary_df <- read.table('${summary_table}', sep="\t", header=TRUE)
    # All summary rows that belong to this study/gene pair
    summary_report_unit <- summary_df[summary_df$study==study & summary_df$gene == gene, ]
    # weights_input_file is NULL when no weight file was given; otherwise the
    # path is emitted as a quoted R string (a bare path is not valid R code).
    # auto_save=FALSE: the result is written explicitly with saveRDS below.
    processed_result <- get_ctwas_input(summary_report_unit, outdir=NULL, outname=NULL, 
                  weights_input_file = ${"NULL" if weight_input=='.' or weight_input =='' else repr(str(weight_input))}, auto_save=FALSE)
    # save to temporary file 
    saveRDS(processed_result, ${_output:r})
    
    # On re-runs, remove the previously merged weights so that the merged file
    # is rebuilt from the refreshed per-unit results.
    merged_weight_file <- paste0("${cwd}","/merged_weights_list.rds")
    if (file.exists(merged_weight_file)){
      # Remove the file
      file.remove(merged_weight_file)
    }

In [None]:
[format_weight_2]
# Merge every per-unit RDS file in the temp directory into one weights list
depends: sos_step('format_weight_1')
input: f"{temp_dir}/output_*.rds", group_by='all'
output: f"{cwd}/merged_weights_list.rds"
task: trunk_workers = 4, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads
R: expand='${ }', container=container, entrypoint=entrypoint
    library(purrr)

    # Recursively merge two nested lists by element name
    merge_nested_lists <- function(list1, list2) {
      # Anything that is not a pair of lists is simply concatenated
      if (!(is.list(list1) && is.list(list2))) {
        return(c(list1, list2))
      }

      names1 <- names(list1)
      names2 <- names(list2)
      shared <- intersect(names1, names2)
      only_in_1 <- setdiff(names1, names2)
      only_in_2 <- setdiff(names2, names1)

      # Elements present on both sides are merged recursively, keeping names
      merged <- map(shared, function(nm) merge_nested_lists(list1[[nm]], list2[[nm]]))
      names(merged) <- shared

      # Elements present on one side only are carried over under their names
      merged <- c(merged, setNames(list1[only_in_1], only_in_1), setNames(list2[only_in_2], only_in_2))

      # 'weights_list' entries are concatenated rather than merged recursively
      if ("weights_list" %in% shared) {
        joined <- c(list1[["weights_list"]], list2[["weights_list"]])
        names(joined) <- c(names(list1[["weights_list"]]), names(list2[["weights_list"]]))
        merged$weights_list <- joined
      }

      merged
    }

    # Read every RDS file and fold the results together from left to right
    merge_all_rds <- function(file_paths) {
      Reduce(merge_nested_lists, lapply(file_paths, readRDS))
    }

    file_paths <- c(${_input:r,})
    merged_list <- merge_all_rds(file_paths)
    # Collect the weights_list of chr1..chr22 from the first top-level element
    first_entry <- merged_list[[1]]
    per_chrom <- lapply(paste0("chr", 1:22), function(chrom) first_entry[[chrom]][["weights_list"]])
    weights <- do.call(c, per_chrom)
    # Keep the first occurrence of every name
    keep <- !duplicated(names(weights))
    weights <- weights[keep]
    saveRDS(weights, ${_output:r})

In [None]:
[format_gwas]
# Reformat the GWAS summary statistics into a table with columns id, A1, A2, z
input: gwas_sumstat
output: f"{cwd}/gwas_z_snp.tsv"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads
R: expand = '${ }', stdout = f"{cwd}/gwas_z_snp.stdout", stderr = f"{cwd}/gwas_z_snp.stderr", container = container, entrypoint = entrypoint    
    z_snp <- read.table(${_input:r}, sep="\t", header=TRUE)
    # variant_id is chr:pos:A2:A1. Split every id once and take the alleles by
    # position; vapply keeps the result a character vector even for an empty
    # table, and a missing field yields NA instead of an error.
    id_parts <- strsplit(as.character(z_snp$variant_id), ":", fixed = TRUE)
    z_snp$A2 <- vapply(id_parts, function(p) p[3], character(1))
    z_snp$A1 <- vapply(id_parts, function(p) p[4], character(1))
    z_snp <- z_snp[, c("variant_id", "A1", "A2", "z")]
    colnames(z_snp)[1] <- "id"
    # Prefix the ids with "chr"
    z_snp$id <- paste0("chr", z_snp$id)
    write.table(z_snp, ${_output:r}, sep="\t", quote=FALSE, row.names=FALSE)
    # On re-runs, remove the previously filtered table so that it is rebuilt
    # from the file written above.
    if (file.exists('${_output:n}_updated.tsv')){
      # Remove the file
      file.remove('${_output:n}_updated.tsv')
    }
In [None]:
[Variants_Update]
# Restrict z_snp to the variants present in the LD reference and verify that
# every weight variant is still covered afterwards.
depends: sos_step('format_weight_1'),sos_step('format_weight_2'), sos_step('format_ld_1'), sos_step('format_ld_2'), sos_step('format_gwas')
input: f"{cwd}/gwas_z_snp.tsv", f"{cwd}/merged_weights_list.rds", f"{ld_outdir}/LD_region_info.txt", group_by='all'
output: f"{cwd}/gwas_z_snp_updated.tsv"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads
R: expand = '${ }', stdout = f"{cwd}/z_snp_variant_update.stdout", stderr = f"{cwd}/z_snp_variant_update.stderr", container = container, entrypoint = entrypoint 

    # Read ids and alleles as text: with type guessing, an allele column that
    # holds only "T" (or only "F") would be converted to logical and written
    # back as TRUE/FALSE.
    z_snp <- read.table('${_input[0]}', sep="\t", header=TRUE,
                        colClasses = c(id = "character", A1 = "character", A2 = "character"))
    weights <- readRDS('${_input[1]}')
    region_info <- read.table('${_input[2]}', sep="\t", header=TRUE)

    # Every variant used by any weight matrix (row names of each $wgt)
    all_wgt_var <- unique(do.call(c, lapply(names(weights), function(genes_con){
        rownames(weights[[genes_con]]$wgt)
    })))

    # Every variant id listed in the SNP info files of the LD regions
    all_ld_var<- do.call(c, lapply(region_info$SNP_info, function(rvar){
          read.table(rvar, sep="\t", header=TRUE)$id
    }))
    # Keep only the GWAS variants that exist in the LD reference; after this
    # filter every remaining id is in all_ld_var by construction.
    z_snp <- z_snp[z_snp$id %in% all_ld_var,]
  
    if (!all(all_wgt_var %in% z_snp$id)){
        stop("Weight file '${_input[1]}' included variants that cannot be found in z_snp. ")
    }
    write.table(z_snp, '${_output}', sep="\t", row.names=FALSE, quote=FALSE)