# cTWAS pipeline
## Main Steps
1. Harmonize TWAS weights and GWAS summary statistics against the LD reference panel.
2. Pre-process the harmonized data into the input format required by cTWAS.
3. Perform a simple cTWAS analysis without loading LD.

## Input
- **LD meta data**: meta data from LD processing with column names of `#chrom`, `start`, `end`, `path`. 
- **regions data**: dataframe of meta data for LD block information with column names of `chr`, `start`, `stop`. 
- **xqtl_region_data**: a dataframe with column names of `chrom`, `start`, `end`, `file_path`, giving each region and the file path of the corresponding `refined_twas_weights_data` (`*.twas.rds`) file from the TWAS pipeline output.
- **gwas_meta_file**: a dataframe with GWAS summary statistics data file paths by chromosome. 

## Output
- An RDS file (`<name>_ctwas.rds`) holding a list with one element per GWAS study; each element is the cTWAS fine-mapping result for SNPs and genes.

## Example
```
sos run ~/githubrepo/xqtl-protocol/code/pecotmr_integration/ctwas.ipynb ctwas \
--cwd /mnt/vast/hpc/csg/cl4215/mrmash/workflow/test \
--xqtl_region_data /mnt/vast/hpc/csg/cl4215/mrmash/workflow/susie_twas/regional_xqtl_data.tsv \
--regions /mnt/vast/hpc/csg/cl4215/mrmash/workflow/twas_mr/pipeline/EUR_LD_blocks_CLU.bed \
--ld_meta_data /mnt/vast/hpc/csg/data_public/20240409_ADSP_LD_matrix/ld_meta_file.tsv \
--gwas_meta_file /mnt/vast/hpc/csg/cl4215/mrmash/workflow/GWAS/gwas_meta.tsv 

```

In [None]:
[global]
# Working directory; the step output is written to <cwd>/<step_name>/
parameter: cwd = path("output/")
# LD meta data file; its path is passed to the R calls that harmonize the data and look up LD variants
parameter: ld_meta_data = path()
# GWAS meta file; its path is passed to the R harmonization call
parameter: gwas_meta_file = path()
# region info for the input refined_twas_weights_data
parameter: xqtl_region_data = path()
# ld region for the input data
parameter: regions = path()
# Prefix of the output file name, derived from xqtl_region_data with the `:bn` format
parameter: name = f"{xqtl_region_data:bn}"
# Container handed to the R action; empty by default
parameter: container = ''
# Entrypoint handed to the R action: 'micromamba run -a "" -n <env>', where <env> is the last path component of `container` with a trailing _apptainer:latest, _docker:latest or .sif removed; empty when no container is set
parameter: entrypoint= ('micromamba run -a "" -n' + ' ' + re.sub(r'(_apptainer:latest|_docker:latest|\.sif)$', '', container.split('/')[-1])) if container else ""
# Used as trunk_size of the task
parameter: job_size = 100
# Used as walltime of the task
parameter: walltime = "30m"
# Used as mem of the task
parameter: mem = "20G"
# Used as cores of the task and as ncore of the cTWAS call
parameter: numThreads = 1
import os
import pandas as pd
from collections import OrderedDict

def check_required_columns(df, required_columns):
    """Raise ValueError listing every required column that df lacks.

    df: object exposing a `columns` attribute (e.g. a pandas DataFrame).
    required_columns: iterable of column names that must be present.
    Returns None when nothing is missing; the error message lists the
    missing names in the order they appear in required_columns.
    """
    available = list(df.columns)
    missing_columns = []
    for wanted in required_columns:
        if wanted not in available:
            missing_columns.append(wanted)
    if not missing_columns:
        return
    raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")
# Columns that must be present in the xqtl_region_data table
required_xqtl_region_data_columns = ['chrom','start','end','file_path']
# Columns that must be present in the regions (LD block) table
required_ld_columns = ['chr', 'start', 'stop']

In [None]:
[ctwas_1]
# this step we load & format input data for a ld region
# Read the LD-block table and strip whitespace from its header names.
regions_df = pd.read_csv(regions, sep="\t", skipinitialspace=True)
regions_df.columns = [col.strip() for col in regions_df.columns]
# Keep the table under its own name so the `xqtl_region_data` path parameter is not overwritten.
xqtl_region_df = pd.read_csv(xqtl_region_data, sep="\t")
# Validate before touching any column, so that a missing column is reported by
# check_required_columns (ValueError) rather than surfacing as a KeyError below.
check_required_columns(regions_df, required_ld_columns)
check_required_columns(xqtl_region_df, required_xqtl_region_data_columns)
regions_df['chr'] = regions_df['chr'].str.strip()

# Create a dictionary to map each LD region to its corresponding xQTL file paths.
# A region matches an xQTL row only when chromosome, start and end are all equal;
# the key is "<chr>_<start>_<stop>" and insertion order follows the regions table.
region_xqtl_dict = OrderedDict()
for chrom, start, stop in zip(regions_df['chr'], regions_df['start'], regions_df['stop']):
    is_match = (
        (xqtl_region_df['chrom'] == chrom)
        & (xqtl_region_df['start'] == start)
        & (xqtl_region_df['end'] == stop)
    )
    region_xqtl_dict[f"{chrom}_{start}_{stop}"] = xqtl_region_df.loc[is_match, 'file_path'].tolist()

# Generate inputs for the next steps: one entry per matched file, with the
# region id repeated once for every file that belongs to that region.
region_files = [file for files in region_xqtl_dict.values() for file in files]
region_ids = [region_id for region_id, files in region_xqtl_dict.items() for _ in files]
# Comma-separated, double-quoted ids, later spliced into an R c(...) call.
region_ids_str = ','.join(f'"{region_id}"' for region_id in region_ids)

if len(region_files) != len(region_ids):
    raise ValueError("Mismatch between region_files and region_ids lengths")

# Step inputs are the matched TWAS weight files, with region_ids_str attached through paired_with.
# The output is <cwd>/<step_name>/<name>_ctwas.rds; the task directive requests the resources
# configured in [global], and the R action writes stdout/stderr to files named after the output.
input: region_files, paired_with=['region_ids_str']
output: f'{cwd:a}/{step_name}/{name}_ctwas.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container, entrypoint = entrypoint

    library(IRanges)
    library(R6)
    # NOTE(review): both packages are loaded from hard-coded development checkouts
    # under /home/cl4215, so the script only runs where those paths exist; the
    # trailing comments name the installed packages they stand in for.
    devtools::load_all("/home/cl4215/githubrepo/pecotmr/") #library(pecotmr)
    devtools::load_all("/home/cl4215/githubrepo/ctwas_multigroup/ctwas/")#library(ctwas)

    # Step1: Harmonize weights and gwas
    # Read every TWAS weight file passed to this step, then harmonize each one
    # using the LD meta data and the GWAS meta file.
    twas_weights_data <- lapply(c(${_input:r,}), readRDS)
    post_qc_data <- lapply(twas_weights_data, function(twas_data) {
      harmonize_twas(twas_data, "${ld_meta_data}", "${gwas_meta_file}", refined_twas_weights_loader, scale_weights = TRUE)
    })
    # Unique names found under "gwas_qced" in the harmonized data; used below as the GWAS studies to loop over.
    gwas_studies <- unique(names(find_data(post_qc_data, c(3, "gwas_qced"))))

    # Step2: Preprocess weights, LD variants data.
    weights <- do.call(c, lapply(post_qc_data, function(twas_data) {
      # reshape weights for all gene-context pairs in the region for cTWAS analysis
      get_ctwas_weights(twas_data, "${ld_meta_data}")
    }))
    # Drop NULL entries. vapply() always returns a logical vector, so this also
    # works when `weights` is empty (sapply() would return list() and `!` would fail).
    weights <- weights[!vapply(weights, is.null, logical(1))]
    # get region_info and snp_info: LD block meta info and variant - bim file data.
    region_of_interest <- region_to_df(c(${region_ids_str}))
    region_info <- read.table("${regions}", sep = "\t", header = TRUE) # to get exact LD bim file without over-including neighboring LD's info.
    # The first column is renamed to "chrom" and converted to an integer after removing "chr".
    colnames(region_info)[1] <- "chrom"
    region_info$chrom <- as.integer(gsub("chr", "", region_info$chrom))
    region_info$region_id <- paste(region_info$chrom, region_info$start, region_info$stop, sep = "_")

    # load LD variants
    # Collect the bim file paths of every region of interest, one row at a time.
    # seq_len() yields an empty sequence for zero rows, where 1:nrow() would give c(1, 0).
    bim_file_paths <- unique(do.call(c, lapply(seq_len(nrow(region_of_interest)), function(region_row) {
      get_regional_ld_meta("${ld_meta_data}", region_of_interest[region_row, , drop = FALSE])$intersections$bim_file_paths
    })))
    snp_info <- lapply(bim_file_paths, function(file) {
      # original colnames: "chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss"
      bimfile <- read.table(file, header = FALSE, sep = "\t")[, c(1, 2, 4:8)]
      # Variant ids: "_" becomes ":" and "chr" is removed.
      bimfile$V2 <- gsub("chr", "", gsub("_", ":", bimfile$V2))
      colnames(bimfile) <- c("chrom", "id", "pos", "alt", "ref", "variance", "allele_freq") # A1:alt, A2: ref
      return(bimfile)
    })
    # Name each table by the first three tokens of its file name (split on "_", ":", "/" or "."),
    # joined with "_" and with "chr" removed, the same layout as region_info$region_id.
    names(snp_info) <- do.call(c, lapply(bim_file_paths, function(x) {
      parts <- strsplit(basename(x), "[_:/.]")[[1]][1:3]
      gsub("chr", "", paste(parts, collapse = "_"))
    }))

    # Step3: Simple cTWAS with noLD for all regions
    ctwas_res <- list()
    for (study in gwas_studies) {
      # Stack this study's harmonized GWAS records from every input file.
      gwas_z <- do.call(rbind, lapply(post_qc_data, function(x) find_data(x, c(2, "gwas_qced", study), docall = rbind)))
      colnames(gwas_z)[which(colnames(gwas_z) == "variant_id")] <- "id"
      z_snp <- gwas_z[, c("id", "A1", "A2", "z")]
      # Keep the first record of each variant id.
      z_snp <- z_snp[!duplicated(z_snp$id), ]
      ctwas_res[[study]] <- ctwas_sumstats_noLD(z_snp,
                                 weights,
                                 region_info,
                                 snp_info,
                                 thin = 1,
                                 outputdir = ${_output:dr},
                                 niter_prefit = 3,
                                 niter = 30,
                                 group_prior_var_structure = "shared_type",
                                 maxSNP = 20000,
                                 min_nonSNP_PIP = 0.5,
                                 ncore = ${numThreads})
    }

    # Step4: save results
    # One list element per GWAS study, keyed by the study name.
    saveRDS(ctwas_res, ${_output:r})