# GWAS integration: enrichment and colocalization

This workflow processes xQTL fine-mapping results generated by `susie_twas` in the `cis_analysis.ipynb` notebook (for cis-xQTL), together with GWAS fine-mapping results produced by `susie_rss` in the `rss_analysis.ipynb` notebook. It is designed to perform enrichment and colocalization analysis, particularly when the fine-mapping results for cis-xQTL and GWAS originate from different regions. The pipeline is capable of integrating and analyzing data across these distinct regions. Originally tailored for cis-xQTL and GWAS integration, this pipeline can be applied to other pairwise integrations. An example of such an application is trans analysis, where the fine-mapped regions might be identical between trans-xQTL and GWAS, representing a special case of this broader implementation.

## Input

Lists of SuSiE fine-mapping output objects, in RDS format, of `class(susie)` in R. 

- For GWAS the list is a tab-separated meta-data file with columns: `chrom`, `start`, `end`, `study_id`, `file_path` where `file_path` is an RDS file.
- For xQTL the list is a tab-separated meta-data file with columns: `chrom`, `start`, `end`, `region_id`, `condition`, `file_path` where `file_path` is an RDS file. `condition` is intended to be optional -- when it is not specified, all conditions inside of the xQTL dataset will be analyzed. (The current implementation still requires the `condition` column to be present.)

## Output

1. Enrichment analysis results --- this is a global enrichment estimate that combines all input data
2. Colocalization results for regions of interest

In [None]:
[global]
# Working directory for output files
parameter: cwd = path("output")
# Tab-separated meta-data file listing the file paths of fine-mapped GWAS results.
parameter: gwas_finemapped_meta_data = path
# Tab-separated meta-data file listing the file paths of fine-mapped xQTL results.
parameter: xqtl_meta_data = path
# Optional: if a region list is provided, the enrichment analysis will be focused on the provided regions.
# The LAST column of this list will contain the ID of regions to focus on
parameter: region_list = path()
# Optional: if region names are provided,
# the analysis will be focused on the union of the provided region list and region names
parameter: region_name = []
# Name of the analysis, used in output file names.
# Defaults to a name derived from the xQTL and GWAS meta-data file names.
parameter: name = f"{xqtl_meta_data:bn}.{gwas_finemapped_meta_data:bn}"
# Optional: container image used to run the R scripts; empty means no container
parameter: container = ""
import re
# Command prefix used inside the container: `micromamba run` in the environment named after
# the container file, with the `_apptainer:latest`, `_docker:latest` or `.sif` suffix removed.
# Empty when no container is given.
parameter: entrypoint= ('micromamba run -a "" -n' + ' ' + re.sub(r'(_apptainer:latest|_docker:latest|\.sif)$', '', container.split('/')[-1])) if container else ""
# For cluster jobs, number of commands to run per job
parameter: job_size = 200
# Wall clock time expected
parameter: walltime = "5m"
# Memory expected: quite large for enrichment analysis but small for xQTL colocalization
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 1

import os
import pandas as pd

def adapt_file_path(file_path, reference_file):
    """
    Resolve a file path to an existing file, falling back on a reference file's directory.

    Candidate locations are tried in the following order and the first one
    that is an existing file is returned:
      1. `file_path` exactly as given
      2. the bare file name of `file_path` (relative to the working directory)
      3. the bare file name inside the directory of `reference_file`
      4. `file_path` joined onto the directory of `reference_file`

    Args:
    - file_path (str): The file path to resolve.
    - reference_file (str): File whose directory serves as the fallback location.

    Returns:
    - str: The first candidate that exists as a file.

    Raises:
    - FileNotFoundError: If none of the candidates exists.
    """
    ref_dir = os.path.dirname(reference_file)

    def candidate_paths():
        # Produced lazily so later candidates are only built when earlier ones fail
        yield file_path
        base_name = os.path.basename(file_path)
        yield base_name
        yield os.path.join(ref_dir, base_name)
        yield os.path.join(ref_dir, file_path)

    for candidate in candidate_paths():
        if os.path.isfile(candidate):
            return candidate

    raise FileNotFoundError(f"No valid path found for file: {file_path}")

def adapt_file_path_all(df, column_name, reference_file):
    """
    Resolve every entry of one dataframe column with `adapt_file_path`.

    Args:
    - df: Dataframe holding the column to resolve.
    - column_name (str): Name of the column containing file paths.
    - reference_file (str): Reference file passed on to `adapt_file_path`.

    Returns:
    - The result of applying `adapt_file_path` to each entry of the column.
    """
    def resolve(entry):
        return adapt_file_path(entry, reference_file)

    path_column = df[column_name]
    return path_column.apply(resolve)

In [None]:
[get_analysis_regions: shared = "regional_data"]

def check_required_columns(df, required_columns):
    """
    Verify that a dataframe contains all required columns.

    Args:
    - df: Dataframe whose `columns` are inspected.
    - required_columns (list): Column names that must be present.

    Raises:
    - ValueError: If any required column is absent; the message lists the
      missing names in the order given by `required_columns`.
    """
    available = list(df.columns)
    absent = []
    for name in required_columns:
        if name not in available:
            absent.append(name)
    if not absent:
        return
    raise ValueError(f"Missing required columns: {', '.join(absent)}")

def extract_regional_data(gwas_meta_data, xqtl_meta_data):
    """
    Extract fine-mapped result file listings from GWAS and xQTL metadata files.

    Both metadata files are read as tab-separated tables. The `file_path`
    column may contain several comma-separated paths; each path is stripped of
    surrounding whitespace and resolved individually with `adapt_file_path`,
    using the metadata file itself as the reference location.

    Args:
    - gwas_meta_data (str): File path to the GWAS metadata file. Required
      columns: `study_id`, `chrom`, `start`, `end`, `file_path`.
    - xqtl_meta_data (str): File path to the xQTL metadata file. Required
      columns: `region_id`, `chrom`, `start`, `end`, `condition`, `file_path`.

    Returns:
    - Tuple of two ordered dictionaries, in the row order of the metadata files:
        - GWAS Dictionary: keyed by "chrom:start-end"; each value has
          "meta_info" ([chrom, start, end, study_id]) and "files" (list of paths).
        - xQTL Dictionary: keyed by `region_id`; each value has "meta_info"
          ([chrom, start, end, region_id, condition]) and "files" (list of paths).

    Raises:
    - ValueError: If a required column is missing (from `check_required_columns`).
    - FileNotFoundError: If a listed file cannot be resolved (from `adapt_file_path`).
    """
    # Imported here so the function does not rely on a file-level import of OrderedDict
    from collections import OrderedDict

    required_gwas_columns = ['study_id', 'chrom', 'start', 'end', 'file_path']
    required_xqtl_columns = ['region_id', 'chrom', 'start', 'end', 'condition', 'file_path']

    def resolve_files(raw_paths, reference_file):
        # Split BEFORE resolving: a comma-separated entry is not itself a valid path
        return [adapt_file_path(fp.strip(), reference_file) for fp in raw_paths.split(',')]

    # Process GWAS metadata
    gwas_df = pd.read_csv(gwas_meta_data, sep="\t")
    check_required_columns(gwas_df, required_gwas_columns)

    # NOTE(review): rows sharing the same chrom/start/end (e.g. several studies for
    # one region) overwrite each other here -- confirm this is intended.
    gwas_dict = OrderedDict()
    for _, row in gwas_df.iterrows():
        region_id = f"{row['chrom']}:{row['start']}-{row['end']}"
        gwas_dict[region_id] = {"meta_info": [row['chrom'], row['start'], row['end'], row['study_id']],
                                "files": resolve_files(row['file_path'], gwas_meta_data)}

    # Process xQTL metadata
    xqtl_df = pd.read_csv(xqtl_meta_data, sep="\t")
    check_required_columns(xqtl_df, required_xqtl_columns)

    # NOTE(review): rows sharing the same region_id (e.g. several conditions for one
    # region) overwrite each other here -- confirm this is intended.
    xqtl_dict = OrderedDict()
    for _, row in xqtl_df.iterrows():
        xqtl_dict[row['region_id']] = {"meta_info": [row['chrom'], row['start'], row['end'], row['region_id'], row['condition']],
                                       "files": resolve_files(row['file_path'], xqtl_meta_data)}
    return gwas_dict, xqtl_dict

# Parse both metadata files, then bundle the two dictionaries under fixed keys
# so that they can be shared with other steps as `regional_data`
gwas_dict, xqtl_dict = extract_regional_data(
    gwas_finemapped_meta_data,
    xqtl_meta_data,
)
regional_data = {"GWAS": gwas_dict, "xQTL": xqtl_dict}

In [None]:
[xqtl_gwas_enrichment]
# Global enrichment analysis: all GWAS and all xQTL fine-mapping RDS files listed in
# `regional_data` are passed to pecotmr::xqtl_enrichment_wrapper in a single R call,
# and the result is written to one text file named after the analysis.
depends: sos_variable("regional_data")
output: f'{cwd:a}/{name}.enrichment.txt'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container, entrypoint = entrypoint
  # RDS files for GWAS data
  gwas_finemapped_data = c(${paths([x["files"] for x in regional_data["GWAS"].values()]):r,})
  # RDS files for xQTL data
  xqtl_finemapped_data = c(${paths([x["files"] for x in regional_data["xQTL"].values()]):r,})
  result = pecotmr::xqtl_enrichment_wrapper(gwas_finemapped_data, xqtl_finemapped_data)
  # Write the result as "name:value" lines
  # NOTE(review): assumes every element of result is a scalar, otherwise names and
  # unlisted values will not line up one-to-one -- TODO confirm
  writeLines(paste(names(result), unlist(result), sep = ":"), ${_output:ar})

In [None]:
[susie_coloc]
# Colocalization analysis, run once per xQTL region listed in `regional_data`.
# NOTE(review): the R script below is a skeleton -- Steps 1 to 3, including the
# saveRDS call, are commented out, so the declared output file is never written.
# NOTE(review): the output directory uses step_name[:-2], which drops the last two
# characters of the step name; this step has no numeric suffix -- confirm the
# resulting directory name is the intended one.
depends: sos_variable("regional_data")
# Path to the enrichment analysis results; intended to supply colocalization priors
# (see the commented-out Step 2 below)
parameter: enrichment_data = path
# Per-region xQTL meta information: [chrom, start, end, region_id, condition]
meta_info = [x["meta_info"] for x in regional_data['xQTL'].values()]
# Per-region lists of xQTL fine-mapping files, one input group per region
xqtl_files = [x["files"] for x in regional_data['xQTL'].values()]
input: xqtl_files, group_by = 1, group_with = "meta_info"
output: f'{cwd:a}/{step_name[:-2]}/{name}.{_meta_info[3]}.coloc.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container, entrypoint = entrypoint
    # NOTE(review): chrom, start and end are interpolated without quotes, so each must
    # render as a valid R expression (e.g. a number) -- confirm against the meta-data
    chrom = ${_meta_info[0]}
    start = ${_meta_info[1]} 
    end = ${_meta_info[2]}
    region = "${_meta_info[3]}"
    xqtl_condition = "${_meta_info[4]}"
    # IDs of all GWAS regions (keys of the GWAS dictionary, formatted "chrom:start-end")
    gwas_regions = c(${paths(regional_data["GWAS"].keys()):r,})
    library(pecotmr)
    # Step 1: find relevant GWAS regions that overlap with the xQTL region of interest
    # gwas_overlapping_regions = ...
    # gwas_finemapping_files = ...
    # Step 2: load enrichment analysis results
    # coloc_priors = get_coloc_prior(${enrichment_data:r})
    # Step 3: Apply colocalization analysis
    # res = coloc_wrapper(${_input:r}, gwas_finemapping_files, coloc_priors)
    # saveRDS(res, ${_output:r})