In [1]:
[global]
# Output directory; step outputs are written under this path
parameter: cwd = path("output")
# A list of file paths for genotype data, or the genotype data itself.
parameter: genoFile = path
# One or multiple lists of file paths for phenotype data.
parameter: phenoFile = paths
# Covariate file path(s); the number of covariate files must match the number of phenotype files
parameter: covFile = paths
# Optional: if a region list is provided, the analysis will be focused on the provided regions.
# The LAST column of this list will contain the ID of regions to focus on
# Otherwise, all regions with both genotype and phenotype files will be analyzed
parameter: region_list = path()
# Optional: if region names are provided,
# the analysis will be focused on the union of the provided region list and region names
parameter: region_name = []
# Only focus on a subset of samples
parameter: keep_samples = path()
# An optional list documenting the custom cis window for each region to analyze, with four columns: chr, start, end, region ID (e.g. gene ID).
# If this list is not provided, the default `window` parameter (see below) will be used.
parameter: customized_cis_windows = path()
# Specify the cis window for the up and downstream radius to analyze around the region of interest in units of bp
# When this is zero, we will rely on customized_cis_windows
parameter: window = 0
# It is required to input the name of the analysis
parameter: name = str
# Whether or not to save the loaded data object
parameter: save_data = False
# Container image passed to the script actions; empty by default
parameter: container = ""
import re
# Entrypoint command used with the container: runs inside the micromamba environment named after
# the container file name with any `_apptainer:latest`, `_docker:latest` or `.sif` suffix removed.
# Empty when no container is given.
parameter: entrypoint= ('micromamba run -a "" -n' + ' ' + re.sub(r'(_apptainer:latest|_docker:latest|\.sif)$', '', container.split('/')[-1])) if container else ""
# For cluster jobs, number of commands to run per job
parameter: job_size = 200
# Wall clock time expected
parameter: walltime = "1h"
# Memory expected
parameter: mem = "20G"
# Number of threads
parameter: numThreads = 1
# Names of phenotypes, one per phenotype file; by default derived from the phenoFile entries via the `bn` path format
parameter: phenotype_names = [f'{x:bn}' for x in phenoFile]
# Random seed. NOTE(review): not referenced by the steps visible in this part of the file -- confirm it is used
parameter: seed = 999

def group_by_region(lst, partition):
    """Return the grouping to apply to a list of step inputs.

    Args:
    - lst: The flat list of inputs; accepted for the caller's convenience but not used.
    - partition: The pre-computed groups.

    Returns:
    - ``partition`` itself, unchanged (the same object, not a copy).
    """
    groups = partition
    return groups

import os
import pandas as pd

def adapt_file_path(file_path, reference_file):
    """
    Resolve a file path to a location that exists on disk.

    The following locations are probed in order and the first one that is an
    existing regular file is returned:

    1. ``file_path`` as given,
    2. the bare file name of ``file_path`` (relative to the current directory),
    3. the bare file name inside the directory of ``reference_file``,
    4. ``file_path`` joined onto the directory of ``reference_file``.

    Args:
    - file_path (str): The file path to resolve.
    - reference_file (str): A file whose directory serves as an alternative base.

    Returns:
    - str: The first probed path that exists.

    Raises:
    - FileNotFoundError: If none of the probed paths is an existing file.
    """
    base_dir = os.path.dirname(reference_file)

    def probe_order():
        # Generated lazily so each path is only built once the previous one has been rejected
        yield file_path
        bare_name = os.path.basename(file_path)
        yield bare_name
        yield os.path.join(base_dir, bare_name)
        yield os.path.join(base_dir, file_path)

    for candidate in probe_order():
        if os.path.isfile(candidate):
            return candidate

    raise FileNotFoundError(f"No valid path found for file: {file_path}")

def adapt_file_path_all(df, column_name, reference_file):
    """Apply ``adapt_file_path`` to every entry of one column of ``df``.

    Args:
    - df: DataFrame holding the file paths.
    - column_name: Name of the column whose entries are to be adapted.
    - reference_file: Reference file forwarded to ``adapt_file_path`` for each entry.

    Returns:
    - The column with each path replaced by its adapted version.
    """
    paths = df[column_name]
    return paths.apply(adapt_file_path, args=(reference_file,))

In [1]:
[get_analysis_regions: shared = "regional_data"]
# Input is genoFile, phenoFile, covFile and optionally region_list. If region_list is present then we only analyze the regions it contains.
# regional_data should be a dictionary like:
#{'data': [("genotype_1.bed", "phenotype_1.bed.gz", "covariate_1.gz"), ("genotype_2.bed", "phenotype_1.bed.gz", "phenotype_2.bed.gz", "covariate_1.gz", "covariate_2.gz") ... ],
# 'meta_info': [("chr12:752578-752579","chr12:752577-752580", "gene_1", "trait_1"), ("chr13:852580-852581","chr13:852579-852580", "gene_2", "trait_1", "trait_2") ... ]}

def process_pheno_files(pheno_files, cov_files, phenotype_names):
    '''
    Read the phenotype list files and combine them into one table keyed by region.

    Each phenotype list is read as a tab-separated table with a header. Its fifth
    column is passed through ``adapt_file_path_all`` relative to the list file,
    and two columns are added: ``cov_path`` (the matching covariate file) and
    ``cond`` (the matching phenotype name). Tables are outer-merged on
    ``#chr``, ``start``, ``end`` and ``ID``; for regions present in several
    tables, the remaining columns are joined with commas.

    Raises FileNotFoundError if a covariate file does not exist.

    Example of the final table layout (as given in the original documentation):
    #chr    start      end    start_cis       end_cis           ID  path     cov_path             cond             coordinate     geno_path
    0  chr12   752578   752579  652578   852579  ENSG00000060237_Q9H4A3  protocol_example.protein_1.bed.gz,protocol_example.protein_2.bed.gz  covar_1.gz,covar_2.gz  trait_A,trait_B    chr12:752578-752579  protocol_example.genotype.chr21_22.bed
    '''
    key_columns = ['#chr', 'start', 'end', 'ID']

    def join_pair(row, left_name, right_name):
        # Keep whichever side is present; join with a comma when both are
        if pd.isna(row[right_name]):
            return row[left_name]
        if pd.isna(row[left_name]):
            return row[right_name]
        return f'{row[left_name]},{row[right_name]}'

    # Running result, starts out empty
    combined = pd.DataFrame()

    for pheno_path, cov_path, phenotype_name in zip(pheno_files, cov_files, phenotype_names):
        if not os.path.isfile(cov_path):
            raise FileNotFoundError(f"No valid path found for file: {cov_path}")

        # Load the current phenotype list
        current = pd.read_csv(pheno_path, sep="\t", header=0)

        # Resolve the file paths in the fifth column, then annotate the table
        current.iloc[:, 4] = adapt_file_path_all(current, current.columns[4], f"{pheno_path:a}")
        current = current.assign(cov_path=str(cov_path), cond=phenotype_name)

        # Nothing accumulated yet: the current table becomes the running result
        if combined.empty:
            combined = current
            continue

        # Outer merge so that regions found in only one table are kept
        merged = pd.merge(combined, current, on=key_columns, how='outer', suffixes=('_x', '_y'))

        value_columns = [col for col in current.columns if col not in key_columns]
        for col in value_columns:
            left_name = f'{col}_x'
            right_name = f'{col}_y'
            merged[col] = merged.apply(lambda row: join_pair(row, left_name, right_name), axis=1)
            # The suffixed helper columns are no longer needed
            merged.drop([left_name, right_name], axis=1, inplace=True)

        combined = merged
    return combined

# Load phenotype meta data
# Every phenotype list needs exactly one covariate file and one phenotype name.
if len(phenoFile) != len(covFile):
    raise ValueError("Number of input phenotypes files must match that of covariates files")
if len(phenoFile) != len(phenotype_names):
    raise ValueError("Number of input phenotypes files must match the number of phenotype names")
meta_data = process_pheno_files(phenoFile, covFile, phenotype_names)

# Load genotype meta data
if f"{genoFile:x}" == ".bed":
    # A single genotype file was given: use it for every chromosome (chr1-chr22 and chrX)
    geno_meta_data = pd.DataFrame([("chr"+str(x), f"{genoFile:a}") for x in range(1,23)] + [("chrX", f"{genoFile:a}")], columns=['#chr', 'geno_path'])
else:
    # Otherwise genoFile is a tab-separated list; its first two columns are used as chromosome and genotype path
    geno_meta_data = pd.read_csv(f"{genoFile:a}", sep = "\t", header=0)
    geno_meta_data.iloc[:, 1] = adapt_file_path_all(geno_meta_data, geno_meta_data.columns[1], f"{genoFile:a}")
    geno_meta_data.columns = ['#chr', 'geno_path']
    # Normalize chromosome names so that they all carry the "chr" prefix
    geno_meta_data['#chr'] = geno_meta_data['#chr'].apply(lambda x: str(x) if str(x).startswith('chr') else f'chr{x}')

# Checking the DataFrame
valid_chr_values = [f'chr{x}' for x in range(1, 23)] + ['chrX']
if not all(value in valid_chr_values for value in geno_meta_data['#chr']):
    raise ValueError("Invalid chromosome values found. Allowed values are chr1 to chr22 and chrX.")

# Keep only the regions whose chromosome has genotype data
meta_data = meta_data.merge(geno_meta_data, on='#chr', how='inner')

if len(meta_data.index) == 0:
    raise ValueError("No region overlap between genotype and any of the phenotypes")

region_ids = []
# If region_list is provided, read the file and extract IDs
if region_list.is_file():
    # sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True
    region_list_df = pd.read_csv(region_list, sep=r"\s+", header=None, comment = "#")
    region_ids = region_list_df.iloc[:, -1].unique()  # Extracting the last column for IDs

# If region_name is provided, include those IDs as well
# --region-name A B C will result in a list of ["A", "B", "C"] here
if len(region_name) > 0:
    region_ids = list(set(region_ids).union(set(region_name)))

# If either region_list or region_name is provided, filter the meta_data
if len(region_ids) > 0:
    # Take an explicit copy: columns are assigned to this table below, and assigning
    # to a filtered view triggers pandas' SettingWithCopyWarning
    meta_data = meta_data[meta_data['ID'].isin(region_ids)].copy()

# Adjust cis-window
if os.path.isfile(customized_cis_windows):
    print(f"Loading customized cis-window data from {customized_cis_windows}")
    cis_list = pd.read_csv(customized_cis_windows, comment="#", header=None, names=["#chr","start","end","ID"], sep="\t")
    # The window coordinates arrive as start_cis / end_cis thanks to the '_cis' suffix
    meta_data = pd.merge(meta_data, cis_list, on=['#chr', 'ID'], how='left', suffixes=('', '_cis'))
    # Regions without a window definition have NaN after the left merge
    mismatches = meta_data[meta_data['start_cis'].isna()]
    if not mismatches.empty:
        print("First 5 mismatches:")
        print(mismatches[['ID']].head())
        raise ValueError(f"{len(mismatches)} regions to analyze cannot be found in ``{customized_cis_windows}``. Please check your ``{customized_cis_windows}`` database to make sure it contains all cis-window definitions. ")
else:
    if window <= 0:
        raise ValueError("Please either input valid path to cis-window file via ``--customized-cis-windows``, or set ``--window`` to a positive integer")
    # Extend each region by `window` bp on both sides; the start is not allowed to go below 0
    meta_data['start_cis'] = (meta_data['start'] - window).clip(lower=0)
    meta_data['end_cis'] = meta_data['end'] + window
# Create the final dictionary
# 'data' holds, per region: the genotype path, then the phenotype paths, then the covariate paths.
# 'meta_info' holds, per region: the phenotype region, the cis-window region, the region ID, then the condition names.
regional_data = {
    'data': [(row['geno_path'], *row['path'].split(','), *row['cov_path'].split(',')) for _, row in meta_data.iterrows()],
    'meta_info': [(f"{row['#chr']}:{row['start']}-{row['end']}", # this is the phenotype region
                   f"{row['#chr']}:{row['start_cis']}-{row['end_cis']}", # this is the cis-window region
                   row['ID'], *row['cond'].split(',')) for _, row in meta_data.iterrows()]
}


In [1]:
[susie_twas_1]
# initial number of single effects for SuSiE
parameter: init_L = 8
# maximum number of single effects to use for SuSiE
parameter: max_L = 30
# remove a variant if it has more than imiss missing individual level data
parameter: imiss = 1.0
# MAF cutoff
parameter: maf = 0.0025
# MAC cutoff, on top of MAF cutoff
parameter: mac = 5
# Remove indels if indel = False
parameter: indel = True
# Passed to susie_post_processor as signal_cutoff
parameter: pip_cutoff = 0.025
# Coverage levels: the first value is passed to susie_wrapper as coverage, the remaining values as secondary_coverage
parameter: coverage = [0.95, 0.7, 0.5]
# Compute TWAS weights as well (in this step it only affects the output file name)
parameter: twas_weights = True
# Perform K-fold cross-validation (CV) for TWAS
# Set it to zero if this is to be skipped
# NOTE(review): the CV parameters below are not referenced by the script of this step -- confirm they are still needed
parameter: twas_cv_folds = 5
parameter: twas_cv_threads = twas_cv_folds
# maximum number of variants to consider for CV
# We will randomly pick a subset of it for CV purpose
parameter: max_cv_variants = 5000
# Further limit CV to only using common variants
parameter: min_cv_maf = 0.05
# Passed to filter_variants_by_ld_reference to select the variants used for the second (preset variants) fit
parameter: ld_reference_meta_file = path()
depends: sos_variable("regional_data")
# Stop if the list of regional data is empty
stop_if(len(regional_data['data']) == 0, f'Either genotype or phenotype data are not available for region {", ".join(region_name)}.')

meta_info = regional_data['meta_info']
# One substep per region: inputs are that region's files, paired with its meta_info entry as _meta_info
input: regional_data["data"], group_by = lambda x: group_by_region(x, regional_data["data"]), group_with = "meta_info"
# step_name[:-2] removes the last two characters of the step name for the output sub-directory
output: f'{cwd:a}/{step_name[:-2]}/{name}.{_meta_info[2]}.susie{"_weights_db" if twas_weights else ""}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }', stdout = f"{_output:n}.stdout", stderr = f"{_output:n}.stderr", container = container, entrypoint = entrypoint
    # Per region: load the data, fit SuSiE on all variants and on the LD-reference-filtered variants,
    # copy the TWAS results from a previously saved file, and save everything as one RDS file.
    options(warn=1)
    library(pecotmr)
    # extract subset of samples
    keep_samples = NULL
    if (${"TRUE" if keep_samples.is_file() else "FALSE"}) {
      keep_samples = unlist(strsplit(readLines(${keep_samples:ar}), "\\s+"))
      message(paste(length(keep_samples), "samples are selected to be loaded for analysis"))
    }
    # Load regional association data
    # The inputs are ordered as: genotype, then phenotype files (first half of the rest), then covariate files (second half)
    tryCatch({
    fdat = load_regional_univariate_data(genotype = ${_input[0]:anr},
                                          phenotype = c(${",".join(['"%s"' % x.absolute() for x in _input[1:len(_input)//2+1]])}),
                                          covariate = c(${",".join(['"%s"' % x.absolute() for x in _input[len(_input)//2+1:]])}),
                                          region = "${_meta_info[0]}",
                                          cis_window = "${_meta_info[1]}",
                                          conditions = c(${",".join(['"%s"' % x for x in _meta_info[3:]])}),
                                          maf_cutoff = ${maf},
                                          mac_cutoff = ${mac},
                                          imiss_cutoff = ${imiss},
                                          keep_indel = ${"TRUE" if indel else "FALSE"},
                                          keep_samples = keep_samples,
                                          extract_region_name = "${_meta_info[2]}",
                                          phenotype_header = 4,
                                          region_name_col = 4,
                                          scale_residuals = FALSE)
    }, NoSNPsError = function(e) {
        # On a NoSNPsError: report it, save the error message as the result for this region and exit
        message("Error: ", paste(e$message, "${_meta_info[2] + '@' + _meta_info[1]}"))
        #saveRDS(NULL, ${_output:ar})
        # NOTE(review): the region ID is inserted as an unquoted R name here and in the other list() calls below;
        # an ID that is not a syntactically valid R name would break the script -- confirm IDs are always valid names
        saveRDS(list(${_meta_info[2]} = e$message), ${_output:ar}, compress='xz')
        quit(save="no")
    })
  
    if (${"TRUE" if save_data else "FALSE"}) {
      # save data object for debug purpose
      saveRDS(list(${_meta_info[2]} = fdat), "${_output:ann}.dataset.rds", compress='xz')
    }
    # Univariate analysis suite
    # One (initially empty) result list per condition, named like the conditions in fdat$residual_Y
    fitted = setNames(replicate(length(fdat$residual_Y), list(), simplify = FALSE), names(fdat$residual_Y))
    # Results of an earlier run are read from the output path with "_DeJager" removed.
    # NOTE(review): if the output path does not contain "_DeJager", this is the very file this step
    # is about to write -- confirm this is intended
    existing_file = gsub("_DeJager", "", ${_output:ar})
    existing_data = readRDS(existing_file)[[1]]
  
    # NOTE(review): 1:length(fitted) counts down from 1 to 0 when fitted is empty; seq_along(fitted) would avoid that
    for (r in 1:length(fitted)) {
      # NOTE(review): st is not used later in this script
      st = proc.time()
      init_L=${init_L}
      max_L=${max_L}
      # codes to rerun
      # First fit: SuSiE on all variants of this condition, followed by post-processing
      fitted[[r]] = susie_wrapper(fdat$residual_X[[r]], fdat$residual_Y[[r]], init_L=init_L, max_L=max_L, refine=TRUE, coverage = ${coverage[0]})
      fitted[[r]] = susie_post_processor(fitted[[r]], fdat$residual_X[[r]], fdat$residual_Y[[r]], fdat$residual_X_scalar[[r]], fdat$residual_Y_scalar[[r]], 
                                       fdat$maf[[r]], secondary_coverage = c(${",".join([str(x) for x in coverage[1:]])}), signal_cutoff = ${pip_cutoff},
                                       other_quantities = list(dropped_samples = list(X=fdat$dropped_sample$dropped_samples_X[[r]], y=fdat$dropped_sample$dropped_samples_Y[[r]], 
                                                                                      covar=fdat$dropped_sample$dropped_samples_covar[[r]])))
      # Choose L for the second fit from the first fit: based on the length of susie_result_trimmed$V
      # when a trimmed result exists, otherwise fixed at 2
      if (!is.null(fitted[[r]]$susie_result_trimmed)) {
          L = length(fitted[[r]]$susie_result_trimmed$V)
          init_L = max(1, L-2)
          max_L = L+3
      } else {
          init_L = 2
          max_L = 2
      }
  
      # Second fit: only the variants returned by filter_variants_by_ld_reference
      variants_kept = filter_variants_by_ld_reference(colnames(fdat$residual_X[[r]]), ${ld_reference_meta_file:r})
      X = fdat$residual_X[[r]][,variants_kept$data,drop=F]
      fitted[[r]]$preset_variants_result = susie_wrapper(X, fdat$residual_Y[[r]], init_L=init_L, max_L=max_L, refine=TRUE, coverage = ${coverage[0]})
      # NOTE(review): the `== 1` test below presumably expects residual_X_scalar to be a single value
      # whenever it equals 1 -- confirm, since `if` needs a length-one condition
      fitted[[r]]$preset_variants_result = susie_post_processor(fitted[[r]]$preset_variants_result, X, fdat$residual_Y[[r]], 
                               if (fdat$residual_X_scalar[[r]]==1) 1 else fdat$residual_X_scalar[[r]][variants_kept$idx], 
                               fdat$residual_Y_scalar[[r]], fdat$maf[[r]][variants_kept$idx], 
                               secondary_coverage = c(${",".join([str(x) for x in coverage[1:]])}), signal_cutoff = ${pip_cutoff})
      # Drop these two entries from the nested result
      fitted[[r]]$preset_variants_result$analysis_script=NULL
      fitted[[r]]$preset_variants_result$sumstats=NULL
      # codes to load
      # TWAS results and timing are copied from the existing data rather than recomputed;
      # the lookup key is the condition name with "_DeJager_eQTL" removed
      existing_key = gsub("_DeJager_eQTL", "", names(fitted)[[r]])
      tmp = existing_data[[existing_key]]
      fitted[[r]]$twas_weights = tmp$twas_weights
      fitted[[r]]$twas_predictions = tmp$twas_predictions
      fitted[[r]]$twas_cv_result = tmp$twas_cv_result
      fitted[[r]]$total_time_elapsed = tmp$total_time_elapsed
      fitted[[r]]$region_info = list(region_coord=parse_region("${_meta_info[0]}"), grange=parse_region("${_meta_info[1]}"), region_name="${_meta_info[2]}")
      # original data no longer relevant, set to NA to release memory
      fdat$residual_X[[r]] <- NA
      fdat$residual_Y[[r]] <- NA
    }
    # Save the results of all conditions in a list named by the region ID
    saveRDS(list(${_meta_info[2]} = fitted), ${_output:ar}, compress='xz')