# Gene region extraction
This is the region extraction step of the data processing pipeline for the xQTL workflow. It generates:
1. Molecular phenotype files per chromosome, restricted to the selected regions, in the format that APEX and tensorQTL take

### Input
The input for this workflow is the collection of data for 1 condition, as described in the README of this git repo:
1. 1 complete molecular phenotype data file
2. 1 file documenting the list of regions to be analyzed

### Output
For each collection, the output is 23 sets of:
1. Sets of EXP files for the selected regions, suitable to be fed into both APEX and tensorQTL

1 set of:
1. PCA + Factor + Covariate file


In [2]:
[global]
import os
# Work directory & output directory
parameter: wd = path
# Container passed to the R and bash actions of this workflow (`container = container`)
parameter: container = 'gaow/twas'
# Name of the analysis; used as the prefix of the output file names
parameter: name= 'ROSMAP'
# An index text file with 4 columns specifying the chr, start, end and names of regions to analyze
parameter: region_list = path
# NOTE(review): job_size, walltime, mem and numThreads do not appear to be referenced by the
# steps visible in this file, whose `task:` lines hardcode their own values -- confirm.
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20

# Prefix shared by the output file names of this workflow.
Prefix = name


def _read_regions(filename):
    """Parse the region list into one list of whitespace-separated fields per line.

    Blank lines and lines whose first non-blank character is '#' are skipped.
    The file is read inside a ``with`` block so the handle is closed as soon as
    parsing is done instead of being left for the garbage collector.
    """
    with open(filename) as handle:
        return [
            line.split()
            for line in handle
            if line.strip() and not line.strip().startswith('#')
        ]


regions = _read_regions(region_list)
# Get the unique chromosomes that have regions to be analyzed.
def extract(lst):
    """Return the first element of every entry in ``lst``, in the same order."""
    return [fields[0] for fields in lst]
# De-duplicate while keeping the order of first appearance in the region list.
# A plain set() would yield an order that changes from run to run (string
# hashing is randomised), which makes the per-chromosome substeps run in an
# unpredictable order.
chrom = list(dict.fromkeys(extract(regions)))

import os
def get_genotype_file(chrom, genotype_list, geno_inventory):
    """Resolve the genotype file to use for one chromosome.

    Args:
        chrom: Chromosome identifier; it is converted to a string and a leading
            'chr' is removed before the lookup.
        genotype_list: Object formatted with the ``:ad`` spec to obtain the
            directory against which a relative file name is resolved
            (presumably a SoS path to the genotype list file -- confirm).
        geno_inventory: Mapping from chromosome name to genotype file name.

    Returns:
        ``path(...)`` of the file as given when it exists, otherwise of the
        file resolved against the ``genotype_list`` directory.

    Raises:
        ValueError: If the file exists in neither location.
    """
    chrom_key = f'{chrom}'
    if chrom_key.startswith('chr'):
        chrom_key = chrom_key[3:]
    # A chromosome missing from the inventory is itself used as the file name.
    geno_file = geno_inventory[chrom_key] if chrom_key in geno_inventory else chrom_key
    if os.path.isfile(geno_file):
        return path(geno_file)
    # Not found as given: retry as a path relative to the genotype list.
    relative_candidate = f'{genotype_list:ad}/' + geno_file
    if not os.path.isfile(relative_candidate):
        raise ValueError(f"Cannot find genotype file {geno_file}")
    return path(relative_candidate)


## Process of molecular phenotype file
This workflow produces a bgzipped, tabix-indexed BED file containing all the molecular phenotype data included in the region list, to feed into APEX factor analysis.
It also produces a bgzipped, tabix-indexed BED file for each chromosome for downstream QTL association analysis.

In [None]:
[region_extraction_1]
# Join the molecular phenotype table with the region list and write the result
# both as a bgzipped + tabix-indexed BED file and as a plain temporary BED file.
# Path to the input molecular phenotype data.
parameter: molecular_pheno_whole = path
input: molecular_pheno_whole,region_list
output: f'{wd}/{Prefix}.mol_phe.bed.gz',  # For factor
        f'{wd}/{Prefix}.mol_phe.tmp.bed' # For next step
# NOTE(review): resources are hardcoded here rather than taken from the global
# job_size / walltime / mem parameters -- confirm this is intended.
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    library("dplyr")
    library("tibble")
    library("readr")
    library("modelr")
    library("purrr")
    # Both inputs are read as tab-delimited tables and must share a gene_ID column.
    pheno = read_delim("$[_input[0]]",delim = "\t")
    region = read_delim("$[_input[1]]",delim = "\t")
    # Keep only genes present in both tables (region columns first), sorted by position.
    output = inner_join(region, pheno , by = "gene_ID")%>%arrange(`#chr`,start)
    # First copy is written without the .gz extension and compressed by the bash action below.
    output%>%write_delim("$[_output[0]:n]",delim = "\t")
    # Second copy stays uncompressed for the next step.
    output%>%write_delim("$[_output[1]]",delim = "\t")

bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    # Always recompress the table just written by R. Skipping bgzip when the
    # .gz already exists would keep a stale archive from an earlier run.
    bgzip -f "$[_output[0]:n]"
    # Options are placed before the file operand; -f overwrites an existing index.
    tabix -f -p bed "$[_output[0]]"


In [None]:
[region_extraction_2]
# Split the joined phenotype table from region_extraction_1 into one bgzipped +
# tabix-indexed BED file per chromosome listed in `chrom`.
# Path to the input molecular phenotype data.
# NOTE(review): this parameter is not referenced in the body of this step -- confirm it is still needed here.
parameter: molecular_pheno_whole = path
input: output_from('region_extraction_1'),for_each = "chrom"
output: f'{wd}/{Prefix}.chr{_chrom}.mol_phe.bed.gz'
# NOTE(review): resources are hardcoded here rather than taken from the global
# job_size / walltime / mem parameters -- confirm this is intended.
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    library("dplyr")
    library("tibble")
    library("readr")
    library("modelr")
    library("purrr")
    # Second output of region_extraction_1: the uncompressed joined table.
    pheno = read_delim("$[_input[1]]",delim = "\t")
    colnames(pheno)[2:3] = c("start","end")
    # Collapse each record to the single-base interval [start, start + 1) and keep
    # the current chromosome only. The chromosome is passed as a quoted string so
    # that non-numeric names (e.g. X) are not parsed as undefined R symbols.
    pheno = pheno%>%mutate(end = start +1)%>%filter(`#chr` %in% "$[_chrom]")
    # Written without the .gz extension and compressed by the bash action below.
    pheno%>%write_delim("$[_output[0]:n]",delim = "\t")

bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    # Always recompress the table just written by R. Skipping bgzip when the
    # .gz already exists would keep a stale archive from an earlier run.
    bgzip -f "$[_output[0]:n]"
    # Options are placed before the file operand; -f overwrites an existing index.
    tabix -f -p bed "$[_output[0]]"