# Phenotype data formatting


This is the region extraction step of the data processing pipeline for the xQTL workflow. It generates:
1. Molecular phenotype files per chromosome, restricted to the selected regions, in the format that APEX and tensorQTL accept

### Input
The input for this workflow is the collection of data for one condition, as described in the README of this git repo:
1. One complete residual molecular phenotype data file
2. One region list
Both of these inputs can be generated by the annotation module of this pipeline

### Output
For each collection, the output is:
1. A list of phenotype files (bed + index), one per chromosome, suitable to be fed into both APEX and tensorQTL, annotated with chrom and pos
2. A list of phenotype files (bed + index), one per gene, annotated with chrom and TSS

In [None]:
# Example invocation: run the `reformat` workflow of this notebook in the background (nohup ... &)
# on a test region list and phenotype file, writing the outputs to the current directory.
nohup sos run /home/hs3163/GIT/xqtl-pipeline/pipeline/data_preprocessing/phenotype/phenotype_formatting.ipynb reformat \
--region_list /home/hs3163/GIT/ADSPFG-xQTL/MWE/mwe_region_long \
--molecular_pheno_whole /mnt/mfs/statgen/xqtl_workflow_testing/success_example/testing_10/Data_Processing/Phenotype/AC.mol_phe.bed  \
--wd ./  \
--name "Dry" --container "/mnt/mfs/statgen/containers/apex.sif" &

In [2]:
[global]
import os
# Work directory & output directory
parameter: wd = path
# Container image in which the bash and R scripts of this workflow are executed
parameter: container = 'gaow/twas'
# Name of the analysis; used as the prefix of the output file names
parameter: name= 'ROSMAP'
# An index text file with 4 columns specifying the chr, start, end and names of regions to analyze
parameter: region_list = path
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Path to the input molecular phenotype data.
parameter: molecular_pheno_whole = path
# Prefix of the output file names (same value as `name`)
Prefix = name
# Tab-delimited population file with an IID column, joined to the samples in the for_pca step;
# the string "None" means no file is given and an artificial population label is used instead
parameter: pop_file = "None"
# Parse the region list into one list of fields per region, skipping blank lines and
# lines starting with '#'. The file is read inside `with` so the handle is closed.
with open(region_list) as region_handle:
    regions = [x.strip().split() for x in region_handle if x.strip() and not x.strip().startswith('#')]
# Get the unique chromosomes that have regions to be analyzed.
def extract(lst):
    """Return the first field (the chromosome) of every entry in lst."""
    return [item[0] for item in lst]
# Sorted so that the order is deterministic: the iteration order of a set of strings can
# change from one interpreter process to another, and later steps pair `chrom`
# positionally with the per-chromosome files.
chrom = sorted(set(extract(regions)))

## Process of molecular phenotype file
This workflow produces a bed + tabix index file for all the molecular phenotype data included in the region list, to feed into downstream analysis

In [None]:
[reformat_1,partition_by_chrom_1]
# Split the whole phenotype file into one bgzipped, tabix-indexed bed file per chromosome
# found in the region list. The input is queried with tabix, so it must be bgzipped and indexed.
input: molecular_pheno_whole, for_each = "chrom"
output: f'{wd:a}/{Prefix}.chr{_chrom}.mol_phe.bed.gz'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h', mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
bash: container = container, expand = "$[ ]", stdout = f'{_output[0]}.stdout', stderr = f'{_output[0]}.stderr'
    # Start the uncompressed output with the header line of the input
    zcat $[_input] | head -n 1 > $[_output:n]
    # Append the records of the current chromosome
    tabix $[_input] $[_chrom] >> $[_output:n]
    # Compress the result and build its tabix index, overwriting existing files
    bgzip -f $[_output:n]
    tabix -f -p bed $[_output]

In [None]:
[reformat_2,partition_by_chrom_2]
# Write a tab-delimited recipe file listing each chromosome next to its per-chromosome phenotype file.
input: group_by = "all"
output: f'{wd:a}/{name}.processed_phenotype.per_chrom.recipe'
import pandas as pd
# The two columns are paired by position.
# NOTE(review): this presumes the files in _input arrive in the same order as `chrom` - confirm.
recipe = pd.DataFrame({"#chr": chrom, "#dir": _input})
recipe.to_csv(_output, sep = "\t", index = False)

In [None]:
[reformat_3, for_pca]
# Reshape the phenotype bed file into a sample-by-gene table with one population label per
# sample; judging by the step and output names this table is meant as input for PCA.
input: molecular_pheno_whole
output: f'{wd:a}/{Prefix}.for_pca.mol_phe.exp'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    library("dplyr")
    library("tibble")
    library("readr")
    library("purrr")
    phenoFile = read_delim($[_input:r],"\t")
    # Keep columns 5 onward (presumably one column per sample - verify against the input format)
    # and label each row with its gene ID
    mtx = phenoFile[,5:ncol(phenoFile)]
    rownames(mtx) = phenoFile$gene_ID
    # each row is a sample each column is a gene
    mtx <- t(as.matrix(mtx, rownames = T))
    mtx = mtx%>%as_tibble(rownames = "IID")
    if("$[pop_file]" == "None"){
      # No population file given: make an artificial population label
      output = mtx%>%mutate(RACE = "RACE_1")
    }else{
      # Keep only the samples present in both the population file and the phenotype data
      pop = read_delim("$[pop_file]", "\t")
      output = inner_join(pop,mtx,by = "IID")
    }
    output%>%write_delim("$[_output]","\t")

In [None]:
[partition_by_gene_1]
# Extract the phenotype records of each region in the region list (matched by the region
# name in its 4th column) into its own bgzipped, tabix-indexed bed file.
input: molecular_pheno_whole ,for_each = "regions"
output: f'{wd:a}/{Prefix}.{_regions[3]}.mol_phe.bed.gz'
task: trunk_workers = 1, trunk_size = 1, walltime = '48h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    # Start the uncompressed output with the header line of the input
    zcat $[_input] | head -1 > $[_output:n]
    # -F matches the region name as a literal string rather than a regular expression (a "."
    # in an ID would otherwise match any character); -w requires a whole-word match so that an
    # ID that is a prefix of a longer ID does not pull in the other gene's records; "--" and the
    # quotes keep a name with unusual characters from being read as an option or split by the shell.
    zcat $[_input] | grep -w -F -- "$[_regions[3]]" >> $[_output:n]
    # Compress the result and build its tabix index, overwriting existing files
    bgzip -f $[_output:n]
    tabix -f -p bed $[_output]

In [None]:
[partition_by_gene_2]
# Write a tab-delimited recipe file listing each region name next to its per-region phenotype file.
input: group_by = "all"
output: f'{wd:a}/{name}.processed_phenotype.per_gene.recipe'
import pandas as pd
# The region name is the 4th field of each entry of the region list
region_names = [fields[3] for fields in regions]
# The two columns are paired by position.
# NOTE(review): this presumes the files in _input arrive in the same order as `regions` - confirm.
recipe = pd.DataFrame({"region": region_names, "dir": _input})
recipe.to_csv(_output, sep = "\t", index = False)