# Rosmap Gene expression data reformatting
Pipeline that reformats and annotates the gene expression file for the ROSMAP dataset.

## Usage Example

In [1]:
sos run /Users/haosun/Documents/WG_Reasearch_Assisstant/GIT/freshcopy/neuro-twas/Workflow/SOS_ROSMAP_gene_exp_processing.ipynb \
    --ref "/Users/haosun/Documents/WG_Reasearch_Assisstant/Rosmap/Data/new_gene_keys_WGS_plink.txt" \
    --name_col 2 \
    --real_name_col 1 \
    --gene_exp "/Users/haosun/Documents/WG_Reasearch_Assisstant/Rosmap/Data/gene_exp/geneCountsResidualsPlusBaselineAgeGenderAdj.txt" \
    --start_at 2 \
    --output "/Users/haosun/Documents/WG_Reasearch_Assisstant/Rosmap/Data/gene_exp/" \
    -j 2

Keyboard Interrupt


## Parameters

In [73]:
[global]
# This pipeline changes the patient names of a gene_exp file based on an index reference file.
# Each comment below describes the parameter that follows it.
#
# Index table containing the patient ID used in the gene_exp file and the patient ID used in the plink file
parameter: ref = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Rosmap/Data/new_gene_keys_plink.txt"
# Column position in the index that holds the IDs matching the gene_exp header
parameter: name_col = 2
# Column position in the index that holds the plink patient ID (the new column name)
parameter: real_name_col = 1
# Gene expression file whose patient IDs are to be changed
parameter: gene_exp = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Rosmap/Data/gene_exp/geneCountsResidualsPlusBaselineAgeGenderAdj.txt"
# Column of gene_exp where the patient IDs start; earlier columns are kept as they are
parameter: start_at = 2

# Directory where the output files are stored
parameter: output = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Rosmap/Data/"

# Container option for software to run the analysis: docker or singularity
parameter: container = '/home/hs3163/system_file/twas_latest.sif'

## Rename the patient IDs in the gene expression file

In [None]:
[Processing_1]
# Rename the sample columns of the gene expression table using the index file.
# Output 0: the expression table with renamed sample columns (tab separated).
# Output 1: the expression column IDs that have no entry in the index and were
#           therefore dropped (space separated).
input: ref, gene_exp
output: f'{output}/{_input[1]:bn}_rename.txt',
        f'{output}/{_input[1]:bn}_discardID.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '6G', tags = f'{step_name}_{_output[0]:bn}'
R:  expand= "$[ ]" , stderr = f'{_output[1]:n}.stderr', stdout = f'{_output[1]:n}.stdout',container = container
    library(dplyr)
    gene_exp <- readr::read_delim("$[_input[1]]", delim = " ")
    index <- readr::read_delim("$[_input[0]]", delim = "\t")
    # Lookup table: Name = ID used in the gene_exp header, Real_name = ID it is renamed to
    name <- data.frame(
      Name = as.character(pull(index, $[name_col])),
      Real_name = as.character(pull(index, $[real_name_col])),
      stringsAsFactors = FALSE
    )
    # Columns before start_at are annotation columns and are kept untouched.
    # seq_len() states the intent directly and also copes with start_at = 1.
    annot <- gene_exp[, seq_len($[start_at] - 1)]
    df <- gene_exp[, $[start_at]:ncol(gene_exp)]
    # For every sample column: row of the lookup table with the same ID, NA if absent
    pos <- match(colnames(df), name$Name)
    matched <- !is.na(pos)
    # The unmatched entries refer to columns of df, so the discarded IDs must be
    # taken from colnames(df), not from the lookup table
    discarded <- colnames(df)[!matched]
    # Keep only the matched columns (original order) and give them their new IDs
    df <- df[, matched]
    colnames(df) <- name$Real_name[pos[matched]]
    final <- cbind(annot, df)
    # File is passed by position so the call works whether readr names it path or file
    readr::write_tsv(final, "$[_output[0]]", na = "NA", append = FALSE, col_names = TRUE)
    cat(discarded, file = "$[_output[1]]", sep = " ")

## Add genomic positions to gene IDs based on Ensembl

In [None]:
[Processing_2]
# Look up the Ensembl coordinates of the genes listed in the gene_ID column of
# the first input file and write them as a region list with the columns
# #chr, start_position, end_position and gene_ID.
input: group_by = 2
output: f'{output}/{_input[0]:bn}_region_list.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '6G', tags = f'{step_name}_{_output[0]:bn}'
R:  expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout',container = container
    library("biomaRt")
    library(dplyr)
    gene_exp <- readr::read_delim("$[_input[0]]", delim = "\t")
    # Fetch ID, chromosome, start and end of every gene in the human Ensembl dataset
    ensembl <- useDataset("hsapiens_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL"))
    ensembl_df <- getBM(
      attributes = c("ensembl_gene_id", "chromosome_name", "start_position", "end_position"),
      mart = ensembl
    )
    my_genes <- gene_exp$gene_ID
    # Chromosome names are compared as text.
    # NOTE(review): Ensembl names the sex chromosomes "X" and "Y", so "23" is not
    # expected to match anything and X/Y/MT genes are dropped - confirm this is intended.
    keep_chr <- as.character(1:23)
    # match() yields an all-NA row for genes unknown to Ensembl; both filters drop those rows
    my_genes_ann <- ensembl_df[match(my_genes, ensembl_df$ensembl_gene_id), ] %>%
      filter(chromosome_name %in% keep_chr) %>%
      dplyr::select("#chr" = chromosome_name, start_position, end_position, "gene_ID" = ensembl_gene_id) %>%
      filter(!is.na(gene_ID))
    # File is passed by position so the call works whether readr names it path or file
    readr::write_tsv(my_genes_ann, "$[_output]", na = "NA", append = FALSE, col_names = TRUE)