# Post-APA calling: Imputation and QC
## Aim
This notebook imputes the missing values in the PDUI matrix and then applies a rank-based quantile normalization (inverse normal transform) to the imputed values of each gene.
## Input
* raw PDUI matrix (rows are genes, columns are sample IDs)
* covariate file  

## Output
* PDUI matrix without missingness  
  - Missing values are imputed with `impute.knn` from the `impute` package

## Minimum working example

In [None]:
# Minimal example: run the APAimpute step for chr1 only.
# Every argument line except the last must end with a backslash so the shell
# treats the whole invocation as one command.
# NOTE(review): no step visible in this notebook declares a `cov` parameter --
# confirm that `--cov` is still needed.
sos run /mnt/mfs/statgen/ls3751/github/xqtl-pipeline/pipeline/molecular_phenotypes/QC/apa_impute.ipynb APAimpute \
    --cwd /mnt/mfs/statgen/ls3751/MWE_dapars2/Output \
    --cov /data/example.cov.txt \
    --chrlist chr1 \
    --container /mnt/mfs/statgen/ls3751/container/dapars2.sif

## Workflow implementation

In [6]:
[global]
# Workflow-wide parameters shared by every step in this notebook; the example
# invocations in this notebook set them with `--<name> <value>` on `sos run`.
# walltime, mem, ncore, thread and job_size are not referenced by any step
# visible here -- presumably reserved for task/job submission; TODO confirm.
parameter: walltime = '40h'
parameter: mem = '32G'
parameter: ncore = 16
# Working directory, declared by type only (no default value). Step inputs and
# outputs are resolved under {cwd}/apa_<chr>/.
parameter: cwd = path
parameter: thread = 8
parameter: job_size = 1
# Container image passed to the R actions via `container = container`.
# NOTE(review): the empty default presumably means "run without a container" -- confirm.
parameter: container = ''

In [7]:
[APAimpute]
# Impute missing PDUI values and rank-normalize each gene, one chromosome per substep.
#
# Input : {cwd}/apa_<chr>/Dapars_result_result_temp.<chr>.txt
#         (first four columns are annotation, including `Gene` and `Loci`;
#          every remaining column is one sample)
# Output: {cwd}/apa_<chr>/Dapars_result_impute_<chr>.bed
#         (columns `#chr`, `start`, `end`, `Gene`, then one column per sample)
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_result_temp.{x}.txt' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_{x}.bed' for x in chrlist], group_by = 1
R: expand= "${ }", container = container

    .libPaths(c('/usr/local/lib/R/site-library', '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths()))
    # library() stops immediately when a package this step depends on is missing;
    # require() would only return FALSE and fail later with an obscure error.
    suppressPackageStartupMessages(library(dplyr))
    suppressPackageStartupMessages(library(impute))
    suppressPackageStartupMessages(library(stringr))
    # tidyr is not called directly below; it stays a best-effort load as before.
    suppressPackageStartupMessages(require(tidyr))

    # Read the data
    input_dir <- ${_input:r}
    dapars_result <- data.table::fread(input_dir) ## much faster than read.table

    # ---- Annotation: split `Loci` ("<chr>:<start>-<end>") into three columns ----
    # Done once over the whole column instead of growing vectors row by row.
    annot <- dapars_result[, 1:4]
    loci_parts <- str_split(annot$Loci, ":")
    chrom_vec <- vapply(loci_parts, function(p) p[1], character(1))
    range_parts <- str_split(vapply(loci_parts, function(p) p[2], character(1)), "-")
    start_vec <- vapply(range_parts, function(p) p[1], character(1))
    end_vec <- vapply(range_parts, function(p) p[2], character(1))
    annot <- annot %>%
      mutate(`#chr` = chrom_vec, start = start_vec, end = end_vec) %>%
      select(`#chr`, start, end, Gene)

    # ---- Sample matrix: genes in rows, samples in columns ----
    gene_ids <- pull(dapars_result, Gene)
    pdui <- dapars_result[, -c(1:4)]
    # Column headers are file paths: keep the last path component, then the
    # part before its first underscore, as the sample ID.
    sample_ids <- sub("_.*$", "", sub("^.*/", "", colnames(pdui)))
    pdui <- as.matrix(pdui)
    colnames(pdui) <- sample_ids
    rownames(pdui) <- gene_ids

    # Drop samples with more than 80% missing values, then genes with at least
    # 50% missing values. drop = FALSE keeps a matrix even when only a single
    # row or column survives the filter.
    pdui <- pdui[, colMeans(is.na(pdui)) <= 0.8, drop = FALSE]
    pdui <- pdui[rowMeans(is.na(pdui)) < 0.5, , drop = FALSE]
    if (nrow(pdui) == 0 || ncol(pdui) == 0) {
      stop("No genes or samples left after missingness filtering in ", input_dir,
           call. = FALSE)
    }

    storage.mode(pdui) <- "double"
    dapars_impute <- impute.knn(pdui)
    imputed <- dapars_impute$data

    # ---- Per-gene rank-based inverse normal transform ----
    # rank / (n + 1) maps each value into (0, 1); qnorm turns that into a normal
    # quantile. Working on the matrix avoids slow data.frame row assignment.
    n_samples <- ncol(imputed)
    normalized <- imputed
    for (g in seq_len(nrow(imputed))) {
      normalized[g, ] <- qnorm(rank(imputed[g, ], ties.method = "average") / (n_samples + 1))
    }

    df <- as.data.frame(normalized)
    df$Gene <- rownames(pdui)
    # Join on Gene explicitly so a sample column that happens to share a name
    # with an annotation column can never become part of the join key.
    final_data <- inner_join(annot, df, by = "Gene")
    # NOTE(review): write.table's default separator is a single space, so this
    # .bed file is space-delimited. Left unchanged to preserve the existing
    # output format -- confirm whether downstream tools expect tabs.
    write.table(final_data, file = ${_output:r}, quote = FALSE, row.names = FALSE)

In [8]:
[APArename_1]
# Rename the sample columns of each imputed BED file using a lookup table.
#
# match : table with a `ProjID` column holding the current sample IDs; its
#         second column supplies the replacement ID.
# Input : {cwd}/apa_<chr>/Dapars_result_impute_<chr>.bed
#         (columns 1-4 are annotation, columns 5.. are samples)
# Output: {cwd}/apa_<chr>/Dapars_result_impute_renamed_<chr>.bed
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_{x}.bed' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
R: expand= "${ }", container = container

    .libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths()))
    # Neither package is called directly in this step; kept as best-effort loads.
    suppressPackageStartupMessages(require(dplyr))
    suppressPackageStartupMessages(require(tidyr))

    input_dir <- ${_input:r}

    df <- data.table::fread(input_dir)
    ref <- data.table::fread("${match}", colClasses = 'character')

    if (!"ProjID" %in% colnames(ref) || ncol(ref) < 2) {
      stop("The match table needs a 'ProjID' column and a second column with the new IDs",
           call. = FALSE)
    }
    if (ncol(df) < 5) {
      stop("Expected four annotation columns followed by sample columns in ", input_dir,
           call. = FALSE)
    }

    # Vectorized lookup: position of each sample ID in ref$ProjID (first match
    # wins), then the replacement ID from the second column of the table.
    sample_cols <- 5:ncol(df)
    old_ids <- colnames(df)[sample_cols]
    new_ids <- ref[[2]][match(old_ids, ref$ProjID)]

    # An ID without a usable replacement used to be renamed to the literal text
    # "character(0)". Keep the original ID instead and report it.
    unmatched <- is.na(new_ids)
    if (any(unmatched)) {
      warning("No replacement ID found for: ",
              paste(old_ids[unmatched], collapse = ", "),
              "; original IDs kept", call. = FALSE)
      new_ids[unmatched] <- old_ids[unmatched]
    }

    colnames(df)[sample_cols] <- new_ids
    write.table(df, file = ${_output:r}, quote = FALSE, row.names = FALSE)

In [9]:
[APArename_2]
# Write a bgzip-compressed copy of each renamed BED file next to the original.
#
# Input : {cwd}/apa_<chr>/Dapars_result_impute_renamed_<chr>.bed
# Output: {cwd}/apa_<chr>/Dapars_result_impute_renamed_<chr>.bed.gz
# `match` is declared here but not referenced by this step's script.
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed.gz' for x in chrlist], group_by = 1
bash: expand= "${ }"
    # Stream the compressed data to the declared output so the input file is
    # never removed. The previous form deleted the input with bgzip and then
    # relied on gzip's implicit ".gz" suffix lookup to re-create it.
    bgzip -c ${_input:r} > ${_output:r}

In [29]:
[APArandom]
# Restore the plain-text BED file from its compressed copy, keeping the .gz.
#
# Input : {cwd}/apa_<chr>/Dapars_result_impute_renamed_<chr>.bed.gz
# Output: {cwd}/apa_<chr>/Dapars_result_impute_renamed_<chr>.bed
# `match` is declared here but not referenced by this step's script.
parameter: match = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed.gz' for x in chrlist], group_by = 1
output: [f'{cwd}/apa_{x}/Dapars_result_impute_renamed_{x}.bed' for x in chrlist], group_by = 1
bash: expand= "${ }"
    # --force overwrites an existing .bed; --keep leaves the .gz in place.
    gzip --decompress --force --keep ${_input:r}

In [28]:
# Run APArandom on chr1-chr22 with the wig/DLPFC working directory.
sos run /mnt/mfs/statgen/ls3751/github/xqtl-pipeline/code/molecular_phenotypes/QC/apa_impute.ipynb APArandom \
    --cwd /mnt/mfs/statgen/ls3751/aqtl_analysis/wig/DLPFC \
    --chrlist chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 \
        chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 \
    --match /mnt/mfs/statgen/ls3751/aqtl_analysis/ROSMAP_APA_matchtable.txt \
    --container /mnt/mfs/statgen/ls3751/container/dapars2_final.sif

KeyboardInterrupt


In [None]:
# Run APArandom on chr1-chr22 with the wig/AC working directory.
sos run /mnt/mfs/statgen/ls3751/github/xqtl-pipeline/code/molecular_phenotypes/QC/apa_impute.ipynb APArandom \
    --cwd /mnt/mfs/statgen/ls3751/aqtl_analysis/wig/AC \
    --chrlist chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 \
        chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 \
    --match /mnt/mfs/statgen/ls3751/aqtl_analysis/ROSMAP_APA_matchtable.txt \
    --container /mnt/mfs/statgen/ls3751/container/dapars2_final.sif

In [None]:
# Run APArandom on chr1-chr22 with the wig/PCC working directory.
sos run /mnt/mfs/statgen/ls3751/github/xqtl-pipeline/code/molecular_phenotypes/QC/apa_impute.ipynb APArandom \
    --cwd /mnt/mfs/statgen/ls3751/aqtl_analysis/wig/PCC \
    --chrlist chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 \
        chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 \
    --match /mnt/mfs/statgen/ls3751/aqtl_analysis/ROSMAP_APA_matchtable.txt \
    --container /mnt/mfs/statgen/ls3751/container/dapars2_final.sif