# Post-APA calling: Imputation and QC
## Aim
This notebook imputes the missing values in the PDUI matrix and then applies quantile normalization to the imputed values.
## Input
* raw PDUI matrix (rows are genes, columns are sample IDs)
* covariate file  

## Output
* PDUI matrix without missingness  
  - Missing values are imputed with `impute.knn` from the `impute` package

## Minimum working example

In [None]:
# Run the APAimpute step for chr1. Every option line except the last must end
# with a backslash so the shell passes all options to one `sos run` command.
sos run /mnt/mfs/statgen/ls3751/github/xqtl-pipeline/pipeline/molecular_phenotypes/QC/apa_impute.ipynb APAimpute \
    --cwd /mnt/mfs/statgen/ls3751/MWE_dapars2/Output \
    --cov /data/example.cov.txt \
    --chrlist chr1 \
    --container /mnt/mfs/statgen/ls3751/container/dapars2.sif

## Workflow implementation

In [6]:
[global]
# Workflow-wide parameters; each can be overridden on the command line as --<name>.
# walltime, mem, ncore, thread and job_size are not referenced by the step shown
# in this notebook -- presumably resource hints for job submission (TODO confirm).
parameter: walltime = '40h'
parameter: mem = '32G'
parameter: ncore = 16
# Working directory (required, no default); the APAimpute step builds its input
# and output file paths under it.
parameter: cwd = path
parameter: thread = 8
parameter: job_size = 1
# Passed as the `container` option of the R action in APAimpute; empty by default.
parameter: container = ''

In [11]:
[APAimpute]
# Impute missing PDUI values with impute.knn, then apply a rank-based
# inverse-normal transform to every gene across samples. One input file is
# processed per chromosome and one output file is written per chromosome.
#   cov     : covariate table; only its column names are used, to select the
#             samples that are kept for imputation
#   chrlist : chromosome labels used to build the input and output file names
parameter: cov = path
parameter: chrlist = list
input: [f'{cwd}/apa_{x}/Dapars_result_result_temp.{x}.txt' for x in chrlist], group_by = 1
output: [f'{cwd}/Dapars_result_clean_{x}.txt' for x in chrlist], group_by = 1
R: expand= "${ }", container = container

.libPaths( c('/usr/local/lib/R/site-library' , '/usr/lib/R/site-library', '/usr/lib/R/library', .libPaths()))
# library() stops immediately when a required package is missing.
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(impute))
# tidyr is not referenced below; the original non-fatal load is kept as is.
suppressPackageStartupMessages(require(tidyr))

# File paths are interpolated by SoS as quoted R string literals.
input_path <- ${_input:r}
cov_path <- ${cov:r}
output_path <- ${_output:r}

# Read as a plain data.frame so that base-R column indexing applies.
dapars_result <- data.table::fread(input_path, data.table = FALSE)
if (ncol(dapars_result) < 5) {
  stop("Expected 4 annotation columns followed by sample columns in ",
       input_path, call. = FALSE)
}

# The first four columns are annotation; the first one is the join key.
key_col <- colnames(dapars_result)[1]
annotation <- dapars_result[, 1:4]
gene_ids <- dapars_result[[1]]

# impute.knn works on a numeric matrix (genes in rows, samples in columns).
pdui <- as.matrix(dapars_result[, -(1:4), drop = FALSE])
storage.mode(pdui) <- "double"

# Drop samples with more than 80% missing values ...
pdui <- pdui[, colMeans(is.na(pdui)) <= 0.8, drop = FALSE]
if (ncol(pdui) == 0) {
  stop("No sample passes the missingness filter in ", input_path,
       call. = FALSE)
}
# ... then genes with 50% or more missing values. gene_ids is filtered in
# step with the rows so that it stays aligned with the matrix.
keep_gene <- rowMeans(is.na(pdui)) < 0.5
pdui <- pdui[keep_gene, , drop = FALSE]
gene_ids <- gene_ids[keep_gene]

# Keep only the samples that appear as column names of the covariate table,
# and impute on that subset (previously the subset was computed but unused).
covs <- data.table::fread(cov_path, data.table = FALSE)
in_cov <- colnames(pdui) %in% colnames(covs)
if (!any(in_cov)) {
  stop("No sample in ", input_path, " matches a column name of ", cov_path,
       call. = FALSE)
}
pdui <- pdui[, in_cov, drop = FALSE]

imputed <- impute.knn(pdui)$data

# Rank-based inverse-normal transform, gene by gene: average ranks across
# samples are scaled into (0, 1) and mapped through the normal quantile function.
n_samples <- ncol(imputed)
normalized <- imputed
for (i in seq_len(nrow(imputed))) {
  gene_rank <- rank(imputed[i, ], ties.method = "average")
  normalized[i, ] <- qnorm(gene_rank / (n_samples + 1))
}

# Re-attach the annotation columns through an explicit join key.
df <- as.data.frame(normalized)
df[[key_col]] <- gene_ids
final_data <- inner_join(df, annotation, by = key_col)
write.table(final_data, file = output_path, quote = FALSE)