# Merge Covariate
This module merges the output of factor analysis into a single covariate file that can be fed into both APEX and tensorQTL.

### Input
1. factor+cov file as output from the PEER or BiCV factor module. It is assumed to have the columns `#id` + sample names, and each row is a covariate or a factor (factor rows start with `factor_`)

1. pca file as output from the PCA module

### Output
1. PCA + Factor + Covariate file


In [2]:
[global]
# The output directory for generated files. MUST BE FULL PATH
parameter: cwd = path
# The covariate file
parameter: covFile = path
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "2G"
# Number of threads
parameter: numThreads = 8
# Software container option
parameter: container = ""
# Name for the analysis output; empty by default and re-declared from phenoFile in the Residual_Y_1 step
parameter: name = ""
# The number of external covariates to include: -1 includes all of them, 0 includes none of them
# but keeps the header (basically just formatting the PCs).
parameter: nCov = -1
# Tolerance of missingness in covariates: -1 means quit when any value is missing; otherwise covariates with a missing rate
# larger than tol_cov are removed and the remaining missing values are mean-imputed.
parameter: tol_cov = -1

In [None]:
[merge_pca_covariate]
# Merge principal components with the external covariates into a single
# covariate-by-sample table (first column "#id", one column per shared sample).
# The PCA file. an RDS file as the output of the pca module
parameter: pcaFile = path
input: pcaFile, covFile
output:  f'{cwd:a}/{_input[1]:bn}.pca.cov.gz'
task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container
        library("dplyr")
        library("tibble")
        library("readr")

        # Workflow parameters, interpolated once so the logic below is plain R
        n_cov <- $[nCov]
        tol_cov <- $[tol_cov]

        # Fraction of missing entries in a vector
        compute_missing <- function(x) {
          mean(is.na(x))
        }

        # Replace every NA in a covariate-by-sample matrix with the mean of
        # the observed values of the same covariate (row)
        mean_impute <- function(mtx) {
          row_means <- rowMeans(mtx, na.rm = TRUE)
          na_idx <- which(is.na(mtx), arr.ind = TRUE)
          mtx[na_idx] <- row_means[na_idx[, "row"]]
          mtx
        }

        # Drop covariates (rows) whose missing rate exceeds the threshold,
        # then mean-impute whatever is still missing.
        # drop = FALSE keeps the matrix shape when a single row survives.
        filter_mtx <- function(X, missing_rate_thresh) {
          miss_rate <- apply(X, 1, compute_missing)
          keep_row <- which(miss_rate <= missing_rate_thresh)
          mean_impute(X[keep_row, , drop = FALSE])
        }

        # PCs arrive as sample-by-PC; transpose to PC-by-sample so they stack
        # under the covariates
        pca_output <- readRDS("$[_input[0]]")$pc_scores
        mtx <- pca_output %>%
          select(contains("PC")) %>%
          t()
        colnames(mtx) <- pca_output$IID
        mtx <- mtx %>% as_tibble(rownames = "#id")

        cov <- read_delim("$[_input[1]]", delim = "\t")
        colnames(cov)[1] <- "#id"

        ## Retaining only the overlapped samples ("#id" is always shared)
        int <- intersect(colnames(cov), colnames(mtx))
        if (length(setdiff(int, "#id")) == 0) {
          stop("No samples are overlapped in two files!")
        }
        cov <- cov %>% select(all_of(int))

        # keep only the desired amount of covariates; head() cannot index past
        # the last row, and cov[0, ] keeps just the header
        if (n_cov > 0) {
          cov <- head(cov, n_cov)
        } else if (n_cov == 0) {
          cov <- cov[0, ]
        }
        mtx <- mtx %>% select(all_of(int))
        output <- bind_rows(cov, mtx)

        ## Handle missingness in covariates
        if (tol_cov == -1) {
          # No tolerance requested: any missing value is an error
          if (anyNA(output)) {
            stop("NA in covariates/PCs input: Check input file or raise parameter tol_cov to allow for imputation & filtering")
          }
        } else {
          # Filter and impute on the numeric part only; the id column is
          # carried through as row names and restored afterwards
          values <- output %>%
            select(-all_of("#id")) %>%
            as.matrix()
          rownames(values) <- output[["#id"]]
          output <- filter_mtx(values, tol_cov) %>% as_tibble(rownames = "#id")
        }
        write_delim(output, "$[_output]", delim = "\t")

In [None]:
[Residual_Y_1]
# Regress the covariates out of the molecular phenotypes and write the residuals as a BED file.
# Path to the input molecular phenotype data.
parameter: phenoFile = path
# name for the analysis output
parameter: name = f'{phenoFile:bnn}'
input: phenoFile,covFile
output: f'{cwd}/{name}.resid.bed'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand = "${ }", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout' , container = container

    library(dplyr)
    library(readr)

    pheno_tbl <- read_delim(${_input[0]:r}, delim = "\t")
    # Covariate rows containing NA are dropped (issue 199, subject to change)
    cov_tbl <- na.omit(read_delim(${_input[1]:r}, delim = "\t"))

    # Columns (samples) present in both tables
    shared_samples <- intersect(colnames(pheno_tbl), colnames(cov_tbl))
    if (length(shared_samples) == 0) {
      stop("No samples are overlapped in two files!")
    }

    # Report the samples that enter the regression
    print("Listed samples are included in the analysis:")
    print(shared_samples)

    # Design matrix: samples in rows, covariates in columns
    design <- t(as.matrix(cov_tbl[, shared_samples]))
    # The first four phenotype columns are carried through unchanged
    pheno_id <- select(pheno_tbl, 1:4)
    # Response matrix: samples in rows (same order as the design), features in columns
    response <- t(as.matrix(select(pheno_tbl, rownames(design))))

    # Least-squares fit with an intercept column; keep only the residuals
    fit <- .lm.fit(x = cbind(1, design), y = response)
    pheno_output <- cbind(pheno_id, t(fit$residuals))
    write_delim(pheno_output, ${_output[0]:r}, delim = "\t")

# Compress the residual file with bgzip and index it with tabix
[Residual_Y_2]
# Runs on the output of the previous step; the result keeps the input name with ".gz" appended.
# bgzip is called with -f (force) and tabix with the "bed" preset; the .tbi index that tabix
# writes is not declared as a step output.
output: f'{_input}.gz'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    bgzip -f ${_input}
    tabix -p bed ${_output}

In [3]:
[merge_PEER_covariate]
# Stack the PEER factors under the covariate(+PCA) rows, keeping only the columns shared by both files.
# The PEER factor file, tab-delimited; assumed to share the id column name and sample columns with covFile
parameter: peerFile = path
input: peerFile, covFile
output:  f'{cwd:a}/{_input[1]:bn}.PEER.cov.gz'
task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container
        library("dplyr")
        library("readr")
        peer_res <- read_delim("$[_input[0]]", delim = "\t")
        cov_pca <- read_delim("$[_input[1]]", delim = "\t")

        # Columns present in both tables, in the order of the PEER file
        com_col <- intersect(colnames(peer_res), colnames(cov_pca))

        # The first covariate column is the id column; fail loudly when nothing
        # else is shared instead of writing an id-only file
        sample_col <- setdiff(com_col, colnames(cov_pca)[1])
        if (length(sample_col) == 0) {
          stop("No samples are overlapped in two files!")
        }

        # Covariate rows first, PEER factor rows after them
        merged <- rbind(cov_pca[, com_col], peer_res[, com_col])
        write_delim(merged, "$[_output]", delim = "\t")