# In [None]:


library(optparse)

# Log the start time of the run.
message(Sys.time())

# optparse specification of the command-line options this script understands.
# Each option has a type, a default of 0, a help string and a metavar shown in
# the usage text. The metavar of --taskid is "integer" to agree with its
# declared type.
option_list <- list(
    make_option("--config_file"   , type = "character", default = 0, help = "config file"           , metavar = "character"),
    make_option("--step"          , type = "character", default = 0, help = "pipeline step (2 or 4)", metavar = "character"),
    make_option("--prefix"        , type = "character", default = 0, help = "output folder"         , metavar = "character"),
    make_option("--taskid"        , type = "integer"  , default = 0, help = "SGE task ID"           , metavar = "integer"),
    make_option("--functions_file", type = "character", default = 0, help = "functions file"        , metavar = "character")
)

# NOTE(review): argument parsing is currently disabled, so `option_list` is not
# consumed in this part of the file; the run parameters are assigned directly
# further down instead. Re-enable the two lines below to read them from the
# command line.
# opt_parser        = OptionParser(option_list=option_list)
# opt               = parse_args(opt_parser)

# Run parameters, assigned directly rather than taken from parsed
# command-line options.
functions_file <- "/projects/CARDIPS/analysis/epigenome_resource/haqtls/scripts/functions.R"
config_file    <- "/projects/CARDIPS/analysis/epigenome_resource/haqtls/CVPC/notebooks/qtl.config.sh"
pp_step        <- "step_5"          # folder name of this step under config$out_folder
prefix         <- "qtl_by_element"  # analysis sub-folder used for inputs and outputs
taskid         <- 1                 # index into the list of elements to process

# Load the project helper functions (parse_config is called right below;
# presumably the other helpers used later in the script come from here too).
source(functions_file)

# Parse the pipeline configuration; `config$out_folder` is used throughout
# the rest of the script.
config <- parse_config(config_file)

# Run the regression QTL scan for the single element selected by `taskid`.
#
# Outline of what the code below does:
#   1. Reads the step_4 inputs (qtl_input.rds, qtl.to_regress_v2.txt,
#      qtl.no_mhc.txt) and keeps the QTLs flagged for regression.
#   2. Picks the element at position `taskid` among the remaining elements.
#   3. Loads that element's step_1 phenotype and genotype tables; if any of the
#      three files is missing the element is skipped (a message is logged).
#   4. Appends the genotypes of the variants listed for this element in the
#      QTL table to the covariates, and removes those variants from the
#      genotype / variant tables that are tested.
#   5. Writes the temporary inputs, copies the kinship matrix, calls
#      write_h5_file() and run_regression_qtl(), and saves the `qtl` and `lead`
#      tables they return under <out_folder>/<pp_step>/<prefix>/qtl.
#
# Any error is logged together with the task id and then re-raised, so the
# job still fails visibly.
#
# NOTE(review): every variable assigned before the write_h5_file() /
# run_regression_qtl() calls keeps its original name (including `bed_file`,
# which is not used here) because the sourced helpers may read them from the
# calling environment - confirm before removing any of them.
tryCatch(
{
    # ---- step_4 inputs ---------------------------------------------------
    step4_folder      <- file.path(config$out_folder, "step_4", "qtl_by_element")
    qtl_input_list    <- readRDS(file.path(step4_folder, "qtl_input.rds"))

    metadata          <- qtl_input_list$metadata
    regress_df        <- fread(file.path(step4_folder, "qtl.to_regress_v2.txt"), sep = "\t", data.table = FALSE) %>%
                             filter(new_condition != 0)
    tissue            <- unique(regress_df$tissue)

    qtl_df            <- fread(file.path(step4_folder, "qtl.no_mhc.txt"), sep = "\t", data.table = FALSE) %>%
                             filter(new_egene == TRUE & type < 4) %>%
                             mutate(qtl_id = paste(tissue, element_id, type, sep = "_")) %>%
                             arrange(element_id, type)
    # Keep only the QTLs that are also listed in the regression table.
    qtl_df            <- qtl_df[qtl_df$qtl_id %in% regress_df$qtl_id, ]

    phenotype_ids     <- qtl_input_list$phenotype_ids
    phenotype_info    <- qtl_input_list$phenotype_info
    element_ids       <- unique(qtl_df$element_id)

    # ---- element selection -----------------------------------------------
    # Fail with an explicit message instead of "subscript out of bounds".
    if (length(taskid) != 1 || is.na(taskid) || taskid < 1 || taskid > length(element_ids))
    {
        stop("taskid ", taskid, " does not select any of the ", length(element_ids), " element(s) to process", call. = FALSE)
    }
    element_id        <- element_ids[[taskid]]

    message(paste("Running regression QTL for", element_id))
    out_folder        <- file.path(config$out_folder, pp_step, prefix, "qtl")
    tmp_folder        <- file.path(config$out_folder, pp_step, prefix, "tmp", element_id)

    # recursive = TRUE also creates the intermediate <pp_step>/<prefix> and
    # <pp_step>/<prefix>/tmp folders; existing folders are not reported.
    dir.create(out_folder, recursive = TRUE, showWarnings = FALSE)
    dir.create(tmp_folder, recursive = TRUE, showWarnings = FALSE)

    # ---- per-element step_1 inputs ---------------------------------------
    exp_data_file     <- file.path(config$out_folder, "step_1", "phenotype", "by_element", paste(           element_id, "txt", sep = "."))
    gtinfo_file       <- file.path(config$out_folder, "step_1", "genotype" , "by_element", paste("gt_info", element_id, "txt", sep = "."))
    gtdata_file       <- file.path(config$out_folder, "step_1", "genotype" , "by_element", paste("gt_data", element_id, "txt", sep = "."))

    input_files       <- c(exp_data_file, gtinfo_file, gtdata_file)
    input_found       <- file.exists(input_files)

    if (all(input_found))
    {
        # add_rownames is not defined in this part of the file - presumably it
        # comes from functions_file and moves an id column into the row names
        # (the row-name indexing below relies on that); TODO confirm.
        expdata           <- add_rownames(fread(exp_data_file, sep = "\t", header = TRUE, data.table = FALSE))
        gtinfo            <-              fread(gtinfo_file  , sep = "\t", header = TRUE, data.table = FALSE)
        gtdata            <- add_rownames(fread(gtdata_file  , sep = "\t", header = TRUE, data.table = FALSE))

        # Transpose the phenotype table and keep its "norm" row for the
        # phenotype ids of this analysis; that single row is renamed "trait".
        expdata           <- as.data.frame(t(as.matrix(expdata)))["norm", phenotype_ids]
        # Select genotype columns by genotype id and relabel them with the
        # metadata row names.
        gtdata            <- gtdata[, metadata$genotype_id]
        colnames(gtdata)  <- rownames(metadata)
        rownames(gtinfo)  <- gtinfo$id
        rownames(expdata) <- c("trait")
        phenotype_info    <- phenotype_info[element_id, ]
        geneloc           <- phenotype_info[, c("element_id", "chrom", "start", "end")]
        snploc            <- gtinfo[, c("id", "chrom", "pos")]
        cov_original      <- fread(file.path(config$out_folder, "step_4", prefix, "covariates.csv"), sep = ",", data.table = FALSE)

        # Missing genotypes are replaced by 0. The anyNA() guard leaves the
        # matrix untouched (including its storage type) when nothing is missing.
        gtdata            <- as.matrix(gtdata)
        if (anyNA(gtdata))
        {
            gtdata[is.na(gtdata)] <- 0
        }
        gtdata            <- as.data.frame(gtdata)

        # Genotypes of the variants listed for this element in the QTL table,
        # transposed so that each variant becomes a column; they are merged
        # into the covariates on the "V1" column / row names.
        lead_ids          <- qtl_df$id[qtl_df$element_id == element_id]
        var2merge         <- as.data.frame(t(gtdata[rownames(gtdata) %in% lead_ids, ]))
        lead_vars         <- colnames(var2merge)

        cov_gt            <- add_rownames(merge(cov_original, var2merge, by.x = "V1", by.y = 0))

        # ---- file names ----------------------------------------------------
        kin_file_original <- file.path(config$out_folder, "step_4", prefix, "kinship.csv")

        cov_file          <- file.path(tmp_folder, paste("TMP", element_id, "cov"    , "csv", sep = "."))
        kin_file          <- file.path(tmp_folder, paste("TMP", element_id, "kin"    , "csv", sep = "."))
        gt_file           <- file.path(tmp_folder, paste("TMP", element_id, "gt"     , "txt", sep = "."))
        exp_file          <- file.path(tmp_folder, paste("TMP", element_id, "exp"    , "csv", sep = "."))
        bed_file          <- file.path(tmp_folder, paste("TMP", element_id, "gt"            , sep = "."))
        h5_file           <- file.path(tmp_folder, paste("TMP", element_id, "gt"     , "h5" , sep = "."))
        qtl_file_tmp      <- file.path(tmp_folder, paste("TMP", element_id, "qtl"    , "csv", sep = "."))
        geneloc_file      <- file.path(tmp_folder, paste("TMP", element_id, "geneloc", "txt", sep = "."))
        snploc_file       <- file.path(tmp_folder, paste("TMP", element_id, "snploc" , "txt", sep = "."))

        qtl_file          <- file.path(out_folder, paste("qtl", element_id, "txt", sep = "."))
        fdr_file          <- file.path(out_folder, paste("fdr", element_id, "txt", sep = "."))

        # The variants now carried as covariates are excluded from the tested set.
        gtdata2           <- gtdata[!rownames(gtdata) %in% lead_vars, ]
        snploc2           <- snploc[!snploc$id        %in% lead_vars, ]
        gtinfo2           <- gtinfo[!gtinfo$id        %in% lead_vars, ]

        # ---- temporary inputs ----------------------------------------------
        fwrite(gtdata2, gt_file     , sep = "\t", row.names = TRUE , col.names = TRUE)
        fwrite(expdata, exp_file    , sep = "," , row.names = TRUE , col.names = TRUE)
        fwrite(geneloc, geneloc_file, sep = "\t", row.names = FALSE, col.names = TRUE)
        fwrite(snploc2, snploc_file , sep = "\t", row.names = FALSE, col.names = TRUE)
        fwrite(cov_gt , cov_file    , sep = "," , row.names = TRUE , col.names = TRUE)

        # Copy the kinship matrix next to the other temporary inputs. Paths are
        # shell-quoted and a failed copy stops the run here, since kin_file is
        # handed to run_regression_qtl() below.
        message(paste("rsync", kin_file_original, kin_file))
        rsync_status      <- system2("rsync", args = shQuote(c(kin_file_original, kin_file)))
        if (rsync_status != 0)
        {
            stop("rsync of ", kin_file_original, " failed with exit status ", rsync_status, call. = FALSE)
        }

        # ---- QTL scan ------------------------------------------------------
        write_h5_file(element_id, phenotype_info, expdata, gtinfo2, gtdata2, h5_file)

        qtl_0_list <- run_regression_qtl(config, element_id, tmp_folder, phenotype_info, expdata, gtinfo2, gtdata2, h5_file, cov_file, kin_file, qtl_file_tmp, geneloc_file, snploc_file, gt_file, lead_vars)

        # The result carries a `qtl` table and a `lead` table; each is written
        # out as a plain data.frame.
        qtl_out <- as.data.frame(rbindlist(list(qtl_0_list$qtl )), stringsAsFactors = FALSE)
        fdr_out <- as.data.frame(rbindlist(list(qtl_0_list$lead)), stringsAsFactors = FALSE)

        fwrite(qtl_out, qtl_file, sep = "\t", row.names = FALSE, col.names = TRUE)
        fwrite(fdr_out, fdr_file, sep = "\t", row.names = FALSE, col.names = TRUE)

        message(paste("Saved:", qtl_file))
        message(paste("Saved:", fdr_file))
        message(Sys.time())

    } else {
        # Nothing is computed for elements lacking any step_1 input; say so
        # instead of finishing silently.
        message(paste("Skipping", element_id, "- missing input file(s):", paste(input_files[!input_found], collapse = ", ")))
    }
},
error = function(e)
{
    # Add the task id to the log, then re-raise the original condition.
    message("Regression QTL failed for taskid ", taskid, ": ", conditionMessage(e))
    stop(e)
})