Merge pull request #56 from mumichae/export_counts

Export counts
gagneurlab · Jun 1, 2020 · b08cc8d · b08cc8d
2 parents f15dd9c + e380c1e
commit b08cc8d
Show file tree

Hide file tree

Showing 13 changed files with 103 additions and 61 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -58,10 +58,12 @@ before_script:
   - cd $PROJECT_DIR
   - drop demo
 
-script: true
-#  - cd $PROJECT_DIR
-#  - snakemake -n
-#  - snakemake aberrantExpression --cores 2
-#  - snakemake aberrantSplicing --cores 2
-#  - snakemake mae --cores 2
-#  - snakemake --cores 2
+script:
+  - cd $PROJECT_DIR
+  - snakemake -n
+  - snakemake aberrantExpression --cores 2
+  - snakemake aberrantSplicing --cores 2
+  - snakemake mae --cores 2
+  - snakemake --cores 2
+  - snakemake exportCounts --cores 2
+
diff --git a/drop/cli.py b/drop/cli.py
@@ -63,8 +63,10 @@ def demo():
     setFiles()
 
     # download data
-    module_dir = str(pathlib.Path(drop.__file__).parent)
-    subprocess.run(["bash", f"{module_dir}/download_data.sh"], stderr=subprocess.PIPE)
+    logger.info("download data")
+    download_script = str(pathlib.Path(drop.__file__).parent / "download_data.sh")
+    response = subprocess.run(["bash", download_script], stderr=subprocess.STDOUT)
+    response.check_returncode()
 
     # fix config file
     with open("config.yaml", "r") as f:

diff --git a/drop/configHelper.py b/drop/configHelper.py
@@ -3,6 +3,7 @@
 import wbuild
 import pathlib
 from snakemake.logging import logger
+from snakemake.io import expand
 import warnings
 warnings.filterwarnings("ignore", 'This pattern has match groups')
 
@@ -110,6 +111,19 @@ def setDefaults(self, config, method=None):
         setKey(config, None, "genomeAssembly", "hg19")
         setKey(config, None, "scanBamParam", "null")
 
+        # export settings
+        setKey(config, None, "exportCounts", dict(), verbose=VERBOSE)
+        gene_annotations = list(config["geneAnnotation"].keys())
+        setKey(config, ["exportCounts"], "geneAnnotation", gene_annotations, verbose=VERBOSE)
+        setKey(config, ["exportCounts"], "excludeGroups", list(), verbose=VERBOSE)
+
+        # check consistency of gene annotations
+        anno_incomp = set(config["exportCounts"]["geneAnnotations"]) - set(gene_annotations)
+        if len(anno_incomp) > 0:
+            message = f"{anno_incomp} are not valid annotation version in 'geneAnnotation'"
+            message += "but required in 'exportCounts'.\n Please make sure they match."
+            raise ValueError(message)
+
         if self.method is None:
             tmp_dir = submodules.getTmpDir()
         else:
@@ -162,6 +176,8 @@ def setDefaults(self, config, method=None):
 
     def setKey(self, dict_, sub, key, default, verbose=False):
         if sub is not None:
+            if not isinstance(sub, list):
+                raise TypeError(f"{sub} is not of type list")
             for x in sub:
                 dict_ = dict_[x]
         if key not in dict_ or dict_[key] is None:
@@ -369,6 +385,32 @@ def getMaeAll(self):
 
     def getGeneAnnotationFile(self, annotation):
         return self.config["geneAnnotation"][annotation]
+
+    def getExportGroups(self, modules=["aberrantExpression", "aberrantSplicing"]):
+        groups = [] # get all active groups
+        for module in modules:
+          groups.extend(self.config[module]["groups"])
+        exclude = self.config["exportCounts"]["excludeGroups"]
+        return set(groups) - set(exclude)
+
+    def getExportCountFiles(self, prefix):
+
+        count_type_map = {"geneCounts":"aberrantExpression",
+                          "splitCounts":"aberrantSplicing",
+                          "spliceSiteOverlapCounts":"aberrantSplicing"}
+        if prefix not in count_type_map.keys():
+            raise ValueError(f"{prefix} not a valid file type for exported counts")
+
+        datasets = self.getExportGroups([count_type_map[prefix]])
+        annotations = self.config["exportCounts"]["geneAnnotations"]
+
+        pattern = self.getProcResultsDir()
+        if prefix == "geneCounts":
+            pattern += f"/exported_counts/{{dataset}}/{prefix}_{{dataset}}--{{annotation}}.tsv.gz"
+            return expand(pattern, annotation=annotations, dataset=datasets)
+        else:
+            pattern += f"/exported_counts/{{dataset}}/{prefix}_{{dataset}}.tsv.gz"
+            return expand(pattern, dataset=datasets)
 
 
 

diff --git a/drop/download_data.sh b/drop/download_data.sh
@@ -2,7 +2,8 @@
 set -e
 
 # get data
-wget -Nc "https://i12g-gagneurweb.informatik.tu-muenchen.de/public/paper/drop_analysis/resource.tar.gz"
+resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource.tar.gz"
+wget -Nc $resource_url
 tar -zxvf resource.tar.gz
 rm -rf Data
 mv resource Data
@@ -11,7 +12,6 @@ mv resource Data
 cd Data
 python fix_sample_anno.py
 gunzip chr21.fa.gz
-samtools faidx chr21.fa
 
 # copy config
 cp config_relative.yaml ../config.yaml
diff --git a/drop/installRPackages.R b/drop/installRPackages.R
@@ -16,7 +16,7 @@ for (i in 1:nrow(packages)) {
     pckg_name = tail(unlist(strsplit(packages[i,1], split = "/")), n = 1)
 
     if (pckg_name %in% installed) {
-        message(paste(pckg_name, "already installed"))
+        #message(paste(pckg_name, "already installed"))
     } else {
         if (packages[i,2] == TRUE) {
             INSTALL <- BiocManager::install

diff --git a/drop/modules/aberrant-expression-pipeline b/drop/modules/aberrant-expression-pipeline
diff --git a/drop/modules/aberrant-splicing-pipeline b/drop/modules/aberrant-splicing-pipeline
diff --git a/drop/modules/helpers/add_HPO_cols.R b/drop/modules/helpers/add_HPO_cols.R
@@ -4,7 +4,8 @@
 add_HPO_cols <- function(RES, sample_id_col = 'sampleID', gene_name_col = 'hgncSymbol'){
   require(data.table)
 
-  hpo_dt <- fread('https://i12g-gagneurweb.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz')
+  filename <- 'https://www.cmm.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz'
+  hpo_dt <- fread(filename)
 
   # change column names
   setnames(RES, old = c(sample_id_col, gene_name_col), new = c('sampleID', 'hgncSymbol'))
@@ -14,7 +15,7 @@ add_HPO_cols <- function(RES, sample_id_col = 'sampleID', gene_name_col = 'hgncS
 
   if(nrow(f2) > 0){
     f3 <- merge(f2, sa[,.(RNA_ID, HPO_TERMS)], by.x = 'sampleID', by.y = 'RNA_ID')
-    f3[, HPO_match := HPO_id %in% unlist(strsplit(HPO_TERMS, split = ',')), by = 1:nrow(f3)]
+    f3[, HPO_match := HPO_id %in% unlist(strsplit(HPO_TERMS, split = ','))]
     f3 <- f3[HPO_match == TRUE]
     if(nrow(f3) > 0){
       f4 <- f3[, .(HPO_id_overlap = paste(HPO_id, collapse = ', '), 
@@ -26,4 +27,4 @@ add_HPO_cols <- function(RES, sample_id_col = 'sampleID', gene_name_col = 'hgncS
 
   setnames(RES, old = c('sampleID', 'hgncSymbol'), new = c(sample_id_col, gene_name_col))
   return(RES)
-}
+}
diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt
@@ -14,5 +14,5 @@ tidyr FALSE
 magrittr FALSE 
 devtools FALSE 
 BSgenome.Hsapiens.UCSC.hg19 TRUE 
-MafDb.gnomAD.r2.1.hs37d5 TRUE 
-MafDb.gnomAD.r2.1.GRCh38 TRUE 
+#MafDb.gnomAD.r2.1.hs37d5 TRUE 
+#MafDb.gnomAD.r2.1.GRCh38 TRUE 
diff --git a/drop/setupDrop.py b/drop/setupDrop.py
@@ -24,25 +24,6 @@ def installRPackages():
     script = pathlib.Path(drop.__file__).parent / "installRPackages.R"
     requirements = pathlib.Path(drop.__file__).parent / 'requirementsR.txt'
 
-    #packages = [x.strip().split("#")[0] for x in open(requirements, 'r')]
-    #packages = [x for x in packages if x != '']
-
-    #for package in packages:
-    #    logger.info(f"check {package}")   
-    call = subprocess.Popen(
-            ["Rscript", script, requirements], 
-            stdout=subprocess.PIPE, 
-            stderr=subprocess.STDOUT
-        )
+    response = subprocess.run(["Rscript", script, requirements], stderr=subprocess.STDOUT)
+    response.check_returncode()
 
-    # check output for errors
-    stdout, stderr = call.communicate()
-    if stderr:
-        print(stderr)
-        exit(1)
-    stdout = stdout.decode()
-    ep = re.compile("Execution halted|^ERROR", re.M)
-    if ep.search(stdout):
-        print(stdout)
-        exit(1)
-
diff --git a/drop/template/Scripts/Pipeline/SampleAnnotation.R b/drop/template/Scripts/Pipeline/SampleAnnotation.R
@@ -4,10 +4,13 @@
 #' wb:
 #'  params:
 #'   - tmpdir: '`sm drop.getTmpDir()`'
+#'   - export_dir: '`sm parser.getProcResultsDir() + "/exported_counts"`'
+#'   - groups: '`sm parser.getExportGroups()`'
 #'  input: 
 #'   - sampleAnnotation: '`sm config["sampleAnnotation"]`'
 #'  output:
-#'   - done: '`sm parser.getProcDataDir() + "/sample_anno/sample_anno.done"`'
+#'   - export: '`sm touch(parser.getProcResultsDir() + "/exported_counts/sample_anno.done")`'
+#'   - done: '`sm touch(parser.getProcDataDir() + "/sample_anno/sample_anno.done")`'
 #' output:
 #'   html_document:
 #'    code_folding: hide
@@ -77,7 +80,7 @@ unique(sa[,.(RNA_ID, DROP_GROUP)])$DROP_GROUP %>% strsplit(',') %>% unlist %>%
 #+echo=F
 if(!is.null(sa$HPO_TERMS)){
   sa2 <- sa[, .SD[1], by = RNA_ID]
-  hpo_dt <- fread('https://i12g-gagneurweb.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz')
+  hpo_dt <- fread('https://www.cmm.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz')
 
   sapply(1:nrow(sa2), function(i){
     hpos <- strsplit(sa2[i, HPO_TERMS], split = ',') %>% unlist
@@ -91,23 +94,21 @@ if(!is.null(sa$HPO_TERMS)){
          na = NA, sep = "\t", row.names = F, quote = F)
 }
 
-if(snakemake@config$exportCounts == TRUE){
-  sa[, DROP_GROUP := gsub(' ', '', DROP_GROUP)]
-  if(!is.null(sa$SEX)) sa[, SEX := tolower(SEX)]
-
-  list_groups <- strsplit(sa$DROP_GROUP, split = ',')
-  drop_groups <- list_groups %>% unlist %>% unique
-
-  for(dataset in drop_groups){
-    path <- file.path(snakemake@config$root, 'processed_data/exported_counts', dataset)
-    dir.create(path)
-    cols <- intersect(colnames(sa), 
-                      c('RNA_ID', 'DNA_ID', 'PATIENT_ID', 'PAIRED_END', 'STRAND', 
-                        'TISSUE', 'SEX', 'AFFECTED', 'ICD_10'))
-    fwrite(sa[sapply(list_groups, function(x) dataset %in% x), cols, with = F], 
-         file = file.path(path, paste0('sampleAnnotation_', dataset, '.tsv')), 
-         quote = FALSE, row.names = FALSE, sep = '\t')
-  }
+sa[, DROP_GROUP := gsub(' ', '', DROP_GROUP)]
+if(!is.null(sa$SEX)) sa[, SEX := tolower(SEX)]
+
+list_groups <- strsplit(sa$DROP_GROUP, split = ',')
+drop_groups <- snakemake@params$groups # list_groups %>% unlist %>% unique
+
+# export processed sample annotation
+for(dataset in drop_groups){
+  cols <- intersect(colnames(sa),
+                    c('RNA_ID', 'DNA_ID', 'PATIENT_ID', 'PAIRED_END', 'STRAND', 
+                      'TISSUE', 'SEX', 'AFFECTED', 'ICD_10'))
+  sa_sub <- sa[sapply(list_groups, function(x) dataset %in% x), cols, with = F]
+  path <- file.path(snakemake@params$export_dir, dataset)
+  dir.create(path)
+  filename <- file.path(path, paste0('sampleAnnotation_', dataset, '.tsv'))
+  fwrite(sa_sub, file = filename, quote = FALSE, row.names = FALSE, sep = '\t')
 }
 
-file.create(snakemake@output$done) %>% invisible
diff --git a/drop/template/Snakefile b/drop/template/Snakefile
@@ -68,6 +68,13 @@ rule mae:
 rule sampleQC:
     input: MAE(drop.getTmpDir() + "/sampleQC.done")
 
+rule exportCounts:
+    input:
+        AE(parser.getExportCountFiles("geneCounts")),
+        AS(parser.getExportCountFiles("splitCounts")),
+        AS(parser.getExportCountFiles("spliceSiteOverlapCounts")),
+        parser.getProcResultsDir() + "/exported_counts/sample_anno.done"
+
 rule dependencyGraph:
     input:
         MAE(config["htmlOutputPath"] + "/MAE_rulegraph.svg"),

diff --git a/drop/template/config.yaml b/drop/template/config.yaml
@@ -15,7 +15,13 @@ geneAnnotation:
 
 genomeAssembly: hg19  # either hg19 or hg38
 scanBamParam: null # or a path to an Rds file containing a scanBamParam object
-exportCounts: false
+exportCounts:
+    # specify which gene annotations to include and which 
+    # groups to exclude when exporting counts
+    geneAnnotations:
+      - v29
+    excludeGroups:
+      - group1
 
 aberrantExpression:
     groups: