Export counts meta (#144)

* refactored writing export count meta files in separate python script rather than SampleAnnotation script * include python scripts in non-module directories upon installation Co-authored-by: mumichae <mi.mueller@tum.de>
gagneurlab · Jan 14, 2021 · 44bc135 · 44bc135
1 parent 7572ba2
commit 44bc135
Show file tree

Hide file tree

Showing 8 changed files with 143 additions and 92 deletions.
diff --git a/drop/config/DropConfig.py b/drop/config/DropConfig.py
@@ -94,7 +94,8 @@ def setDefaults(self, config_dict):
         setKey = utils.setKey
         setKey(config_dict, None, "fileRegex", r".*\.(R|md)")
         setKey(config_dict, None, "genomeAssembly", "hg19")
-        setKey(config_dict, None, "hpoFile", None)
+        hpo_url = 'https://www.cmm.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz'
+        setKey(config_dict, None, "hpoFile", hpo_url)
 
         # set submodule dictionaries
         setKey(config_dict, None, "aberrantExpression", dict())

diff --git a/drop/config/ExportCounts.py b/drop/config/ExportCounts.py
@@ -19,14 +19,15 @@ def __init__(self, dict_, outputRoot, sampleAnnotation, geneAnnotations, genomeA
         """
         self.CONFIG_KEYS = ["geneAnnotations", "excludeGroups"]
         self.config_dict = self.setDefaults(dict_, geneAnnotations)
-        self.outputRoot = outputRoot
+        self.outputRoot = outputRoot / "exported_counts"
         self.sa = sampleAnnotation
         self.genomeAssembly = genomeAssembly
+        self.geneAnnotations = self.get("geneAnnotations")
         self.modules = {
             "aberrantExpression": aberrantExpression,
             "aberrantSplicing": aberrantSplicing
         }
-        self.pattern = self.outputRoot / "exported_counts" / "{dataset}--{genomeAssembly}--{annotation}"
+        self.pattern = self.outputRoot / "{dataset}--{genomeAssembly}--{annotation}"
 
     def setDefaults(self, config_dict, gene_annotations):
         utils.setKey(config_dict, None, "geneAnnotations", gene_annotations)
@@ -49,7 +50,6 @@ def get(self, key):
     def getFilePattern(self, str_=True, expandStr=False):
         pattern = self.pattern
         if expandStr:
-            str_=True
             pattern = pattern.__str__().replace("{", "{{").replace("}", "}}") 
         return utils.returnPath(pattern, str_=str_)
 
@@ -67,23 +67,35 @@ def getExportGroups(self, modules=None):
         for module in modules:
             groups.extend(self.modules[module].groups)
         export_groups = set(groups) - set(self.get("excludeGroups"))
-        return export_groups
+        return sorted(list(export_groups))
 
-    def getExportCountFiles(self, prefix, expandPattern=None, **kwargs):
+    def getFiles(self, filename, datasets=None, **kwargs):
         """
-        Determine export count files.
-        :param prefix: name of file
-        :return: list of files to
+        Determine files for export count groups.
+        :param filename: name of file
+        :return: list of export files
         """
-        if prefix not in self.COUNT_TYPE_MAP.keys():
-            raise ValueError(f"{prefix} not a valid file type for exported counts")
+        if datasets is None:
+            datasets = self.getExportGroups()
+        file_pattern = str(self.pattern / f"{filename}")
+        return expand(
+            file_pattern,
+            dataset=datasets,
+            annotation=self.geneAnnotations,
+            genomeAssembly=self.genomeAssembly,
+            **kwargs
+        )
 
-        datasets = self.getExportGroups([self.COUNT_TYPE_MAP[prefix]])
-        if expandPattern is None:
-            file_pattern = str(self.pattern / f"{prefix}.tsv.gz")
-        else:
-            file_pattern = str(self.pattern / f"{expandPattern}.tsv.gz")
-        count_files = expand(file_pattern, annotation=self.get("geneAnnotations"),
-                             dataset=datasets, genomeAssembly=self.genomeAssembly, **kwargs)
-        return count_files
+    def getExportCountFiles(self, count_type, suffix="tsv.gz", expandPattern=None, **kwargs):
+        """
+        Determine export count files.
+        :param count_type: count type for mapping the submodule
+        :param suffix: file type suffix (without dot)
+        :return: list of export count files
+        """
+        if count_type not in self.COUNT_TYPE_MAP.keys():
+            raise ValueError(f"'{count_type}' not a valid file type for exported counts")
+        datasets = self.getExportGroups([self.COUNT_TYPE_MAP[count_type]])
+        expandPattern = count_type if expandPattern is None else expandPattern
+        return self.getFiles(f"{expandPattern}.{suffix}", datasets, **kwargs)
 
diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
@@ -97,7 +97,10 @@ def createSampleFileMapping(self):
 
     def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
         """
-        Create a mapping of DROP groups to lists of sample IDs
+        :param group_key: name of group column in sample annotation
+        :param file_type: name of file column e.g. "RNA_BAM_FILE", "DNA_VCF_FILE"
+        :param sep: separator of multiple groups in group column
+        :return: mapping of drop group and ID
         """
         if not file_type:
             file_type = "RNA_BAM_FILE"

diff --git a/drop/template/Scripts/Pipeline/SampleAnnotation.R b/drop/template/Scripts/Pipeline/SampleAnnotation.R
@@ -5,14 +5,11 @@
 #'  log:
 #'   - snakemake: '`sm str(tmp_dir / "SampleAnnotation.Rds")`'
 #'  params:
-#'   - export_dir: '`sm cfg.getProcessedResultsDir() + "/exported_counts"`'
-#'   - groups: '`sm cfg.exportCounts.getExportGroups()`'
 #'   - hpoFile: '`sm cfg.get("hpoFile")`'
 #'  input: 
-#'   - sampleAnnotation: '`sm config["sampleAnnotation"]`'
+#'   - sampleAnnotation: '`sm sa.file`'
 #'  output:
-#'   - export: '`sm touch(cfg.getProcessedResultsDir() + "/exported_counts/sample_anno.done")`'
-#'   - done: '`sm touch(cfg.getProcessedDataDir() + "/sample_anno/sample_anno.done")`'
+#'   - hpoOverlap: '`sm touch(cfg.getProcessedDataDir() + "/sample_anno/genes_overlapping_HPO_terms.tsv")`'
 #' output:
 #'   html_document:
 #'    code_folding: hide
@@ -80,8 +77,7 @@ unique(sa[,.(RNA_ID, DROP_GROUP)])$DROP_GROUP %>% strsplit(',') %>% unlist %>%
 
 # Obtain genes that overlap with HPO terms
 #+echo=F
-if(!is.null(sa$HPO_TERMS)){
-  if(!all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){
+if(!is.null(sa$HPO_TERMS) & !all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){
   sa2 <- sa[, .SD[1], by = RNA_ID]
 
   filename <- ifelse(is.null(snakemake@params$hpo_file), 
@@ -96,62 +92,6 @@ if(!is.null(sa$HPO_TERMS)){
   }) %>% invisible()  # don't print result
   sa2 <- sa2[, .(RNA_ID, HPO_matching_genes)]
 
-  fwrite(sa2, 
-         file.path(snakemake@config$root, 
-                   'processed_data/sample_anno/genes_overlapping_HPO_terms.tsv'), 
+  fwrite(sa2, snakemake@output$hpoOverlap,
          na = NA, sep = "\t", row.names = F, quote = F)
-  }
 }
-sa[, DROP_GROUP := gsub(' ', '', DROP_GROUP)]
-if(!is.null(sa$ICD_10))
-  sa[, ICD_10 := toupper(ICD_10)]
-
-list_groups <- strsplit(sa$DROP_GROUP, split = ',')
-drop_groups <- snakemake@params$groups # list_groups %>% unlist %>% unique
-
-genomeAssembly <- snakemake@config$genomeAssembly
-geneAnnotation <- snakemake@config$exportCounts$geneAnnotations
-
-sa[STRAND == 'no', STRAND_SPECIFIC := FALSE]
-sa[STRAND %in% c('yes', 'reverse'), STRAND_SPECIFIC := TRUE]
-sa[, STRAND := NULL]
-
-# export processed sample annotation and DESCRIPTION file
-for(dataset in drop_groups){
-  cols <- intersect(c('RNA_ID', 'INDIVIDUAL_ID', 'TISSUE', 'SEX', 'AFFECTED', 
-                      'ICD_10', 'PAIRED_END', 'STRAND_SPECIFIC'),
-                    colnames(sa))
-  sa_sub <- sa[sapply(list_groups, function(x) dataset %in% x), cols, with = F]
-  path <- file.path(snakemake@params$export_dir, paste(dataset, genomeAssembly, geneAnnotation, sep = '--'))
-  dir.create(path)
-  fwrite(sa_sub, file = file.path(path, 'sampleAnnotation.tsv'), 
-         quote = FALSE, row.names = FALSE, sep = '\t')
-
-  # DESCRIPTION file
-  v0 <- 'Title: # Add a title'
-  v1 <- paste0('Number of samples: ', nrow(sa_sub))
-  v2 <- ifelse(!is.null(sa_sub$TISSUE), 
-               ifelse(uniqueN(sa_sub$TISSUE) > 1, 
-                      'More than 1 tissue in dataset is not allowed!',
-                      paste0('Tissue: ', unique(sa_sub$TISSUE))),
-               'No tissue information')
-  v3 <- 'Organism: Homo sapiens'
-  v4 <- paste0('Genome assembly: ', genomeAssembly)
-  v5 <- paste0('Gene annotation: ', geneAnnotation)
-  v6 <- ifelse(!is.null(sa_sub$ICD_10), 
-               paste0('Disease (ICD-10: N): ', 
-                      paste(unite(data.table(table(sa_sub$ICD_10)), col = 'aux', 'V1', 'N', sep = ': ')$aux, collapse = ', ' )),
-               'No disease information')
-  v7 <- ifelse(uniqueN(sa_sub$STRAND_SPECIFIC) > 1, 
-               'All samples should be either strand- or non-strand-specific!',
-               paste0('Strand specific: ', unique(sa_sub$STRAND_SPECIFIC)))
-  v8 <- ifelse(uniqueN(sa_sub$PAIRED_END) > 1, 
-               'All samples should be either single end or paired end!',
-               paste0('Paired end: ', unique(sa_sub$PAIRED_END)))
-  v9 <- 'Cite as: RNA-Seq count tables were taken from: # add your citation(s)'
-  v10 <- 'Dataset contact: # Use format Name Last_Name, <email address>'
-  v11 <- 'Comments: # add any comments, if needed, otherwise remove'
-
-  writeLines(c(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11), file.path(path, 'DESCRIPTION.txt'))
-}
-
diff --git a/drop/template/Scripts/Pipeline/exportCountsMeta.py b/drop/template/Scripts/Pipeline/exportCountsMeta.py
@@ -0,0 +1,90 @@
+import itertools
+
+cfg = snakemake.params.dropConfig
+exportCounts = cfg.exportCounts
+sampleAnnotation = snakemake.params.sampleAnnotation
+
+# adapt sample annotation data frame
+sa = sampleAnnotation.sa.copy()
+sa["STRAND_SPECIFIC"] = sa["STRAND"] != "no"
+if "ICD_10" in sa:
+    sa["ICD_10"] = sa["ICD_10"].str.upper()
+
+sa_cols = frozenset(['RNA_ID', 'INDIVIDUAL_ID', 'TISSUE', 'SEX', 'AFFECTED', 'ICD_10', 'PAIRED_END',
+                     'STRAND_SPECIFIC']).intersection(sa.columns)
+sa_cols = list(sa_cols)
+
+
+# Getters for DESCRIPTION file
+def get_tissue_info(sa):
+    if "TISSUE" not in sa or sa["TISSUE"].is_null():
+        return 'No tissue information'
+    uniq_tissues = sa.TISSUE.unique()
+    if len(uniq_tissues) > 1:
+        return "More than 1 tissue in dataset is not allowed!"
+    return "".join(uniq_tissues)
+
+
+def get_disease_info(sa):
+    if "ICD_10" in sa:
+        # unite(data.table(table(sa_sub$ICD_10)), col = 'aux', 'V1', 'N', sep = ': ')$aux, collapse = ', ' )
+        table = sa.ICD_10.value_counts()
+        return "\n" + "\n".join([f"\t{d}: {c}" for d, c in zip(table.index, table)])
+    return "No disease information"
+
+
+def get_strand(sa):
+    if sa.STRAND_SPECIFIC.nunique() > 1:
+        return "All samples should be either strand- or non-strand-specific!"
+    else:
+        return sa.STRAND_SPECIFIC.unique()[0]
+
+
+def get_pairedEnd(sa):
+    if sa.PAIRED_END.nunique() > 1:
+        return 'All samples should be either single end or paired end!',
+    return sa.PAIRED_END.unique()[0]
+
+
+desc = """Title: # Add a title
+Number of samples: {}
+Tissue: {}
+Organism: {}
+Genome assembly: {}
+Gene annotation: {}
+Disease (ICD-10: N): {}
+Strand specific: {}
+Paired end: {}
+Cite as: RNA-Seq count tables were taken from # add your citation(s)
+Dataset contact: # Use format Name Last_Name, <email address>
+Comments: # add any comments, if needed, otherwise remove
+"""
+
+group_with_anno = itertools.product(exportCounts.getExportGroups(), exportCounts.geneAnnotations)
+sa_files = snakemake.output.sampleAnnotations
+desc_files = snakemake.output.descriptions
+
+for ga, sa_file, desc_file in zip(group_with_anno, sa_files, desc_files):
+    group, annotation = ga # unpack
+
+    # subset by group
+    rna_ids = sampleAnnotation.rnaIDs[group]
+    dna_ids = sampleAnnotation.dnaIDs[group]
+    sa_sub = sa.loc[(sa["RNA_ID"].isin(rna_ids)) & (sa["DNA_ID"].isin(dna_ids))]
+
+    # save sample annotation subset
+    sa_sub.to_csv(sa_file, sep='\t', index=False, columns=sa_cols)
+
+    # save DESCRIPTION file
+    with open(desc_file, "w") as f:
+        desc_output = desc.format(
+            len(sa_sub),  # number samples
+            get_tissue_info(sa_sub),  # tissue
+            "Homo sapiens",  # organism
+            exportCounts.genomeAssembly,
+            annotation,
+            get_disease_info(sa_sub),  # disease
+            get_strand(sa_sub),  # strand specific
+            get_pairedEnd(sa_sub)  # paired end
+        )
+        f.write(desc_output)
diff --git a/drop/template/Snakefile b/drop/template/Snakefile
@@ -33,14 +33,20 @@ rule all:
         """
 
 rule sampleAnnotation:
-    input: cfg.getProcessedDataDir() + "/sample_anno/sample_anno.done"
+    input: rules.Pipeline_SampleAnnotation_R.output
 
 rule exportCounts:
     input:
         cfg.exportCounts.getExportCountFiles("geneCounts"),
         cfg.exportCounts.getExportCountFiles("splicingCounts", expandPattern="k_{type}_counts", type=["j", "theta"]),
         cfg.exportCounts.getExportCountFiles("splicingCounts", expandPattern="n_{type}_counts", type=["psi5", "psi3", "theta"]),
-        cfg.getProcessedResultsDir() + "/exported_counts/sample_anno.done"
+    output:
+        sampleAnnotations = cfg.exportCounts.getFiles("sampleAnnotation.tsv"),
+        descriptions = cfg.exportCounts.getFiles("DESCRIPTION.txt"),
+    params:
+        dropConfig = cfg,
+        sampleAnnotation = sa
+    script: "Scripts/Pipeline/exportCountsMeta.py"
 
 rule dependencyGraph:
     input:

diff --git a/setup.py b/setup.py
@@ -14,9 +14,8 @@
 
 extra_files = []
 for (path, directories, filenames) in os.walk('drop/'):
-    directories[:] = [d for d in directories if not (d.startswith('.') or d == 'Data')]
-    filenames[:] = [f for f in filenames if 
-                    not (f.startswith('.') or f.endswith('.Rproj') or f.endswith('.py'))]
+    directories[:] = [d for d in directories if not d.startswith('.')]
+    filenames[:] = [f for f in filenames if not f.startswith('.') and not f.endswith('.Rproj')]
     for filename in filenames:
         extra_files.append(os.path.join('..', path, filename))
 

diff --git a/tests/config/test_config.py b/tests/config/test_config.py
@@ -18,9 +18,9 @@ def test_DropConfigPaths(demo_dir, dropConfig):
 @pytest.mark.parametrize(
         "modules,groups",
         [
-            ("aberrantExpression", {"outrider"}),
-            ("aberrantSplicing", {"fraser"}),
-            (["aberrantExpression", "aberrantSplicing"], {"outrider", "fraser"})
+            ("aberrantExpression", ["outrider"]),
+            ("aberrantSplicing", ["fraser"]),
+            (["aberrantExpression", "aberrantSplicing"], ["fraser", "outrider"])
         ]
     )
 def test_cfgExportGroups(dropConfig, modules, groups):