Bugfixes (#120)
* Fix Issue #114 (#116)

* refactored drop demo code so that config and sample annotation files are in the package

* adapted pipeline to handle missing external counts
new group in demo to distinguish count-import from no-count-import cases
some bugfixes regarding exportCounts parsing and imported-counts subsetting

* fixed AE pipeline test

* Travis with the latest R version 4

* revert pipeline in pytest to 2 cores

* R 4.0.2

Co-authored-by: mumichae <mi.mueller@tum.de>

* Refactoring (#118)

* refactored Submodule classes into separate files

* using MAE and AS functions from respective classes instead of Snakefile or R script header

* fixed input for MAE rules

* use chr with bam counts for ASEReadCounter

* added test for aberrant splicing count file functions

* FRASER compatibility fixes

* fixed subsetGroup API change

Co-authored-by: Michaela Müller <mi.mueller@tum.de>
Co-authored-by: Christian Mertes <mertes@in.tum.de>

* using wbuild 1.8.0 for pip installation

* fixed travis install error

Co-authored-by: Michaela Müller <mi.mueller@tum.de>
Co-authored-by: Vicente <yepez@in.tum.de>
Co-authored-by: Christian Mertes <mertes@in.tum.de>
4 people committed Oct 27, 2020
1 parent 5ce58f4 commit 6e9b586
Showing 34 changed files with 621 additions and 364 deletions.
5 changes: 2 additions & 3 deletions .travis.yml
@@ -22,9 +22,8 @@ install:
- conda create -q -n drop -c conda-forge -c bioconda python=$TRAVIS_PYTHON_VERSION r-base=4.0.2
- conda activate drop
- conda install -c conda-forge -c bioconda drop
- conda remove --force drop wbuild
- conda install -c conda-forge -c bioconda wbuild=1.7.1
- pip install -r requirements_test.txt
- conda remove --force drop
- pip install -r tests/requirements.txt

# install drop
- pip install . -vv
1 change: 1 addition & 0 deletions drop/__init__.py
@@ -1,3 +1,4 @@
from .setupDrop import *
from . import config
from . import utils
from . import demo
26 changes: 5 additions & 21 deletions drop/cli.py
@@ -101,26 +101,10 @@ def demo():
response = subprocess.run(["bash", download_script], stderr=subprocess.STDOUT)
response.check_returncode()

# fix config file
with open("config.yaml", "r") as f:
config = yaml.load(f, Loader=yaml.Loader)
path_keys = {"root": None,
"htmlOutputPath": None,
"sampleAnnotation": None,
"v29": ["geneAnnotation"],
"genome": ["mae"], "qcVcf": ["mae"]}

for key, sub in path_keys.items():
# iterate to key and entry
dict_ = config
if sub is not None:
for x in sub:
dict_ = dict_[x]
# set absolute path
dict_[key] = str(Path(dict_[key]).resolve())

with open("config.yaml", "w") as f:
yaml.safe_dump(config.copy(), f, default_flow_style=False,
sort_keys=False)
# copy sample annotation and config files with absolute paths
demo_repo = Path(drop.__file__).parent / "demo"
drop.demo.fixSampleAnnotation(demo_repo / "sample_annotation_relative.tsv",
Path.cwd() / "Data" / "sample_annotation.tsv")
drop.demo.fixConfig(demo_repo / "config_relative.yaml", Path.cwd() / "config.yaml")

logger.info("demo project created")
58 changes: 45 additions & 13 deletions drop/config/DropConfig.py
@@ -1,5 +1,5 @@
from .SampleAnnotation import SampleAnnotation
from .Submodules import AE, AS, MAE
from .submodules import *
from .ExportCounts import ExportCounts
from drop import utils
from pathlib import Path
@@ -35,23 +35,42 @@ def __init__(self, wbuildConfig):
self.htmlOutputPath = Path(self.get("htmlOutputPath"))
self.readmePath = Path(self.get("readmePath"))

# annotations
self.geneAnnotation = self.get("geneAnnotation")
self.genomeAssembly = self.get("genomeAssembly")
self.fastaFile = self.get("mae")["genome"] # TODO: move fasta outside of mae
self.fastaDict = Path(self.fastaFile).with_suffix(".dict")
self.sampleAnnotation = SampleAnnotation(self.get("sampleAnnotation"), self.root)

# setup submodules
cfg = self.config_dict
sa = self.sampleAnnotation
pd = self.processedDataDir
pr = self.processedResultsDir
self.AE = AE(cfg["aberrantExpression"], sa, pd, pr)
self.AS = AS(cfg["aberrantSplicing"], sa, pd, pr)
self.MAE = MAE(cfg["mae"], sa, pd, pr)
# submodules
self.AE = AE(
config=self.get("aberrantExpression"),
sampleAnnotation=self.sampleAnnotation,
processedDataDir=self.processedDataDir,
processedResultsDir=self.processedResultsDir
)
self.AS = AS(
config=self.get("aberrantSplicing"),
sampleAnnotation=self.sampleAnnotation,
processedDataDir=self.processedDataDir,
processedResultsDir=self.processedResultsDir
)
self.MAE = MAE(
config=self.get("mae"),
sampleAnnotation=self.sampleAnnotation,
processedDataDir=self.processedDataDir,
processedResultsDir=self.processedResultsDir
)

# counts export
self.exportCounts = ExportCounts(
self.config_dict, self.processedResultsDir,
self.sampleAnnotation, self.getGeneAnnotations(), self.get("genomeAssembly"),
aberrantExpression=self.AE, aberrantSplicing=self.AS
dict_=self.get("exportCounts"),
outputRoot=self.processedResultsDir,
sampleAnnotation=self.sampleAnnotation,
geneAnnotations=self.getGeneAnnotations(),
genomeAssembly=self.get("genomeAssembly"),
aberrantExpression=self.AE,
aberrantSplicing=self.AS
)

# legacy
@@ -110,9 +129,16 @@ def getHtmlFromScript(self, path, str_=True):

def get(self, key):
if key not in self.CONFIG_KEYS:
raise KeyError(f"{key} not defined for Drop config")
raise KeyError(f"'{key}' not defined for DROP config")
return self.wBuildConfig.get(key)

def getTool(self, tool):
try:
toolCmd = self.get("tools")[tool]
except KeyError:
raise KeyError(f"'{toolCmd}' not a defined tool for DROP config")
return toolCmd

def getGeneAnnotations(self):
return self.geneAnnotation

@@ -121,3 +147,9 @@ def getGeneVersions(self):

def getGeneAnnotationFile(self, annotation):
return self.geneAnnotation[annotation]

def getFastaFile(self, str_=True):
return utils.returnPath(self.fastaFile, str_)

def getFastaDict(self, str_=True):
return utils.returnPath(self.fastaDict, str_)
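The new accessors centralize lookups that rules previously read from the config dictionary directly. A hedged usage sketch with an inlined stand-in config; the "tools" mapping and the "gatkCmd" key are assumed examples rather than values defined by this diff, while the "mae"/"genome" key comes from the constructor change above:

    from pathlib import Path

    # Stand-in config; in DROP these values come from the user's config.yaml.
    config = {
        "tools": {"gatkCmd": "gatk", "bcftoolsCmd": "bcftools"},
        "mae": {"genome": "Data/reference/chr21.fa"},
    }

    def get_tool(config, tool):
        # mirrors the intent of DropConfig.getTool(): unknown tools raise a KeyError
        try:
            return config["tools"][tool]
        except KeyError:
            raise KeyError(f"'{tool}' is not a defined tool in the DROP config")

    fasta_file = Path(config["mae"]["genome"])
    fasta_dict = fasta_file.with_suffix(".dict")   # as in getFastaDict(): Data/reference/chr21.dict
    print(get_tool(config, "gatkCmd"), fasta_dict)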
68 changes: 35 additions & 33 deletions drop/config/SampleAnnotation.py
@@ -3,15 +3,17 @@
from pathlib import Path
from snakemake.logging import logger
import warnings

warnings.filterwarnings("ignore", 'This pattern has match groups')


class SampleAnnotation:

SAMPLE_ANNOTATION_COLUMNS = [
"RNA_ID", "RNA_BAM_FILE", "DNA_ID", "DNA_VCF_FILE", "DROP_GROUP", "GENE_COUNTS_FILE", "ANNOTATION",
FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE"]
SAMPLE_ANNOTATION_COLUMNS = FILE_TYPES + [
"RNA_ID", "DNA_ID", "DROP_GROUP", "ANNOTATION",
"PAIRED_END", "COUNT_MODE", "COUNT_OVERLAPS", "STRAND"
]

def __init__(self, file, root):
"""
sa_file: sample annotation file location from config
@@ -22,12 +24,12 @@ def __init__(self, file, root):
self.sa = self.parse()
self.idMapping = self.createIdMapping()
self.sampleFileMapping = self.createSampleFileMapping()

self.rnaIDs = self.createGroupIds(file_type="RNA_BAM_FILE", sep=',')
self.dnaIDs = self.createGroupIds(file_type="DNA_VCF_FILE", sep=',')
# external counts
self.extGeneCountIDs = self.createGroupIds(file_type="GENE_COUNTS_FILE", sep=',')

def parse(self, sep='\t'):
"""
read and check sample annotation for missing columns
@@ -37,16 +39,16 @@ def parse(self, sep='\t'):
missing_cols = [x for x in self.SAMPLE_ANNOTATION_COLUMNS if x not in sa.columns.values]
if len(missing_cols) > 0:
raise ValueError(f"Incorrect columns in sample annotation file. Missing:\n{missing_cols}")

# remove unwanted characters
col = sa["DROP_GROUP"]
col = col.str.replace(" ", "").str.replace("(|)", "", regex=True)
sa["DROP_GROUP"] = col

# set ID type as string
for ID_key in ["RNA_ID", "DNA_ID"]:
sa[ID_key] = sa[ID_key].apply(
lambda x: str(x) if not pd.isnull(x) else x
lambda x: str(x) if not pd.isnull(x) else x
)
return sa

@@ -57,7 +59,7 @@ def createIdMapping(self):
Get mapping of RNA and DNA IDs
"""
return self.sa[["RNA_ID", "DNA_ID"]].drop_duplicates().dropna()

def createSampleFileMapping(self):
"""
create a sample file mapping with unique entries of existing files
@@ -74,11 +76,11 @@ def createSampleFileMapping(self):
df['FILE_TYPE'] = file_type
assay_subsets.append(df)
file_mapping = pd.concat(assay_subsets)

# cleaning SAMPLE_FILE_MAPPING
file_mapping.dropna(inplace=True)
file_mapping.drop_duplicates(inplace=True)

# check for missing files
existing = utils.checkFileExists(file_mapping["FILE_PATH"])
if len(existing) == 0:
@@ -102,7 +104,13 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
if not file_type:
file_type = "RNA_BAM_FILE"
# infer ID type from file type
assay_id = self.subsetFileMapping(file_type)["ASSAY"].iloc[0]
assay_id = self.subsetFileMapping(file_type)["ASSAY"].unique()
if len(assay_id) == 0:
return {} # no files, return empty mapping
elif len(assay_id) > 1:
raise ValueError(f"More than 1 assay entry for file type {file_type}:\n{assay_id}")
else:
assay_id = assay_id[0]

# Subset sample annotation to only IDs of specified file_type
ids = self.getSampleIDs(file_type)
@@ -139,7 +147,7 @@ def subsetSampleAnnotation(self, column, values, subset=None):
# check type for subset
if not isinstance(subset, pd.DataFrame):
raise TypeError(f"Is not pandas DataFrame\n {subset}")
if not sa_cols <= set(subset.columns): # check if mandatory cols not contained
if not sa_cols <= set(subset.columns): # check if mandatory cols not contained
raise ValueError(f"Subset columns not the same as {sa_cols}\ngot: {subset.columns}")

# check if column is valid
Expand All @@ -158,12 +166,12 @@ def subsetFileMapping(self, file_type=None, sample_id=None):
subset = utils.subsetBy(subset, "ID", sample_id)
return subset

def subsetGroups(self, subset_groups, assay="RNA", warn=30, error=10):
def subsetGroups(self, subset_groups, assay="RNA"):
"""
Subset DROP group to sample IDs mapping by list of groups (`subset_groups`).
Give warning or error if subsetting results in too few sample IDs per group.
warn : number of samples threshold at which to warn about too few samples
error: number of samples threshold at which to give error
:param subset_groups: list of groups to include
:param assay: name/prefix of assay type
:return: dictionary with group names as keys and ID lists as entries
"""
ids_by_group = self.getGroupedIDs(assay)

@@ -173,16 +181,6 @@ def subsetGroups(self, subset_groups, assay="RNA", warn=30, error=10):
subset_groups = [subset_groups] if subset_groups.__class__ == str else subset_groups
subset = {gr: ids for gr, ids in
ids_by_group.items() if gr in subset_groups}

for group in subset_groups:
if len(subset[group]) < error:
message = f'Too few IDs in DROP_GROUP {group}'
message += f', please ensure that it has at least {error} IDs'
message += f', groups: {subset[group]}'
raise ValueError(message)
elif len(subset[group]) < warn:
logger.info(f'WARNING: Less than {warn} IDs in DROP_GROUP {group}')

return subset

### Getters
@@ -199,7 +197,7 @@ def getFilePath(self, sample_id, file_type, single_file=True):
raise ValueError(message)
path = path[0]
return path

def getFilePaths(self, file_type, group=None):
"""
Get all file paths of a file type
@@ -233,7 +231,7 @@ def getRow(self, column, value):
return row

### DROP Groups ###

def getGroupedIDs(self, assays):
"""
Get group to IDs mapping
@@ -251,12 +249,16 @@ def getGroupedIDs(self, assays):
else:
raise ValueError(f"'{assay}' is not a valid assay name")
return groupedIDs

def getGroups(self, assay="RNA"):
return self.getGroupedIDs(assay).keys()

def getIDsByGroup(self, group, assay="RNA"):
return self.getGroupedIDs(assay)[group]
try:
ids = self.getGroupedIDs(assay)[group]
except:
ids = []
return ids

def getSampleIDs(self, file_type):
ids = self.subsetFileMapping(file_type)["ID"]

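The grouping changes above (the unique-assay check in createGroupIds and the empty-list fallback in getIDsByGroup) revolve around the DROP_GROUP column. A small self-contained sketch of that grouping logic; the sample IDs and group names are invented demo-style values, and the parenthesis-stripping pattern is a simplification of the one in parse():

    import pandas as pd

    # Invented demo-style sample annotation: DROP_GROUP holds comma-separated labels.
    sa = pd.DataFrame({
        "RNA_ID": ["S1", "S2", "S3"],
        "DROP_GROUP": ["outrider, fraser", "outrider", "import_exp"],
    })

    # strip blanks and parentheses (the intent of SampleAnnotation.parse())
    sa["DROP_GROUP"] = (sa["DROP_GROUP"].str.replace(" ", "")
                                        .str.replace("[()]", "", regex=True))

    # build the {group: [sample IDs]} mapping, as createGroupIds() does per file type
    group_ids = {}
    for sample_id, groups in zip(sa["RNA_ID"], sa["DROP_GROUP"].str.split(",")):
        for group in groups:
            group_ids.setdefault(group, []).append(sample_id)

    print(group_ids)                  # {'outrider': ['S1', 'S2'], 'fraser': ['S1'], 'import_exp': ['S3']}
    print(group_ids.get("mae", []))   # a missing group yields an empty list, mirroring getIDsByGroup()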