
Commit

Merge pull request #3512 from mvdbeek/add_compressed_fastq_peek_and_preview

Add compressed fastq peek and preview
jmchilton committed Feb 3, 2017
2 parents de509a8 + 7075d79 commit 1dbe75f
Showing 5 changed files with 123 additions and 102 deletions.
58 changes: 29 additions & 29 deletions lib/galaxy/datatypes/data.py
@@ -15,6 +15,7 @@

 from galaxy import util
 from galaxy.datatypes.metadata import MetadataElement  # import directly to maintain ease of use in Datatype class definitions
+from galaxy.util import compression_utils
 from galaxy.util import FILENAME_VALID_CHARS
 from galaxy.util import inflector
 from galaxy.util import unicodify
@@ -976,39 +977,38 @@ def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skip
     count = 0
     file_type = None
     data_checked = False
-    temp = open( file_name, "U" )
-    while count < LINE_COUNT:
-        line = temp.readline( WIDTH )
-        if line and not is_multi_byte and not data_checked:
-            # See if we have a compressed or binary file
-            if line[0:2] == util.gzip_magic:
-                file_type = 'gzipped'
-            else:
+    temp = compression_utils.get_fileobj( file_name, "U" )
+    try:
+        while count < LINE_COUNT:
+            line = temp.readline( WIDTH )
+            if line and not is_multi_byte and not data_checked:
+                # See if we have a compressed or binary file
                 for char in line:
                     if ord( char ) > 128:
                         file_type = 'binary'
                         break
-            data_checked = True
-            if file_type in [ 'gzipped', 'binary' ]:
-                break
-        if not line_wrap:
-            if line.endswith('\n'):
-                line = line[:-1]
-            else:
-                while True:
-                    i = temp.read(1)
-                    if not i or i == '\n':
-                        break
-        skip_line = False
-        for skipchar in skipchars:
-            if line.startswith( skipchar ):
-                skip_line = True
-                break
-        if not skip_line:
-            lines.append( line )
-            count += 1
-    temp.close()
-    if file_type in [ 'gzipped', 'binary' ]:
+                data_checked = True
+                if file_type == 'binary':
+                    break
+            if not line_wrap:
+                if line.endswith('\n'):
+                    line = line[:-1]
+                else:
+                    while True:
+                        i = temp.read(1)
+                        if not i or i == '\n':
+                            break
+            skip_line = False
+            for skipchar in skipchars:
+                if line.startswith( skipchar ):
+                    skip_line = True
+                    break
+            if not skip_line:
+                lines.append( line )
+                count += 1
+    finally:
+        temp.close()
+    if file_type == 'binary':
         text = "%s file" % file_type
     else:
         try:
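With this hunk, get_file_peek() no longer needs the util.gzip_magic special case: the handle returned by compression_utils.get_fileobj yields decompressed text, so only the binary check remains. A minimal sketch of the new behavior, assuming a Galaxy environment (the dataset path is hypothetical, not from this commit):

from galaxy.datatypes.data import get_file_peek

# Peeking a gzipped FASTQ now returns its first decompressed lines;
# before this commit the peek text for such a file was just "gzipped file".
print(get_file_peek('/tmp/reads.fastq.gz'))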
53 changes: 26 additions & 27 deletions lib/galaxy/datatypes/sequence.py
@@ -2,8 +2,6 @@
 Sequence classes
 """
 
-import bz2
-import gzip
 import json
 import logging
 import os
@@ -19,7 +17,10 @@
 from galaxy.datatypes.binary import Binary
 from galaxy.datatypes.metadata import MetadataElement
 from galaxy.datatypes.sniff import get_headers
-from galaxy.util import nice_size
+from galaxy.util import (
+    compression_utils,
+    nice_size
+)
 from galaxy.util.checkers import (
     is_bz2,
     is_gzip
@@ -142,22 +143,13 @@ def do_slow_split( cls, input_datasets, subdir_generator_function, split_params)
         if input_datasets[0].metadata is not None and input_datasets[0].metadata.sequences is not None:
             total_sequences = input_datasets[0].metadata.sequences
         else:
-            input_file = input_datasets[0].file_name
-            compress = is_gzip(input_file)
-            if compress:
-                # gzip is really slow before python 2.7!
-                in_file = gzip.GzipFile(input_file, 'r')
-            else:
-                # TODO
-                # if a file is not compressed, seek locations can be calculated and stored
-                # ideally, this would be done in metadata
-                # TODO
-                # Add BufferedReader if python 2.7?
-                in_file = open(input_file, 'rt')
-            total_sequences = long(0)
-            for i, line in enumerate(in_file):
-                total_sequences += 1
-            in_file.close()
+            in_file = compression_utils.get_fileobj(input_datasets[0].file_name)
+            try:
+                total_sequences = long(0)
+                for i, line in enumerate(in_file):
+                    total_sequences += 1
+            finally:
+                in_file.close()
         total_sequences /= 4
 
         sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
@@ -578,15 +570,8 @@ def set_meta( self, dataset, **kwd ):
         data_lines = 0
         sequences = 0
         seq_counter = 0  # blocks should be 4 lines long
-        compressed_gzip = is_gzip(dataset.file_name)
-        compressed_bzip2 = is_bz2(dataset.file_name)
+        in_file = compression_utils.get_fileobj(dataset.file_name)
         try:
-            if compressed_gzip:
-                in_file = gzip.GzipFile(dataset.file_name)
-            elif compressed_bzip2:
-                in_file = bz2.BZ2File(dataset.file_name)
-            else:
-                in_file = open(dataset.file_name)
             for line in in_file:
                 line = line.strip()
                 if line and line.startswith( '#' ) and not data_lines:
@@ -640,6 +625,20 @@ def sniff( self, filename ):
         except:
             return False
 
+    def display_data(self, trans, dataset, preview=False, filename=None, to_ext=None, **kwd):
+        if preview:
+            fh = compression_utils.get_fileobj(dataset.file_name)
+            max_peek_size = 1000000  # 1 MB
+            if os.stat( dataset.file_name ).st_size < max_peek_size:
+                mime = "text/plain"
+                self._clean_and_set_mime_type( trans, mime )
+                return fh.read()
+            return trans.stream_template_mako( "/dataset/large_file.mako",
+                                               truncated_data=fh.read(max_peek_size),
+                                               data=dataset)
+        else:
+            return Sequence.display_data(self, trans, dataset, preview, filename, to_ext, **kwd)
+
     def split( cls, input_datasets, subdir_generator_function, split_params):
         """
         FASTQ files are split on cluster boundaries, in increments of 4 lines
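The new display_data() branch keeps previews bounded: datasets under max_peek_size (1 MB on disk) are returned whole as text/plain, anything larger is truncated to the first megabyte read from the stream and rendered via the large_file.mako template. Note that the size gate uses the compressed on-disk size while fh.read() returns decompressed text. A short sketch of that subtlety, assuming a gzipped dataset (hypothetical path; not code from the commit):

import gzip
import os

path = '/tmp/reads.fastq.gz'
compressed_size = os.stat(path).st_size  # what the 1 MB gate checks
preview = gzip.open(path).read(1000000)  # what the user actually sees
# preview can be much longer than compressed_size for well-compressed FASTQ.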
20 changes: 8 additions & 12 deletions lib/galaxy/datatypes/sniff.py
@@ -18,7 +18,10 @@

 from galaxy import util
 from galaxy.util import multi_byte
-from galaxy.util import unicodify
+from galaxy.util import (
+    compression_utils,
+    unicodify
+)
 from galaxy.util.checkers import (
     check_binary,
     check_html,
@@ -204,15 +207,8 @@ def get_headers( fname, sep, count=60, is_multi_byte=False ):
     [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
     """
     headers = []
-    compressed_gzip = is_gzip(fname)
-    compressed_bzip2 = is_bz2(fname)
+    in_file = compression_utils.get_fileobj(fname)
     try:
-        if compressed_gzip:
-            in_file = gzip.GzipFile(fname, 'r')
-        elif compressed_bzip2:
-            in_file = bz2.BZ2File(fname, 'r')
-        else:
-            in_file = open(fname, 'rt')
         for idx, line in enumerate(in_file):
             line = line.rstrip('\n\r')
             if is_multi_byte:
@@ -490,9 +486,9 @@ def handle_uploaded_dataset_file( filename, datatypes_registry, ext='auto', is_m


 AUTO_DETECT_EXTENSIONS = [ 'auto' ]  # should 'data' also cause auto detect?
-DECOMPRESSION_FUNCTIONS = dict( gzip=gzip.GzipFile )
-COMPRESSION_CHECK_FUNCTIONS = [ ( 'gzip', is_gzip ) ]
-COMPRESSION_DATATYPES = dict( gzip=[ 'bam', 'fastq.gz', 'fastqsanger.gz', 'fastqillumina.gz', 'fastqsolexa.gz', 'fastqcssanger.gz', 'fastq.bz2', 'fastqsanger.bz2', 'fastqillumina.bz2', 'fastqsolexa.bz2', 'fastqcssanger.bz2' ] )
+DECOMPRESSION_FUNCTIONS = dict( gzip=gzip.GzipFile, bz2=bz2.BZ2File )
+COMPRESSION_CHECK_FUNCTIONS = [ ( 'gzip', is_gzip ), ('bz2', is_bz2) ]
+COMPRESSION_DATATYPES = dict( gzip=[ 'bam', 'fastq.gz', 'fastqsanger.gz', 'fastqillumina.gz', 'fastqsolexa.gz', 'fastqcssanger.gz'], bz2=['fastq.bz2', 'fastqsanger.bz2', 'fastqillumina.bz2', 'fastqsolexa.bz2', 'fastqcssanger.bz2' ] )
 COMPRESSED_EXTENSIONS = []
 for exts in COMPRESSION_DATATYPES.values():
     COMPRESSED_EXTENSIONS.extend( exts )
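The three parallel tables now cover bz2 alongside gzip, keyed by the same type name. A hedged sketch of how they compose (an illustrative helper using only names defined above; not code from this commit):

from galaxy.datatypes.sniff import (
    COMPRESSION_CHECK_FUNCTIONS,
    DECOMPRESSION_FUNCTIONS
)

def open_maybe_compressed(filename):
    # Run each registered sniffer ('gzip', then 'bz2'); on the first match,
    # open with the paired reader; otherwise fall back to a plain open().
    for compression_type, check in COMPRESSION_CHECK_FUNCTIONS:
        if check(filename):
            return DECOMPRESSION_FUNCTIONS[compression_type](filename)
    return open(filename)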
61 changes: 27 additions & 34 deletions lib/galaxy/datatypes/tabular.py
@@ -5,7 +5,6 @@

 import abc
 import csv
-import gzip
 import logging
 import os
 import re
@@ -19,7 +18,7 @@
 from galaxy.datatypes import data, metadata
 from galaxy.datatypes.metadata import MetadataElement
 from galaxy.datatypes.sniff import get_headers
-from galaxy.util.checkers import is_gzip
+from galaxy.util import compression_utils
 
 from . import dataproviders

@@ -789,11 +788,7 @@ def sniff( self, filename ):
         - We will only check that up to the first 5 alignments are correctly formatted.
         """
         try:
-            compress = is_gzip(filename)
-            if compress:
-                fh = gzip.GzipFile(filename, 'r')
-            else:
-                fh = open( filename )
+            fh = compression_utils.get_fileobj(filename, gzip_only=True)
             count = 0
             while True:
                 line = fh.readline()
@@ -830,33 +825,31 @@

     def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=5, **kwd ):
         if dataset.has_data():
-            compress = is_gzip(dataset.file_name)
-            if compress:
-                dataset_fh = gzip.GzipFile(dataset.file_name, 'r')
-            else:
-                dataset_fh = open( dataset.file_name )
-            lanes = {}
-            tiles = {}
-            barcodes = {}
-            reads = {}
-            # Should always read the entire file (until we devise a more clever way to pass metadata on)
-            # if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
-            # If the dataset is larger than optional_metadata, just count comment lines.
-            # dataset.metadata.data_lines = None
-            # else:
-            # Otherwise, read the whole thing and set num data lines.
-            for i, line in enumerate(dataset_fh):
-                if line:
-                    line_pieces = line.split('\t')
-                    if len(line_pieces) != 22:
-                        raise Exception('%s:%d:Corrupt line!' % (dataset.file_name, i))
-                    lanes[line_pieces[2]] = 1
-                    tiles[line_pieces[3]] = 1
-                    barcodes[line_pieces[6]] = 1
-                    reads[line_pieces[7]] = 1
-                pass
-            dataset.metadata.data_lines = i + 1
-            dataset_fh.close()
+            dataset_fh = compression_utils.get_fileobj(dataset.file_name, gzip_only=True)
+            try:
+                lanes = {}
+                tiles = {}
+                barcodes = {}
+                reads = {}
+                # Should always read the entire file (until we devise a more clever way to pass metadata on)
+                # if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
+                # If the dataset is larger than optional_metadata, just count comment lines.
+                # dataset.metadata.data_lines = None
+                # else:
+                # Otherwise, read the whole thing and set num data lines.
+                for i, line in enumerate(dataset_fh):
+                    if line:
+                        line_pieces = line.split('\t')
+                        if len(line_pieces) != 22:
+                            raise Exception('%s:%d:Corrupt line!' % (dataset.file_name, i))
+                        lanes[line_pieces[2]] = 1
+                        tiles[line_pieces[3]] = 1
+                        barcodes[line_pieces[6]] = 1
+                        reads[line_pieces[7]] = 1
+                    pass
+                dataset.metadata.data_lines = i + 1
+            finally:
+                dataset_fh.close()
             dataset.metadata.comment_lines = 0
             dataset.metadata.columns = 21
             dataset.metadata.column_types = ['str', 'int', 'int', 'int', 'int', 'int', 'str', 'int', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str']
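Both tabular call sites pass gzip_only=True, so bz2 and zip inputs deliberately fall through to a plain open() inside get_fileobj (see the new module below), and only gzip or uncompressed data is decoded for these datatypes. A sketch of the difference, with hypothetical paths:

from galaxy.util import compression_utils

gz_fh = compression_utils.get_fileobj('eland.txt.gz', gzip_only=True)   # decompressed stream
bz_fh = compression_utils.get_fileobj('eland.txt.bz2', gzip_only=True)  # plain handle, raw bz2 bytes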
33 changes: 33 additions & 0 deletions lib/galaxy/util/compression_utils.py
@@ -0,0 +1,33 @@
+import bz2
+import gzip
+import zipfile
+
+from .checkers import (
+    is_bz2,
+    is_gzip
+)
+
+
+def get_fileobj(filename, mode="r", gzip_only=False, bz2_only=False, zip_only=False):
+    """
+    Returns a fileobj. If the file is compressed, return appropriate file reader.
+
+    :param filename: path to file that should be opened
+    :param mode: mode to pass to opener
+    :param gzip_only: only open file if file is gzip compressed or not compressed
+    :param bz2_only: only open file if file is bz2 compressed or not compressed
+    :param zip_only: only open file if file is zip compressed or not compressed
+    """
+    # the various compression readers don't support 'U' mode,
+    # so we open in 'r'.
+    if mode == 'U':
+        cmode = 'r'
+    else:
+        cmode = mode
+    if not bz2_only and not zip_only and is_gzip(filename):
+        return gzip.GzipFile(filename, cmode)
+    if not gzip_only and not zip_only and is_bz2(filename):
+        return bz2.BZ2File(filename, cmode)
+    if not bz2_only and not gzip_only and zipfile.is_zipfile(filename):
+        return zipfile.ZipFile(filename, cmode)
+    return open(filename, mode)
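A usage sketch for the new helper, consolidating the per-file gzip/bz2 branching deleted from the four files above (the path is hypothetical; 'U' is remapped to 'r' internally because the compression readers reject it):

from galaxy.util.compression_utils import get_fileobj

fh = get_fileobj('/tmp/example.fastq.bz2', mode='U')  # type detected from content via is_gzip/is_bz2, not the extension
try:
    for line in fh:
        print(line.rstrip())
finally:
    fh.close()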
