
Commit

Merge pull request #3512 from mvdbeek/add_compressed_fastq_peek_and_preview

Add compressed fastq peek and preview
jmchilton committed Feb 3, 2017
2 parents de509a8 + 7075d79 commit 1dbe75f
Showing 5 changed files with 123 additions and 102 deletions.
58 changes: 29 additions & 29 deletions lib/galaxy/datatypes/data.py
@@ -15,6 +15,7 @@

 from galaxy import util
 from galaxy.datatypes.metadata import MetadataElement  # import directly to maintain ease of use in Datatype class definitions
+from galaxy.util import compression_utils
 from galaxy.util import FILENAME_VALID_CHARS
 from galaxy.util import inflector
 from galaxy.util import unicodify
@@ -976,39 +977,38 @@ def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skip
     count = 0
     file_type = None
     data_checked = False
-    temp = open( file_name, "U" )
-    while count < LINE_COUNT:
-        line = temp.readline( WIDTH )
-        if line and not is_multi_byte and not data_checked:
-            # See if we have a compressed or binary file
-            if line[0:2] == util.gzip_magic:
-                file_type = 'gzipped'
-            else:
+    temp = compression_utils.get_fileobj( file_name, "U" )
+    try:
+        while count < LINE_COUNT:
+            line = temp.readline( WIDTH )
+            if line and not is_multi_byte and not data_checked:
+                # See if we have a compressed or binary file
                 for char in line:
                     if ord( char ) > 128:
                         file_type = 'binary'
                         break
-            data_checked = True
-            if file_type in [ 'gzipped', 'binary' ]:
-                break
-        if not line_wrap:
-            if line.endswith('\n'):
-                line = line[:-1]
-            else:
-                while True:
-                    i = temp.read(1)
-                    if not i or i == '\n':
-                        break
-        skip_line = False
-        for skipchar in skipchars:
-            if line.startswith( skipchar ):
-                skip_line = True
-                break
-        if not skip_line:
-            lines.append( line )
-            count += 1
-    temp.close()
-    if file_type in [ 'gzipped', 'binary' ]:
+                data_checked = True
+                if file_type == 'binary':
+                    break
+            if not line_wrap:
+                if line.endswith('\n'):
+                    line = line[:-1]
+                else:
+                    while True:
+                        i = temp.read(1)
+                        if not i or i == '\n':
+                            break
+            skip_line = False
+            for skipchar in skipchars:
+                if line.startswith( skipchar ):
+                    skip_line = True
+                    break
+            if not skip_line:
+                lines.append( line )
+                count += 1
+    finally:
+        temp.close()
+    if file_type == 'binary':
         text = "%s file" % file_type
     else:
         try:
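With this hunk, get_file_peek() no longer needs the util.gzip_magic special case: the handle returned by compression_utils.get_fileobj yields decompressed text, so only the binary check remains. A minimal sketch of the new behavior, assuming a Galaxy environment (the dataset path is hypothetical, not from this commit):

from galaxy.datatypes.data import get_file_peek

# Peeking a gzipped FASTQ now returns its first decompressed lines;
# before this commit the peek text for such a file was just "gzipped file".
print(get_file_peek('/tmp/reads.fastq.gz'))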
53 changes: 26 additions & 27 deletions lib/galaxy/datatypes/sequence.py
@@ -2,8 +2,6 @@
 Sequence classes
 """
 
-import bz2
-import gzip
 import json
 import logging
 import os
@@ -19,7 +17,10 @@
 from galaxy.datatypes.binary import Binary
 from galaxy.datatypes.metadata import MetadataElement
 from galaxy.datatypes.sniff import get_headers
-from galaxy.util import nice_size
+from galaxy.util import (
+    compression_utils,
+    nice_size
+)
 from galaxy.util.checkers import (
     is_bz2,
     is_gzip
@@ -142,22 +143,13 @@ def do_slow_split( cls, input_datasets, subdir_generator_function, split_params)
         if input_datasets[0].metadata is not None and input_datasets[0].metadata.sequences is not None:
             total_sequences = input_datasets[0].metadata.sequences
         else:
-            input_file = input_datasets[0].file_name
-            compress = is_gzip(input_file)
-            if compress:
-                # gzip is really slow before python 2.7!
-                in_file = gzip.GzipFile(input_file, 'r')
-            else:
-                # TODO
-                # if a file is not compressed, seek locations can be calculated and stored
-                # ideally, this would be done in metadata
-                # TODO
-                # Add BufferedReader if python 2.7?
-                in_file = open(input_file, 'rt')
-            total_sequences = long(0)
-            for i, line in enumerate(in_file):
-                total_sequences += 1
-            in_file.close()
+            in_file = compression_utils.get_fileobj(input_datasets[0].file_name)
+            try:
+                total_sequences = long(0)
+                for i, line in enumerate(in_file):
+                    total_sequences += 1
+            finally:
+                in_file.close()
         total_sequences /= 4
 
         sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
@@ -578,15 +570,8 @@ def set_meta( self, dataset, **kwd ):
         data_lines = 0
         sequences = 0
         seq_counter = 0  # blocks should be 4 lines long
-        compressed_gzip = is_gzip(dataset.file_name)
-        compressed_bzip2 = is_bz2(dataset.file_name)
+        in_file = compression_utils.get_fileobj(dataset.file_name)
         try:
-            if compressed_gzip:
-                in_file = gzip.GzipFile(dataset.file_name)
-            elif compressed_bzip2:
-                in_file = bz2.BZ2File(dataset.file_name)
-            else:
-                in_file = open(dataset.file_name)
             for line in in_file:
                 line = line.strip()
                 if line and line.startswith( '#' ) and not data_lines:
@@ -640,6 +625,20 @@ def sniff( self, filename ):
         except:
             return False
 
+    def display_data(self, trans, dataset, preview=False, filename=None, to_ext=None, **kwd):
+        if preview:
+            fh = compression_utils.get_fileobj(dataset.file_name)
+            max_peek_size = 1000000  # 1 MB
+            if os.stat( dataset.file_name ).st_size < max_peek_size:
+                mime = "text/plain"
+                self._clean_and_set_mime_type( trans, mime )
+                return fh.read()
+            return trans.stream_template_mako( "/dataset/large_file.mako",
+                                               truncated_data=fh.read(max_peek_size),
+                                               data=dataset)
+        else:
+            return Sequence.display_data(self, trans, dataset, preview, filename, to_ext, **kwd)
+
     def split( cls, input_datasets, subdir_generator_function, split_params):
         """
         FASTQ files are split on cluster boundaries, in increments of 4 lines
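The new display_data() branch keeps previews bounded: datasets under max_peek_size (1 MB on disk) are returned whole as text/plain, anything larger is truncated to the first megabyte read from the stream and rendered via the large_file.mako template. Note that the size gate uses the compressed on-disk size while fh.read() returns decompressed text. A short sketch of that subtlety, assuming a gzipped dataset (hypothetical path; not code from the commit):

import gzip
import os

path = '/tmp/reads.fastq.gz'
compressed_size = os.stat(path).st_size  # what the 1 MB gate checks
preview = gzip.open(path).read(1000000)  # what the user actually sees
# preview can be much longer than compressed_size for well-compressed FASTQ.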
20 changes: 8 additions & 12 deletions lib/galaxy/datatypes/sniff.py
@@ -18,7 +18,10 @@

 from galaxy import util
 from galaxy.util import multi_byte
-from galaxy.util import unicodify
+from galaxy.util import (
+    compression_utils,
+    unicodify
+)
 from galaxy.util.checkers import (
     check_binary,
     check_html,
@@ -204,15 +207,8 @@ def get_headers( fname, sep, count=60, is_multi_byte=False ):
     [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
     """
     headers = []
-    compressed_gzip = is_gzip(fname)
-    compressed_bzip2 = is_bz2(fname)
+    in_file = compression_utils.get_fileobj(fname)
     try:
-        if compressed_gzip:
-            in_file = gzip.GzipFile(fname, 'r')
-        elif compressed_bzip2:
-            in_file = bz2.BZ2File(fname, 'r')
-        else:
-            in_file = open(fname, 'rt')
         for idx, line in enumerate(in_file):
             line = line.rstrip('\n\r')
             if is_multi_byte:
@@ -490,9 +486,9 @@ def handle_uploaded_dataset_file( filename, datatypes_registry, ext='auto', is_m


 AUTO_DETECT_EXTENSIONS = [ 'auto' ]  # should 'data' also cause auto detect?
-DECOMPRESSION_FUNCTIONS = dict( gzip=gzip.GzipFile )
-COMPRESSION_CHECK_FUNCTIONS = [ ( 'gzip', is_gzip ) ]
-COMPRESSION_DATATYPES = dict( gzip=[ 'bam', 'fastq.gz', 'fastqsanger.gz', 'fastqillumina.gz', 'fastqsolexa.gz', 'fastqcssanger.gz', 'fastq.bz2', 'fastqsanger.bz2', 'fastqillumina.bz2', 'fastqsolexa.bz2', 'fastqcssanger.bz2' ] )
+DECOMPRESSION_FUNCTIONS = dict( gzip=gzip.GzipFile, bz2=bz2.BZ2File )
+COMPRESSION_CHECK_FUNCTIONS = [ ( 'gzip', is_gzip ), ('bz2', is_bz2) ]
+COMPRESSION_DATATYPES = dict( gzip=[ 'bam', 'fastq.gz', 'fastqsanger.gz', 'fastqillumina.gz', 'fastqsolexa.gz', 'fastqcssanger.gz'], bz2=['fastq.bz2', 'fastqsanger.bz2', 'fastqillumina.bz2', 'fastqsolexa.bz2', 'fastqcssanger.bz2' ] )
 COMPRESSED_EXTENSIONS = []
 for exts in COMPRESSION_DATATYPES.values():
     COMPRESSED_EXTENSIONS.extend( exts )
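The three parallel tables now cover bz2 alongside gzip, keyed by the same type name. A hedged sketch of how they compose (an illustrative helper using only names defined above; not code from this commit):

from galaxy.datatypes.sniff import (
    COMPRESSION_CHECK_FUNCTIONS,
    DECOMPRESSION_FUNCTIONS
)

def open_maybe_compressed(filename):
    # Run each registered sniffer ('gzip', then 'bz2'); on the first match,
    # open with the paired reader; otherwise fall back to a plain open().
    for compression_type, check in COMPRESSION_CHECK_FUNCTIONS:
        if check(filename):
            return DECOMPRESSION_FUNCTIONS[compression_type](filename)
    return open(filename)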
61 changes: 27 additions & 34 deletions lib/galaxy/datatypes/tabular.py
@@ -5,7 +5,6 @@

 import abc
 import csv
-import gzip
 import logging
 import os
 import re
@@ -19,7 +18,7 @@
 from galaxy.datatypes import data, metadata
 from galaxy.datatypes.metadata import MetadataElement
 from galaxy.datatypes.sniff import get_headers
-from galaxy.util.checkers import is_gzip
+from galaxy.util import compression_utils
 
 from . import dataproviders

@@ -789,11 +788,7 @@ def sniff( self, filename ):
         - We will only check that up to the first 5 alignments are correctly formatted.
         """
         try:
-            compress = is_gzip(filename)
-            if compress:
-                fh = gzip.GzipFile(filename, 'r')
-            else:
-                fh = open( filename )
+            fh = compression_utils.get_fileobj(filename, gzip_only=True)
             count = 0
             while True:
                 line = fh.readline()
@@ -830,33 +825,31 @@

     def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=5, **kwd ):
         if dataset.has_data():
-            compress = is_gzip(dataset.file_name)
-            if compress:
-                dataset_fh = gzip.GzipFile(dataset.file_name, 'r')
-            else:
-                dataset_fh = open( dataset.file_name )
-            lanes = {}
-            tiles = {}
-            barcodes = {}
-            reads = {}
-            # Should always read the entire file (until we devise a more clever way to pass metadata on)
-            # if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
-            # If the dataset is larger than optional_metadata, just count comment lines.
-            # dataset.metadata.data_lines = None
-            # else:
-            # Otherwise, read the whole thing and set num data lines.
-            for i, line in enumerate(dataset_fh):
-                if line:
-                    line_pieces = line.split('\t')
-                    if len(line_pieces) != 22:
-                        raise Exception('%s:%d:Corrupt line!' % (dataset.file_name, i))
-                    lanes[line_pieces[2]] = 1
-                    tiles[line_pieces[3]] = 1
-                    barcodes[line_pieces[6]] = 1
-                    reads[line_pieces[7]] = 1
-                pass
-            dataset.metadata.data_lines = i + 1
-            dataset_fh.close()
+            dataset_fh = compression_utils.get_fileobj(dataset.file_name, gzip_only=True)
+            try:
+                lanes = {}
+                tiles = {}
+                barcodes = {}
+                reads = {}
+                # Should always read the entire file (until we devise a more clever way to pass metadata on)
+                # if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
+                # If the dataset is larger than optional_metadata, just count comment lines.
+                # dataset.metadata.data_lines = None
+                # else:
+                # Otherwise, read the whole thing and set num data lines.
+                for i, line in enumerate(dataset_fh):
+                    if line:
+                        line_pieces = line.split('\t')
+                        if len(line_pieces) != 22:
+                            raise Exception('%s:%d:Corrupt line!' % (dataset.file_name, i))
+                        lanes[line_pieces[2]] = 1
+                        tiles[line_pieces[3]] = 1
+                        barcodes[line_pieces[6]] = 1
+                        reads[line_pieces[7]] = 1
+                    pass
+                dataset.metadata.data_lines = i + 1
+            finally:
+                dataset_fh.close()
             dataset.metadata.comment_lines = 0
             dataset.metadata.columns = 21
             dataset.metadata.column_types = ['str', 'int', 'int', 'int', 'int', 'int', 'str', 'int', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'str']
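Both tabular call sites pass gzip_only=True, so bz2 and zip inputs deliberately fall through to a plain open() inside get_fileobj (see the new module below), and only gzip or uncompressed data is decoded for these datatypes. A sketch of the difference, with hypothetical paths:

from galaxy.util import compression_utils

gz_fh = compression_utils.get_fileobj('eland.txt.gz', gzip_only=True)   # decompressed stream
bz_fh = compression_utils.get_fileobj('eland.txt.bz2', gzip_only=True)  # plain handle, raw bz2 bytes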
33 changes: 33 additions & 0 deletions lib/galaxy/util/compression_utils.py
@@ -0,0 +1,33 @@
+import bz2
+import gzip
+import zipfile
+
+from .checkers import (
+    is_bz2,
+    is_gzip
+)
+
+
+def get_fileobj(filename, mode="r", gzip_only=False, bz2_only=False, zip_only=False):
+    """
+    Returns a fileobj. If the file is compressed, return appropriate file reader.
+
+    :param filename: path to file that should be opened
+    :param mode: mode to pass to opener
+    :param gzip_only: only open file if file is gzip compressed or not compressed
+    :param bz2_only: only open file if file is bz2 compressed or not compressed
+    :param zip_only: only open file if file is zip compressed or not compressed
+    """
+    # the various compression readers don't support 'U' mode,
+    # so we open in 'r'.
+    if mode == 'U':
+        cmode = 'r'
+    else:
+        cmode = mode
+    if not bz2_only and not zip_only and is_gzip(filename):
+        return gzip.GzipFile(filename, cmode)
+    if not gzip_only and not zip_only and is_bz2(filename):
+        return bz2.BZ2File(filename, cmode)
+    if not bz2_only and not gzip_only and zipfile.is_zipfile(filename):
+        return zipfile.ZipFile(filename, cmode)
+    return open(filename, mode)
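A usage sketch for the new helper, consolidating the per-file gzip/bz2 branching deleted from the four files above (the path is hypothetical; 'U' is remapped to 'r' internally because the compression readers reject it):

from galaxy.util.compression_utils import get_fileobj

fh = get_fileobj('/tmp/example.fastq.bz2', mode='U')  # type detected from content via is_gzip/is_bz2, not the extension
try:
    for line in fh:
        print(line.rstrip())
finally:
    fh.close()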
