Skip to content

Commit

Permalink
Update get_fileobj() to use utf-8 encoding in text mode
Browse files Browse the repository at this point in the history
Also, merge its 3 parameters `gzip_only`, `bz2_only`, `zip_only` into
`compressed_formats` (a list of allowed formats).

As a consequence of the changes in `get_fileobj()`, update:
- `files_diff()`
- `get_file_peek()`, which now determines that a file is binary when a
  `UnicodeDecodeError` exception is raised and doesn't need
  `is_multi_byte` any more
- `iter_headers()` and `get_headers()`, which now return Unicode and don't
  need `is_multi_byte` parameter any more

As a consequence of the changes in `get_file_peek()`, update:
- `set_peek()`, which now doesn't need `is_multi_byte` any more

As a consequence of the changes in `get_headers()`, update:
- `guess_ext()` and `is_column_based()`, which now determine that a file is
  binary when a `UnicodeDecodeError` exception is raised and don't need
  `is_multi_byte` any more

As a consequence of the changes to `guess_ext()`, update:
- `handle_uploaded_dataset_file()`, which doesn't need `is_multi_byte` any more

Also, remove duplicated calls to `get_file_peek()` in
lib/galaxy/datatypes/molecules.py and lib/galaxy/datatypes/msa.py

The `is_multi_byte` parameter was not removed from the signatures of
`get_file_peek()` and `set_peek()` in order to preserve compatibility for
ToolShed datatypes, thanks @jmchilton for the review.
  • Loading branch information
nsoranzo committed Nov 28, 2017
1 parent 6b730ae commit 59f44cf
Show file tree
Hide file tree
Showing 23 changed files with 186 additions and 211 deletions.
2 changes: 1 addition & 1 deletion lib/galaxy/datatypes/annotation.py
Expand Up @@ -14,7 +14,7 @@ class SnapHmm(Text):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = "SNAP HMM model"
else:
dataset.peek = 'file does not exist'
Expand Down
2 changes: 1 addition & 1 deletion lib/galaxy/datatypes/binary.py
Expand Up @@ -1088,7 +1088,7 @@ def set_peek(self, dataset, is_multi_byte=False):
dataset.peek = "Binary TwoBit format nucleotide file"
dataset.blurb = nice_size(dataset.get_size())
else:
return super(TwoBit, self).set_peek(dataset, is_multi_byte)
return super(TwoBit, self).set_peek(dataset)

def display_peek(self, dataset):
try:
Expand Down
2 changes: 1 addition & 1 deletion lib/galaxy/datatypes/blast.py
Expand Up @@ -53,7 +53,7 @@ class BlastXml(GenericXml):
def set_peek(self, dataset, is_multi_byte=False):
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = 'NCBI Blast XML data'
else:
dataset.peek = 'file does not exist'
Expand Down
4 changes: 2 additions & 2 deletions lib/galaxy/datatypes/constructive_solid_geometry.py
Expand Up @@ -102,7 +102,7 @@ def set_meta(self, dataset, **kwd):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = "Faces: %s, Vertices: %s" % (str(dataset.metadata.face), str(dataset.metadata.vertex))
else:
dataset.peek = 'File does not exist'
Expand Down Expand Up @@ -429,7 +429,7 @@ def get_blurb(self, dataset):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = self.get_blurb(dataset)
else:
dataset.peek = 'File does not exist'
Expand Down
39 changes: 16 additions & 23 deletions lib/galaxy/datatypes/data.py
Expand Up @@ -195,7 +195,12 @@ def get_max_optional_metadata_filesize(self):
max_optional_metadata_filesize = property(get_max_optional_metadata_filesize, set_max_optional_metadata_filesize)

def set_peek(self, dataset, is_multi_byte=False):
"""Set the peek and blurb text"""
"""
Set the peek and blurb text
:param is_multi_byte: deprecated
:type is_multi_byte: bool
"""
if not dataset.dataset.purged:
dataset.peek = ''
dataset.blurb = 'data'
Expand Down Expand Up @@ -838,7 +843,7 @@ def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, ski
"""
if not dataset.dataset.purged:
# The file must exist on disk for the get_file_peek() method
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, skipchars=skipchars, line_wrap=line_wrap)
dataset.peek = get_file_peek(dataset.file_name, WIDTH=WIDTH, skipchars=skipchars, line_wrap=line_wrap)
if line_count is None:
# See if line_count is stored in the metadata
if dataset.metadata.data_lines:
Expand Down Expand Up @@ -1046,7 +1051,10 @@ def get_test_fname(fname):

def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=None, line_wrap=True):
"""
Returns the first LINE_COUNT lines wrapped to WIDTH
Returns the first LINE_COUNT lines wrapped to WIDTH.
:param is_multi_byte: deprecated
:type is_multi_byte: bool
>>> fname = get_test_fname('4.bed')
>>> get_file_peek(fname, LINE_COUNT=1)
Expand All @@ -1061,20 +1069,12 @@ def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipc
skipchars = []
lines = []
count = 0
file_type = None
data_checked = False
with compression_utils.get_fileobj(file_name, "U") as temp:
while count < LINE_COUNT:
line = temp.readline(WIDTH)
if line and not is_multi_byte and not data_checked:
# See if we have a compressed or binary file
for char in line:
if ord(char) > 128:
file_type = 'binary'
break
data_checked = True
if file_type == 'binary':
break
try:
line = temp.readline(WIDTH)
except UnicodeDecodeError:
return "binary file"
if not line_wrap:
if line.endswith('\n'):
line = line[:-1]
Expand All @@ -1091,11 +1091,4 @@ def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipc
if not skip_line:
lines.append(line)
count += 1
if file_type == 'binary':
text = "%s file" % file_type
else:
try:
text = util.unicodify('\n'.join(lines))
except UnicodeDecodeError:
text = "binary/unknown file"
return text
return '\n'.join(lines)
4 changes: 2 additions & 2 deletions lib/galaxy/datatypes/graph.py
Expand Up @@ -27,7 +27,7 @@ def set_peek(self, dataset, is_multi_byte=False):
Set the peek and blurb text
"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = 'XGMML data'
else:
dataset.peek = 'file does not exist'
Expand Down Expand Up @@ -73,7 +73,7 @@ def set_peek(self, dataset, is_multi_byte=False):
Set the peek and blurb text
"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = 'SIF data'
else:
dataset.peek = 'file does not exist'
Expand Down
20 changes: 8 additions & 12 deletions lib/galaxy/datatypes/molecules.py
Expand Up @@ -61,12 +61,11 @@ class GenericMolFile(data.Text):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
if (dataset.metadata.number_of_molecules == 1):
dataset.blurb = "1 molecule"
else:
dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
else:
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disk'
Expand Down Expand Up @@ -471,7 +470,7 @@ class PHAR(GenericMolFile):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = "pharmacophore"
else:
dataset.peek = 'file does not exist'
Expand Down Expand Up @@ -524,7 +523,7 @@ def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
atom_numbers = count_special_lines("^ATOM", dataset.file_name)
hetatm_numbers = count_special_lines("^HETATM", dataset.file_name)
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = "%s atoms and %s HET-atoms" % (atom_numbers, hetatm_numbers)
else:
dataset.peek = 'file does not exist'
Expand Down Expand Up @@ -575,7 +574,7 @@ def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
root_numbers = count_special_lines("^ROOT", dataset.file_name)
branch_numbers = count_special_lines("^BRANCH", dataset.file_name)
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = "%s roots and %s branches" % (root_numbers, branch_numbers)
else:
dataset.peek = 'file does not exist'
Expand All @@ -587,7 +586,7 @@ class grd(data.Text):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = "grids for docking"
else:
dataset.peek = 'file does not exist'
Expand Down Expand Up @@ -621,12 +620,11 @@ def set_meta(self, dataset, **kwd):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
if (dataset.metadata.number_of_molecules == 1):
dataset.blurb = "1 molecule"
else:
dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
else:
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disk'
Expand Down Expand Up @@ -666,12 +664,11 @@ def set_meta(self, dataset, **kwd):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
if dataset.metadata.number_of_molecules == 1:
dataset.blurb = "1 molecule"
else:
dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
else:
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disk'
Expand Down Expand Up @@ -727,12 +724,11 @@ def set_meta(self, dataset, **kwd):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
if (dataset.metadata.number_of_molecules == 1):
dataset.blurb = "1 molecule"
else:
dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
else:
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disk'
Expand Down
8 changes: 3 additions & 5 deletions lib/galaxy/datatypes/msa.py
Expand Up @@ -17,7 +17,7 @@ class Hmmer(Text):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = "HMMER Database"
else:
dataset.peek = 'file does not exist'
Expand Down Expand Up @@ -104,12 +104,11 @@ class Stockholm_1_0(Text):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
if (dataset.metadata.number_of_models == 1):
dataset.blurb = "1 alignment"
else:
dataset.blurb = "%s alignments" % dataset.metadata.number_of_models
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
else:
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disc'
Expand Down Expand Up @@ -187,12 +186,11 @@ class MauveXmfa(Text):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
if (dataset.metadata.number_of_models == 1):
dataset.blurb = "1 alignment"
else:
dataset.blurb = "%s alignments" % dataset.metadata.number_of_models
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
else:
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disc'
Expand Down
26 changes: 13 additions & 13 deletions lib/galaxy/datatypes/plant_tribes.py
Expand Up @@ -23,7 +23,7 @@ def display_peek(self, dataset):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
dataset.blurb = "ESTScan scores matrices"
else:
dataset.peek = 'file does not exist'
Expand Down Expand Up @@ -125,7 +125,7 @@ def set_meta(self, dataset, **kwd):

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
dataset.peek = get_file_peek(dataset.file_name)
if (dataset.metadata.number_comp == 1):
dataset.blurb = "1 significant component"
else:
Expand Down Expand Up @@ -159,7 +159,7 @@ class PlantTribesOrtho(PlantTribes):
file_ext = "ptortho"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesOrtho, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesOrtho, self).set_peek(dataset)
dataset.blurb = "Proteins orthogroup fasta files: %d items" % dataset.metadata.num_files


Expand All @@ -171,7 +171,7 @@ class PlantTribesOrthoCodingSequence(PlantTribes):
file_ext = "ptorthocs"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesOrthoCodingSequence, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesOrthoCodingSequence, self).set_peek(dataset)
dataset.blurb = "Protein and coding sequences orthogroup fasta files: %d items" % dataset.metadata.num_files


Expand All @@ -182,7 +182,7 @@ class PlantTribesTargetedGeneFamilies(PlantTribes):
file_ext = "pttgf"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesTargetedGeneFamilies, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesTargetedGeneFamilies, self).set_peek(dataset)
dataset.blurb = "Targeted gene families"


Expand All @@ -194,7 +194,7 @@ class PlantTribesPhylogeneticTree(PlantTribes):
file_ext = "pttree"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesPhylogeneticTree, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesPhylogeneticTree, self).set_peek(dataset)
dataset.blurb = "Phylogenetic trees: %d items" % dataset.metadata.num_files


Expand All @@ -205,7 +205,7 @@ class PlantTribesPhylip(PlantTribes):
file_ext = "ptphylip"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesPhylip, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesPhylip, self).set_peek(dataset)
dataset.blurb = "Orthogroup phylip multiple sequence alignments: %d items" % dataset.metadata.num_files


Expand All @@ -216,7 +216,7 @@ class PlantTribesMultipleSequenceAlignment(PlantTribes):
file_ext = "ptalign"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesMultipleSequenceAlignment, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesMultipleSequenceAlignment, self).set_peek(dataset)
dataset.blurb = "Proteins orthogroup alignments: %d items" % dataset.metadata.num_files


Expand All @@ -227,7 +227,7 @@ class PlantTribesMultipleSequenceAlignmentCodonAlignment(PlantTribes):
file_ext = "ptalignca"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesMultipleSequenceAlignmentCodonAlignment, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesMultipleSequenceAlignmentCodonAlignment, self).set_peek(dataset)
dataset.blurb = "Protein and coding sequences orthogroup alignments: %d items" % dataset.metadata.num_files


Expand All @@ -238,7 +238,7 @@ class PlantTribesMultipleSequenceAlignmentTrimmed(PlantTribes):
file_ext = "ptaligntrimmed"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesMultipleSequenceAlignmentTrimmed, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesMultipleSequenceAlignmentTrimmed, self).set_peek(dataset)
dataset.blurb = "Trimmed proteins orthogroup alignments: %d items" % dataset.metadata.num_files


Expand All @@ -249,7 +249,7 @@ class PlantTribesMultipleSequenceAlignmentTrimmedCodonAlignment(PlantTribes):
file_ext = "ptaligntrimmedca"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesMultipleSequenceAlignmentTrimmedCodonAlignment, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesMultipleSequenceAlignmentTrimmedCodonAlignment, self).set_peek(dataset)
dataset.blurb = "Trimmed protein and coding sequences orthogroup alignments: %d items" % dataset.metadata.num_files


Expand All @@ -260,7 +260,7 @@ class PlantTribesMultipleSequenceAlignmentFiltered(PlantTribes):
file_ext = "ptalignfiltered"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesMultipleSequenceAlignmentFiltered, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesMultipleSequenceAlignmentFiltered, self).set_peek(dataset)
dataset.blurb = "Filtered proteins orthogroup alignments: %d items" % dataset.metadata.num_files


Expand All @@ -271,5 +271,5 @@ class PlantTribesMultipleSequenceAlignmentFilteredCodonAlignment(PlantTribes):
file_ext = "ptalignfilteredca"

def set_peek(self, dataset, is_multi_byte=False):
super(PlantTribesMultipleSequenceAlignmentFilteredCodonAlignment, self).set_peek(dataset, is_multi_byte=is_multi_byte)
super(PlantTribesMultipleSequenceAlignmentFilteredCodonAlignment, self).set_peek(dataset)
dataset.blurb = "Filtered protein and coding sequences orthogroup alignments: %d items" % dataset.metadata.num_files

0 comments on commit 59f44cf

Please sign in to comment.