Skip to content

Commit

Permalink
Merge pull request #2317 from nsoranzo/release_16.04_tsv_fix
Browse files (browse the repository at this point in the history)
[16.04] Do not sniff one-line files as CSV/TSV
  • Loading branch information
bgruening committed May 7, 2016
2 parents ff903ac + 427804f commit 5bb7c1d
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 11 deletions.
4 changes: 4 additions & 0 deletions lib/galaxy/datatypes/sniff.py
Expand Up @@ -297,6 +297,10 @@ def guess_ext( fname, sniff_order, is_multi_byte=False ):
>>> guess_ext(fname, sniff_order)
'gff3'
>>> fname = get_test_fname('temp.txt')
>>> file(fname, 'wt').write("a\\t2")
>>> guess_ext(fname, sniff_order)
'txt'
>>> fname = get_test_fname('temp.txt')
>>> file(fname, 'wt').write("a\\t2\\nc\\t1\\nd\\t0")
>>> guess_ext(fname, sniff_order)
'tabular'
Expand Down
27 changes: 16 additions & 11 deletions lib/galaxy/datatypes/tabular.py
Expand Up @@ -923,15 +923,19 @@ def sniff( self, filename ):
# No columns so not separated by this dialect.
return False

# check all rows can be read as otherwise set_meta throws an exception
# Check that there is a second row as it is used by set_meta and
# that all rows can be read
if self.strict_width:
num_columns = len(header_row)
found_second_line = False
for data_row in reader:
found_second_line = True
# All columns must be the same length
if num_columns != len(data_row):
return False
if not found_second_line:
return False
else:
# Check the next row as it is used by set_meta
data_row = reader.next()
if len(data_row) < 2:
# No columns so not separated by this dialect.
Expand All @@ -958,7 +962,6 @@ def sniff( self, filename ):
"""
if not csv.Sniffer().has_header(open(filename, 'r').read(self.big_peek_size)):
return False

return True
except:
# Not readable by Python's csv using this dialect
Expand Down Expand Up @@ -995,8 +998,8 @@ def set_meta( self, dataset, **kwd ):
@dataproviders.decorators.has_dataproviders
class CSV( BaseCSV ):
"""
Comma separated table data.
Only sniffs comma separated files with at least 2 columns
Comma-separated table data.
Only sniffs comma-separated files with at least 2 rows and 2 columns.
"""

def __init__(self, **kwd):
Expand All @@ -1009,14 +1012,16 @@ def __init__(self, **kwd):
@dataproviders.decorators.has_dataproviders
class TSV( BaseCSV ):
"""
Comma separated table data.
Only sniff tab separated files with at least two columns
Tab-separated table data.
Only sniff tab-separated files with at least 2 rows and 2 columns.
Note: Use of this datatype is optional as the general tabular format will handle most tab separated files.
This datatype would only be required for dataset with tabs INSIDE double quotes.
Note: Use of this datatype is optional as the general tabular datatype will
handle most tab-separated files. This datatype is only required for datasets
with tabs INSIDE double quotes.
This datatype currently does not support tsv files where the header has one column less to indicate first column is row names
This kind of file is handled fine by tabular.
This datatype currently does not support TSV files where the header has one
column less to indicate first column is row names. This kind of file is
handled fine by the tabular datatype.
"""

def __init__(self, **kwd):
Expand Down

0 comments on commit 5bb7c1d

Please sign in to comment.