Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable skipping comment lines while sniffing files #4239

Merged
merged 4 commits into from Jun 28, 2017
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions lib/galaxy/datatypes/interval.py
Expand Up @@ -314,14 +314,14 @@ def sniff( self, filename ):
>>> Interval().sniff( fname )
True
"""
headers = get_headers( filename, '\t' )
headers = get_headers( filename, '\t', comment_designator='#' )
try:
"""
If we got here, we already know the file is_column_based and is not bed,
so we'll just look for some valid data.
"""
for hdr in headers:
if hdr and not hdr[0].startswith( '#' ):
if hdr:
if len(hdr) < 3:
return False
try:
Expand Down Expand Up @@ -504,12 +504,12 @@ def sniff( self, filename ):
>>> Bed().sniff( fname )
True
"""
headers = get_headers( filename, '\t' )
headers = get_headers( filename, '\t', comment_designator='#' )
try:
if not headers:
return False
for hdr in headers:
if (hdr[0] == '' or hdr[0].startswith( '#' )):
if (hdr[0] == ''):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parentheses are not needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed

continue
valid_col1 = False
if len(hdr) < 3 or len(hdr) > 12:
Expand Down
16 changes: 13 additions & 3 deletions lib/galaxy/datatypes/sniff.py
Expand Up @@ -198,24 +198,34 @@ def convert_newlines_sep2tabs( fname, in_place=True, patt="\\s+", tmp_dir=None,
return ( i + 1, temp_name )


def get_headers( fname, sep, count=60, is_multi_byte=False ):
def get_headers( fname, sep, count=60, is_multi_byte=False, comment_designator=None ):
"""
Returns a list with the first 'count' lines split by 'sep'
Returns a list with the first 'count' lines split by 'sep', ignoring lines
starting with 'comment_designator'

>>> fname = get_test_fname('complete.bed')
>>> get_headers(fname,'\\t')
[['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
>>> fname = get_test_fname('test.gff')
>>> get_headers(fname, '\\t', count=5, comment_designator='#')
[[''], ['chr7', 'bed2gff', 'AR', '26731313', '26731437', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731491', '26731536', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731541', '26731649', '.', '+', '.', 'score'], ['chr7', 'bed2gff', 'AR', '26731659', '26731841', '.', '+', '.', 'score']]
"""
headers = []
in_file = compression_utils.get_fileobj(fname)
try:
for idx, line in enumerate(in_file):
idx = 0
for line in in_file:
line = line.rstrip('\n\r')
if is_multi_byte:
# TODO: fix this - sep is never found in line
line = unicodify( line, 'utf-8' )
sep = sep.encode( 'utf-8' )
if comment_designator is not None:
comment_designator = comment_designator.encode( 'utf-8' )
if comment_designator is not None and line.startswith( comment_designator ):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if comment_designator and line.startswith( comment_designator ):

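would also work when comment_designator is the empty string: with the `is not None` check, an empty designator would match every line (every string starts with ''), so all lines would be skipped as comments.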

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, fixed.

continue
headers.append( line.split(sep) )
idx += 1
if idx == count:
break
finally:
Expand Down