Skip to content

Commit

Permalink
Further tighten up GG format.
Browse files Browse the repository at this point in the history
I encountered similar input problems to #864 - this helps a little.

- Ensure first column is a valid marker as outlined on https://genome.ucsc.edu/goldenpath/help/hgGenomeHelp.html.
- Avoid readlines() since it may allocate huge amounts of memory on certain inputs.
- Eliminate bare exception.

Rebased with bug fixes to the original commit, better testing, and tighter exception handling, as suggested by @nsoranzo.
  • Loading branch information
jmchilton committed Dec 8, 2016
1 parent fd0a52a commit 6c7b2fa
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 7 deletions.
26 changes: 19 additions & 7 deletions lib/galaxy/datatypes/genetics.py
Expand Up @@ -14,6 +14,7 @@

import logging
import os
import re
import sys
import urllib
from cgi import escape
Expand All @@ -28,6 +29,9 @@
gal_Log = logging.getLogger(__name__)
verbose = False

# https://genome.ucsc.edu/goldenpath/help/hgGenomeHelp.html
VALID_GENOME_GRAPH_MARKERS = re.compile('^(chr.*|RH.*|rs.*|SNP_.*|CN.*|A_.*)')


class GenomeGraphs( Tabular ):
"""
Expand Down Expand Up @@ -172,16 +176,24 @@ def sniff( self, filename ):
>>> GenomeGraphs().sniff( fname )
True
"""
f = open(filename, 'r')
f.readline() # header
rows = [f.readline().split()[1:] for x in range(3)] # small sample, trimming first column
with open(filename, 'r') as f:
buf = f.read(1024)

if len(rows) < 2:
return False

rows = [l.split() for l in buf.splitlines()[1:4]] # break on lines and drop header, small sample
for row in rows:
if len(row) < 1:
# Must actually have at least one value
if len(row) < 2:
# Must actually have a marker and at least one numeric value
return False
first_val = row[0]
if not VALID_GENOME_GRAPH_MARKERS.match(first_val):
return False
rest_row = row[1:]
try:
[float(x) for x in row] # first col has been removed
except:
[float(x) for x in rest_row] # first col has been removed
except ValueError:
return False
return True

Expand Down
3 changes: 3 additions & 0 deletions lib/galaxy/datatypes/sniff.py
Expand Up @@ -353,6 +353,9 @@ def guess_ext( fname, sniff_order, is_multi_byte=False ):
>>> fname = get_test_fname('mothur_datatypetest_true.mothur.otu')
>>> guess_ext(fname, sniff_order)
'mothur.otu'
>>> fname = get_test_fname('1.gg')
>>> guess_ext(fname, sniff_order)
'gg'
"""
file_ext = None
for datatype in sniff_order:
Expand Down

0 comments on commit 6c7b2fa

Please sign in to comment.