Skip to content

Commit

Permalink
Merge pull request #4237 from dpryan79/fastqSniffer_implement3571
Browse files Browse the repository at this point in the history
Sniff fastqsanger and prefer it over fastq if the quality scores match
  • Loading branch information
mvdbeek committed Jun 27, 2017
2 parents e983781 + e21e9ec commit 6cd026b
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 10 deletions.
3 changes: 3 additions & 0 deletions config/datatypes_conf.xml.sample
Expand Up @@ -691,6 +691,9 @@
<sniffer type="galaxy.datatypes.molecules:FPS"/>
<!-- TODO: see molecules.py <sniffer type="galaxy.datatypes.molecules:SMILES"/>-->
<sniffer type="galaxy.datatypes.sequence:Fasta"/>
<sniffer type="galaxy.datatypes.sequence:FastqSanger"/>
<sniffer type="galaxy.datatypes.sequence:FastqSangerGz"/>
<sniffer type="galaxy.datatypes.sequence:FastqSangerBz2"/>
<sniffer type="galaxy.datatypes.sequence:Fastq"/>
<sniffer type="galaxy.datatypes.sequence:FastqGz"/>
<sniffer type="galaxy.datatypes.sequence:FastqBz2"/>
Expand Down
1 change: 1 addition & 0 deletions lib/galaxy/datatypes/registry.py
Expand Up @@ -776,6 +776,7 @@ def set_default_values( self ):
qualityscore.QualityScoreSOLiD(),
qualityscore.QualityScore454(),
sequence.Fasta(),
sequence.FastqSanger(),
sequence.Fastq(),
interval.Wiggle(),
text.Html(),
Expand Down
41 changes: 31 additions & 10 deletions lib/galaxy/datatypes/sequence.py
Expand Up @@ -603,16 +603,27 @@ def sniff( self, filename ):
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname( '1.fastqsanger' )
>>> Fastq().sniff( fname )
>>> FastqSanger().sniff( fname )
True
>>> fname = get_test_fname( '2.fastqsanger' )
>>> FastqSanger().sniff( fname )
True
>>> fname = get_test_fname( '2.fastq' )
>>> Fastq().sniff( fname )
True
>>> FastqSanger().sniff( fname )
False
"""
compressed = is_gzip(filename) or is_bz2(filename)
if compressed and not isinstance(self, Binary):
return False
headers = get_headers( filename, None )
headers = get_headers( filename, None, count=1000 )

# If this is a FastqSanger-derived class, then check to see if the base qualities match
if isinstance(self, FastqSanger) or isinstance(self, FastqSangerGz) or isinstance(self, FastqSangerBz2):
if not self.sangerQualities(headers):
return False

bases_regexp = re.compile( "^[NGTAC]*" )
# check that first block looks like a fastq block
try:
Expand Down Expand Up @@ -687,6 +698,14 @@ def process_split_file(data):
return True
process_split_file = staticmethod(process_split_file)

@staticmethod
def sangerQualities( lines ):
"""Presuming lines are lines from a fastq file, return True if the qualities are compatible with sanger encoding"""
for line in lines[3::4]:
if not all(_ >= '!' and _ <= 'M' for _ in line[0]):
return False
return True


class Fastq( BaseFastq ):
"""Class representing a generic FASTQ sequence"""
Expand Down Expand Up @@ -730,10 +749,6 @@ def sniff( self, filename ):
return BaseFastq.sniff( self, filename )


if SNIFF_COMPRESSED_FASTQS:
Binary.register_sniffable_binary_format("fastq.gz", "fastq.gz", FastqGz)


class FastqSangerGz( FastqGz ):
"""Class representing a compressed FASTQ sequence ( the Sanger variant )"""
edam_format = "format_1932"
Expand All @@ -746,6 +761,11 @@ class FastqSolexaGz( FastqGz ):
file_ext = "fastqsolexa.gz"


# Register the gzip-compressed FASTQ datatypes as sniffable binary formats, but
# only when SNIFF_COMPRESSED_FASTQS is enabled.
# NOTE(review): the Sanger-specific format is registered before the generic
# "fastq.gz" one, presumably so that it is tried first during sniffing --
# confirm against Binary.register_sniffable_binary_format before reordering.
if SNIFF_COMPRESSED_FASTQS:
Binary.register_sniffable_binary_format("fastqsanger.gz", "fastqsanger.gz", FastqSangerGz)
Binary.register_sniffable_binary_format("fastq.gz", "fastq.gz", FastqGz)


class FastqIlluminaGz( FastqGz ):
"""Class representing a compressed FASTQ sequence ( the Illumina 1.3+ variant )"""
edam_format = "format_1931"
Expand All @@ -770,16 +790,17 @@ def sniff( self, filename ):
return BaseFastq.sniff( self, filename )


if SNIFF_COMPRESSED_FASTQS:
Binary.register_sniffable_binary_format("fastq.bz2", "fastq.bz2", FastqBz2)


class FastqSangerBz2( FastqBz2 ):
    """Compressed FASTQ datatype for the Sanger variant, using the "fastqsanger.bz2" extension"""
    file_ext = "fastqsanger.bz2"
    edam_format = "format_1932"


# Register the bz2-compressed FASTQ datatypes as sniffable binary formats, but
# only when SNIFF_COMPRESSED_FASTQS is enabled.
# NOTE(review): the Sanger-specific format is registered before the generic
# "fastq.bz2" one, presumably so that it is tried first during sniffing --
# confirm against Binary.register_sniffable_binary_format before reordering.
if SNIFF_COMPRESSED_FASTQS:
Binary.register_sniffable_binary_format("fastqsanger.bz2", "fastqsanger.bz2", FastqSangerBz2)
Binary.register_sniffable_binary_format("fastq.bz2", "fastq.bz2", FastqBz2)


class FastqSolexaBz2( FastqBz2 ):
"""Class representing a compressed FASTQ sequence ( the Solexa variant )"""
edam_format = "format_1933"
Expand Down

0 comments on commit 6cd026b

Please sign in to comment.