Skip to content

Commit

Permalink
For fastq.gz support inherit from Fastq and Binary
Browse files Browse the repository at this point in the history
and enhance FastQ format to be gzip aware.
  • Loading branch information
mvdbeek committed Nov 10, 2016
1 parent aabfa42 commit 7d4840c
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 118 deletions.
28 changes: 14 additions & 14 deletions config/datatypes_conf.xml.sample
Expand Up @@ -72,31 +72,31 @@
<datatype extension="fastq" type="galaxy.datatypes.sequence:Fastq" display_in_upload="true" description="FASTQ format is a text-based format for storing both a biological sequence (usually nucleotide sequence) and its corresponding quality scores." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Fastq">
<converter file="fastq_to_fqtoc.xml" target_datatype="fqtoc"/>
</datatype>
<datatype extension="fastq.gz" type="galaxy.datatypes.sequence:FastqGz" display_in_upload="true" subclass="True">
<converter file="fastqgz_to_fastq.xml" target_datatype="fastq"/>
</datatype>
<datatype extension="fastqsanger" type="galaxy.datatypes.sequence:FastqSanger" display_in_upload="true">
<converter file="fastq_to_fqtoc.xml" target_datatype="fqtoc"/>
</datatype>
<datatype extension="fastqsanger.gz" type="galaxy.datatypes.sequence:FastqSangerGz" display_in_upload="true" subclass="True">
<converter file="fastqsangergz_to_fastqsanger.xml" target_datatype="fastqsanger"/>
</datatype>
<datatype extension="fastqsolexa" type="galaxy.datatypes.sequence:FastqSolexa" display_in_upload="true" description="FastqSolexa is the Illumina (Solexa) variant of the Fastq format, which stores sequences and quality scores in a single file." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#FastqSolexa">
<converter file="fastq_to_fqtoc.xml" target_datatype="fqtoc"/>
</datatype>
<datatype extension="fastqsolexa.gz" type="galaxy.datatypes.sequence:FastqSolexaGz" display_in_upload="true" subclass="True">
<converter file="fastqsolexagz_to_fastqsolexa.xml" target_datatype="fastqsolexa"/>
</datatype>
<datatype extension="fastqcssanger" type="galaxy.datatypes.sequence:FastqCSSanger" display_in_upload="true">
<converter file="fastq_to_fqtoc.xml" target_datatype="fqtoc"/>
</datatype>
<datatype extension="fastqcssanger.gz" type="galaxy.datatypes.sequence:FastqCSSangerGz" display_in_upload="true" subclass="True">
<converter file="fastqcssangergz_to_fastqcssanger.xml" target_datatype="fastqcssanger"/>
</datatype>
<datatype extension="fastqillumina" type="galaxy.datatypes.sequence:FastqIllumina" display_in_upload="true">
<converter file="fastq_to_fqtoc.xml" target_datatype="fqtoc"/>
</datatype>
<datatype extension="fastqillumina.gz" type="galaxy.datatypes.sequence:FastqIlluminaGz" display_in_upload="true" subclass="True">
<datatype extension="fastq.gz" type="galaxy.datatypes.sequence:FastqGz" display_in_upload="true">
<converter file="fastqgz_to_fastq.xml" target_datatype="fastq"/>
</datatype>
<datatype extension="fastqsanger.gz" type="galaxy.datatypes.sequence:FastqSangerGz" display_in_upload="true">
<converter file="fastqsangergz_to_fastqsanger.xml" target_datatype="fastqsanger"/>
</datatype>
<datatype extension="fastqsolexa.gz" type="galaxy.datatypes.sequence:FastqSolexaGz" display_in_upload="true">
<converter file="fastqsolexagz_to_fastqsolexa.xml" target_datatype="fastqsolexa"/>
</datatype>
<datatype extension="fastqcssanger.gz" type="galaxy.datatypes.sequence:FastqCSSangerGz" display_in_upload="true">
<converter file="fastqcssangergz_to_fastqcssanger.xml" target_datatype="fastqcssanger"/>
</datatype>
<datatype extension="fastqillumina.gz" type="galaxy.datatypes.sequence:FastqIlluminaGz" display_in_upload="true">
<converter file="fastqilluminagz_to_fastqillumina.xml" target_datatype="fastqillumina"/>
</datatype>
<datatype extension="fqtoc" type="galaxy.datatypes.sequence:SequenceSplitLocations" display_in_upload="true"/>
Expand Down Expand Up @@ -636,8 +636,8 @@
<sniffer type="galaxy.datatypes.molecules:FPS"/>
<!-- TODO: see molecules.py <sniffer type="galaxy.datatypes.molecules:SMILES"/>-->
<sniffer type="galaxy.datatypes.sequence:Fasta"/>
<sniffer type="galaxy.datatypes.sequence:FastqGz"/>
<sniffer type="galaxy.datatypes.sequence:Fastq"/>
<sniffer type="galaxy.datatypes.sequence:FastqGz"/>
<sniffer type="galaxy.datatypes.interval:Wiggle"/>
<sniffer type="galaxy.datatypes.text:Html"/>
<sniffer type="galaxy.datatypes.images:Pdf"/>
Expand Down
116 changes: 21 additions & 95 deletions lib/galaxy/datatypes/sequence.py
Expand Up @@ -7,16 +7,15 @@
import logging
import os
import re
import subprocess
import string
import sys
import tempfile
from cgi import escape

import bx.align.maf

from galaxy import util
from galaxy.datatypes import metadata
from galaxy.datatypes.binary import Binary
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.sniff import get_headers
from galaxy.util import nice_size
Expand Down Expand Up @@ -573,10 +572,12 @@ def set_meta( self, dataset, **kwd ):
data_lines = 0
sequences = 0
seq_counter = 0 # blocks should be 4 lines long
compress = is_gzip(dataset.file_name)
if compress:
self.decompress_fastqgz(dataset)
for line in open( dataset.file_name ):
compressed = is_gzip(dataset.file_name)
if compressed:
in_file = gzip.GzipFile(dataset.file_name)
else:
in_file = open(dataset.file_name)
for line in in_file:
line = line.strip()
if line and line.startswith( '#' ) and not data_lines:
# We don't count comment lines for sequence data types
Expand Down Expand Up @@ -611,6 +612,9 @@ def sniff( self, filename ):
>>> Fastq().sniff( fname )
True
"""
compressed = is_gzip(filename)
if compressed and not isinstance(self, Binary):
return False
headers = get_headers( filename, None )
bases_regexp = re.compile( "^[NGTAC]*" )
# check that first block looks like a fastq block
Expand Down Expand Up @@ -672,95 +676,6 @@ def process_split_file(data):
return True
process_split_file = staticmethod(process_split_file)

class FastqGz ( Fastq ):
    """Class representing a generic gzip-compressed FASTQ sequence"""
    edam_format = "format_1930"
    file_ext = "fastq.gz"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines
        in dataset.

        If the dataset file is not gzip-compressed yet it is compressed in
        place first. When the dataset is larger than
        max_optional_metadata_filesize both metadata values are set to None.

        FIXME: This does not properly handle line wrapping
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        data_lines = 0
        sequences = 0
        seq_counter = 0  # blocks should be 4 lines long
        if not is_gzip( dataset.file_name ):
            self.compress_fastq( dataset )
        in_file = gzip.GzipFile( dataset.file_name, 'r' )
        try:
            for line in in_file:
                line = line.strip()
                if line and line.startswith( '#' ) and not data_lines:
                    # We don't count comment lines for sequence data types
                    continue
                seq_counter += 1
                data_lines += 1
                if line and line.startswith( '@' ):
                    if seq_counter >= 4:
                        # count previous block
                        # blocks should be 4 lines long
                        sequences += 1
                        seq_counter = 1
        finally:
            # Always release the handle, even if the gzip stream is corrupt
            in_file.close()
        if seq_counter >= 4:
            # count final block
            sequences += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences

    def compress_fastq( self, dataset ):
        """
        Gzip-compress the file behind dataset in place.

        Returns True when the file was compressed and False when there was
        nothing to do (no file name, or the file is already gzipped).
        Errors raised while reading, writing or renaming are propagated
        after the partially written temporary file has been removed.
        """
        file_name = dataset.file_name
        if not file_name or not file_name.strip():
            return False
        if is_gzip( file_name ):
            return False
        logging.getLogger( __name__ ).debug( 'Compressing fastq file %s', file_name )
        # Create the temporary file next to the dataset so that the final
        # rename never has to cross a filesystem boundary.
        target_dir = os.path.dirname( os.path.abspath( file_name ) )
        fd, compressed_name = tempfile.mkstemp( prefix="fastq_compress", dir=target_dir )
        os.close( fd )
        try:
            # Binary mode: the bytes must reach gzip unmodified.
            with open( file_name, 'rb' ) as source:
                compressed_file = gzip.open( compressed_name, 'wb' )
                try:
                    shutil.copyfileobj( source, compressed_file )
                finally:
                    # Closing writes the gzip trailer; without it the
                    # renamed file would be truncated.
                    compressed_file.close()
            os.unlink( file_name )
            os.rename( compressed_name, file_name )
        except Exception:
            # Do not leave a stray temporary file behind on failure.
            if os.path.exists( compressed_name ):
                os.unlink( compressed_name )
            raise
        return True

    def sniff( self, filename ):
        """
        Determines whether the file is a gzip-compressed file in generic
        fastq format.
        For details, see http://maq.sourceforge.net/fastq.shtml

        Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina
              These differ in the representation of the quality scores

        Returns True only if the file is gzipped and the parent class
        sniffer accepts its content.
        """
        # Check the compression first: it is the cheaper test and lets us
        # skip parsing files that can never match.
        if not is_gzip( filename ):
            return False
        return bool( super( FastqGz, self ).sniff( filename ) )

class FastqSanger( Fastq ):
"""Class representing a FASTQ sequence ( the Sanger variant )"""
Expand All @@ -785,27 +700,38 @@ class FastqCSSanger( Fastq ):
file_ext = "fastqcssanger"


class FastqGz( Fastq, Binary ):
    """Generic FASTQ sequence data stored gzip-compressed"""
    file_ext = "fastq.gz"
    edam_format = "format_1930"

# Register under the class's own extension so the binary sniffers pick it up.
Binary.register_sniffable_binary_format( FastqGz.file_ext, FastqGz.file_ext, FastqGz )


class FastqSangerGz( FastqGz ):
    """Gzip-compressed FASTQ sequence data ( the Sanger variant )"""
    file_ext = "fastqsanger.gz"
    edam_format = "format_1932"

# Register under the class's own extension so the binary sniffers pick it up.
Binary.register_sniffable_binary_format( FastqSangerGz.file_ext, FastqSangerGz.file_ext, FastqSangerGz )


class FastqSolexaGz( FastqGz ):
    """Gzip-compressed FASTQ sequence data ( the Solexa variant )"""
    file_ext = "fastqsolexa.gz"
    edam_format = "format_1933"

# Register under the class's own extension so the binary sniffers pick it up.
Binary.register_sniffable_binary_format( FastqSolexaGz.file_ext, FastqSolexaGz.file_ext, FastqSolexaGz )


class FastqIlluminaGz( FastqGz ):
    """Gzip-compressed FASTQ sequence data ( the Illumina 1.3+ variant )"""
    file_ext = "fastqillumina.gz"
    edam_format = "format_1931"

# Register under the class's own extension so the binary sniffers pick it up.
Binary.register_sniffable_binary_format( FastqIlluminaGz.file_ext, FastqIlluminaGz.file_ext, FastqIlluminaGz )


class FastqCSSangerGz( FastqGz ):
    """Gzip-compressed Color Space FASTQ sequence data ( e.g. a SOLiD variant )"""
    # No edam_format is set here; the value comes from the parent class.
    file_ext = "fastqcssanger.gz"

# Register under the class's own extension so the binary sniffers pick it up.
Binary.register_sniffable_binary_format( FastqCSSangerGz.file_ext, FastqCSSangerGz.file_ext, FastqCSSangerGz )


class Maf( Alignment ):
Expand Down
6 changes: 3 additions & 3 deletions lib/galaxy/datatypes/sniff.py
Expand Up @@ -198,8 +198,8 @@ def get_headers( fname, sep, count=60, is_multi_byte=False ):
[['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
"""
headers = []
compress = is_gzip(fname)
if compress:
compressed = is_gzip(fname)
if compressed:
in_file = gzip.GzipFile(fname, 'r')
else:
in_file = open(fname, 'rt')
Expand Down Expand Up @@ -473,7 +473,7 @@ def handle_uploaded_dataset_file( filename, datatypes_registry, ext='auto', is_m
AUTO_DETECT_EXTENSIONS = [ 'auto' ] # should 'data' also cause auto detect?
DECOMPRESSION_FUNCTIONS = dict( gzip=gzip.GzipFile )
COMPRESSION_CHECK_FUNCTIONS = [ ( 'gzip', is_gzip ) ]
COMPRESSION_DATATYPES = dict( gzip=[ 'bam' ] )
COMPRESSION_DATATYPES = dict( gzip=[ 'bam', 'fastq.gz', 'fastqsanger.gz', 'fastqillumina.gz', 'fastqsolexa.gz', 'fastqcssanger.gz' ] )
COMPRESSED_EXTENSIONS = []
for exts in COMPRESSION_DATATYPES.values():
COMPRESSED_EXTENSIONS.extend( exts )
Expand Down
6 changes: 0 additions & 6 deletions tools/data_source/upload.py
Expand Up @@ -125,12 +125,6 @@ def add_file( dataset, registry, json_file, output_path ):
if type_info:
data_type = type_info[0]
ext = type_info[1]
# Is the dataset a compressed Fastq?
is_gzipped, is_valid = check_gzip( dataset.path )
if is_gzipped and is_valid:
ext = sniff.guess_ext( dataset.path, registry.sniff_order)
if ext:
data_type = ext
if not data_type:
root_datatype = registry.get_datatype_by_extension( dataset.file_type )
if getattr( root_datatype, 'compressed', False ):
Expand Down

0 comments on commit 7d4840c

Please sign in to comment.