Skip to content

Commit

Permalink
Merge pull request #7299 from chambm/feature/rawtar-datatypes
Browse files Browse the repository at this point in the history
Add datatypes and sniffers for tar archives of directory-based MS formats and WIFF/SCAN pairs
  • Loading branch information
bgruening committed Feb 9, 2019
2 parents f002696 + dc303f0 commit f7322d8
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 0 deletions.
12 changes: 12 additions & 0 deletions config/datatypes_conf.xml.sample
Expand Up @@ -219,6 +219,12 @@
<datatype extension="idxml" type="galaxy.datatypes.proteomics:IdXML" mimetype="application/xml" display_in_upload="true"/>
<datatype extension="tandem" type="galaxy.datatypes.proteomics:TandemXML" mimetype="application/xml" display_in_upload="true"/>
<datatype extension="thermo.raw" type="galaxy.datatypes.proteomics:ThermoRAW" mimetype="application/octet-stream" display_in_upload="true"/>
<datatype extension="brukerbaf.d.tar" type="galaxy.datatypes.binary:BafTar" display_in_upload="true"/>
<datatype extension="agilentbrukeryep.d.tar" type="galaxy.datatypes.binary:YepTar" display_in_upload="true"/>
<datatype extension="brukertdf.d.tar" type="galaxy.datatypes.binary:TdfTar" display_in_upload="true"/>
<datatype extension="agilentmasshunter.d.tar" type="galaxy.datatypes.binary:MassHunterTar" display_in_upload="true"/>
<datatype extension="watersmasslynx.raw.tar" type="galaxy.datatypes.binary:MassLynxTar" display_in_upload="true"/>
<datatype extension="wiff.tar" type="galaxy.datatypes.binary:WiffTar" display_in_upload="true"/>
<datatype extension="mzml" type="galaxy.datatypes.proteomics:MzML" mimetype="application/xml" display_in_upload="true"/>
<datatype extension="nmrml" type="galaxy.datatypes.proteomics:NmrML" mimetype="application/xml" display_in_upload="true" description="nmrML is an open mark-up language for NMR data." description_url="http://nmrml.org/schema/"/>
<datatype extension="mgf" type="galaxy.datatypes.proteomics:Mgf" display_in_upload="true"/>
Expand Down Expand Up @@ -745,6 +751,12 @@
<sniffer type="galaxy.datatypes.binary:DAA"/>
<sniffer type="galaxy.datatypes.binary:RMA6"/>
<sniffer type="galaxy.datatypes.binary:DMND"/>
<sniffer type="galaxy.datatypes.binary:BafTar"/>
<sniffer type="galaxy.datatypes.binary:TdfTar"/>
<sniffer type="galaxy.datatypes.binary:MassHunterTar"/>
<sniffer type="galaxy.datatypes.binary:MassLynxTar"/>
<sniffer type="galaxy.datatypes.binary:YepTar"/>
<sniffer type="galaxy.datatypes.binary:WiffTar"/>
<sniffer type="galaxy.datatypes.binary:Fast5ArchiveGz"/>
<sniffer type="galaxy.datatypes.binary:Fast5ArchiveBz2"/>
<sniffer type="galaxy.datatypes.binary:Fast5Archive"/>
Expand Down
112 changes: 112 additions & 0 deletions lib/galaxy/datatypes/binary.py
Expand Up @@ -2239,6 +2239,118 @@ def sniff(self, dataset):
return False


class BafTar(CompressedArchive):
    """
    Tar archive of a Bruker BAF ``.d`` directory.

    Also serves as the base class holding the common behavior of tar files
    of directory-based raw file formats: subclasses typically override only
    ``get_signature_file`` and ``get_type`` (and ``file_ext``).

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('brukerbaf.d.tar')
    >>> BafTar().sniff(fname)
    True
    >>> fname = get_test_fname('test.fast5.tar')
    >>> BafTar().sniff(fname)
    False
    """
    edam_data = "data_2536"  # mass spectrometry data
    edam_format = "format_3712"  # TODO: add more raw formats to EDAM?
    file_ext = "brukerbaf.d.tar"

    def get_type(self):
        """Return the human-readable description used for the dataset peek."""
        return "Bruker BAF directory archive"

    def get_signature_file(self):
        """Return the lower-case base name of the member that identifies this format."""
        return "analysis.baf"

    def sniff(self, filename):
        """
        Return True when ``filename`` is a tar archive containing a member
        whose lower-cased base name equals the signature file name.
        """
        if not tarfile.is_tarfile(filename):
            return False
        with tarfile.open(filename) as archive:
            signature = self.get_signature_file()
            # Compare base names only, so the signature file may sit at any depth.
            return any(
                os.path.basename(member).lower() == signature
                for member in archive.getnames()
            )

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the dataset's peek and blurb; purged datasets get placeholder text."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = self.get_type()
        dataset.blurb = nice_size(dataset.get_size())

    def display_peek(self, dataset):
        """Return the stored peek, or a type/size summary if reading it fails."""
        try:
            peek = dataset.peek
        except Exception:
            peek = "%s (%s)" % (self.get_type(), nice_size(dataset.get_size()))
        return peek


class YepTar(BafTar):
    """Tar archive of a ``.d`` directory holding Agilent/Bruker YEP format data."""
    file_ext = "agilentbrukeryep.d.tar"

    def get_type(self):
        """Return the human-readable description of this archive type."""
        return "Agilent/Bruker YEP directory archive"

    def get_signature_file(self):
        """Return the lower-case base name of the member that identifies YEP data."""
        return "analysis.yep"


class TdfTar(BafTar):
    """Tar archive of a ``.d`` directory holding Bruker TDF format data."""
    file_ext = "brukertdf.d.tar"

    def get_type(self):
        """Return the human-readable description of this archive type."""
        return "Bruker TDF directory archive"

    def get_signature_file(self):
        """Return the lower-case base name of the member that identifies TDF data."""
        return "analysis.tdf"


class MassHunterTar(BafTar):
    """Tar archive of a ``.d`` directory holding Agilent MassHunter format data."""
    file_ext = "agilentmasshunter.d.tar"

    def get_type(self):
        """Return the human-readable description of this archive type."""
        return "Agilent MassHunter directory archive"

    def get_signature_file(self):
        """Return the lower-case base name of the member that identifies MassHunter data."""
        return "msscan.bin"


class MassLynxTar(BafTar):
    """Tar archive of a ``.raw`` directory holding Waters MassLynx format data."""
    file_ext = "watersmasslynx.raw.tar"

    def get_type(self):
        """Return the human-readable description of this archive type."""
        return "Waters MassLynx RAW directory archive"

    def get_signature_file(self):
        """Return the lower-case base name of the member that identifies MassLynx data."""
        return "_func001.dat"


class WiffTar(BafTar):
    """
    Tar archive of a Sciex ``.wiff``/``.scan`` file pair (WIFF format data).

    Sniffing looks only for a member with a ``.wiff`` extension
    (case-insensitive); the ``.scan`` file is not checked.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('some.wiff.tar')
    >>> WiffTar().sniff(fname)
    True
    >>> fname = get_test_fname('brukerbaf.d.tar')
    >>> WiffTar().sniff(fname)
    False
    >>> fname = get_test_fname('test.fast5.tar')
    >>> WiffTar().sniff(fname)
    False
    """
    file_ext = "wiff.tar"

    def get_type(self):
        """Return the human-readable description of this archive type."""
        return "Sciex WIFF/SCAN archive"

    def sniff(self, filename):
        """Return True when ``filename`` is a tar archive with a ``.wiff`` member."""
        if not tarfile.is_tarfile(filename):
            return False
        with tarfile.open(filename) as archive:
            # Lower-case each base name before splitting off the extension so
            # that e.g. "SAMPLE.WIFF" is recognised too.
            extensions = (
                os.path.splitext(os.path.basename(member).lower())[1]
                for member in archive.getnames()
            )
            return ".wiff" in extensions


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module's docstrings.
    import doctest

    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
Binary file added lib/galaxy/datatypes/test/brukerbaf.d.tar
Binary file not shown.
Binary file added lib/galaxy/datatypes/test/some.wiff.tar
Binary file not shown.

0 comments on commit f7322d8

Please sign in to comment.