Merge pull request #2696 from peterjc/blast_datatypes
Returning Blast datatypes to Galaxy core (work-in-progress)
bgruening committed Nov 5, 2016
2 parents b1892b7 + b5ba960 commit 58b7b6c
Showing 10 changed files with 1,227 additions and 4 deletions.
3 changes: 2 additions & 1 deletion CONTRIBUTORS.md
@@ -31,7 +31,7 @@ The following individuals have contributed code to Galaxy:
* Saket Choudhary <saketkc@gmail.com>
* Wen-Yu Chung <wychung@bx.psu.edu>
* Dave Clements <clements@galaxyproject.org>
* Peter Cock <p.j.a.cock@googlemail.com>
* Peter Cock <p.j.a.cock@googlemail.com> <peter.cock@hutton.ac.uk>
* Ira Cooke <iracooke@gmail.com>
* Nate Coraor <nate@bx.psu.edu>
* Michael Cotterell <mepcotterell@gmail.com>
@@ -72,6 +72,7 @@ The following individuals have contributed code to Galaxy:
* Jan Kanis <jan.code@jankanis.nl>
* David King <dcking@bx.psu.edu>
* Rory Kirchner <roryk@mit.edu>
* Edward Kirton <eskirton@lbl.gov>
* Brad Langhorst <langhorst@neb.com>
* Ross Lazarus <ross.lazarus@gmail.com> <rossl@bx.psu.edu>
* Gildas Le Corguillé @lecorguille
8 changes: 8 additions & 0 deletions config/datatypes_conf.xml.sample
@@ -540,6 +540,13 @@
<datatype extension="trackhub" type="galaxy.datatypes.tracks:UCSCTrackHub" display_in_upload="true">
<display file="ucsc/trackhub.xml" />
</datatype>
<datatype extension="blastxml" type="galaxy.datatypes.blast:BlastXml" mimetype="application/xml" display_in_upload="true"/>
<datatype extension="blastdbn" type="galaxy.datatypes.blast:BlastNucDb" mimetype="text/html" display_in_upload="false"/>
<datatype extension="blastdbp" type="galaxy.datatypes.blast:BlastProtDb" mimetype="text/html" display_in_upload="false"/>
<datatype extension="blastdbd" type="galaxy.datatypes.blast:BlastDomainDb" mimetype="text/html" display_in_upload="false"/>
<datatype extension="maskinfo-asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" subclass="True" display_in_upload="true" />
<datatype extension="maskinfo-asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" subclass="True" display_in_upload="true" />
<datatype extension="pssm-asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" subclass="True" display_in_upload="true" />
</registration>
<sniffers>
<!--
@@ -580,6 +587,7 @@
<sniffer type="galaxy.datatypes.binary:Sra"/>
<sniffer type="galaxy.datatypes.binary:NetCDF"/>
<sniffer type="galaxy.datatypes.triples:Rdf"/>
<sniffer type="galaxy.datatypes.blast:BlastXml"/>
<sniffer type="galaxy.datatypes.xml:Phyloxml"/>
<sniffer type="galaxy.datatypes.xml:Owl"/>
<sniffer type="galaxy.datatypes.proteomics:MzML"/>
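For reference, the BlastXml sniffer registered above keys on the first three lines of a file (see lib/galaxy/datatypes/blast.py below); a minimal BLAST XML header it would accept, assuming the public NCBI DTD reference, looks like this:

    <?xml version="1.0"?>
    <!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
    <BlastOutput>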
15 changes: 15 additions & 0 deletions config/tool_data_table_conf.xml.sample
@@ -10,6 +10,21 @@
        <columns>value, dbkey, formats, name, path</columns>
        <file path="tool-data/bfast_indexes.loc" />
    </table>
    <!-- Locations of nucleotide BLAST databases -->
    <table name="blastdb" comment_char="#" allow_duplicate_entries="False">
        <columns>value, name, path</columns>
        <file path="tool-data/blastdb.loc" />
    </table>
    <!-- Locations of protein BLAST databases -->
    <table name="blastdb_p" comment_char="#" allow_duplicate_entries="False">
        <columns>value, name, path</columns>
        <file path="tool-data/blastdb_p.loc" />
    </table>
    <!-- Locations of protein domain BLAST databases -->
    <table name="blastdb_d" comment_char="#" allow_duplicate_entries="False">
        <columns>value, name, path</columns>
        <file path="tool-data/blastdb_d.loc" />
    </table>
    <!-- Locations of indexes in the BWA mapper format -->
    <table name="bwa_indexes" comment_char="#" allow_duplicate_entries="False">
        <columns>value, dbkey, name, path</columns>
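Each of the new tables above is backed by a tab-separated .loc file with the three declared columns (value, name, path). A hypothetical tool-data/blastdb.loc entry (fields are tab-separated; tabs shown here as runs of spaces) might look like:

    nt_05Nov2016    NCBI nt (05 Nov 2016)    /data/blast/nt/nt

where the path column is conventionally the database base name handed to the BLAST+ -db option, not an individual file on disk.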
311 changes: 311 additions & 0 deletions lib/galaxy/datatypes/blast.py
@@ -0,0 +1,311 @@
# This file is now part of the Galaxy Project, but due to historical reasons
# reflecting time developed outside of the Galaxy Project, this file is under
# the MIT license.
#
# The MIT License (MIT)
# Copyright (c) 2012,2013,2014,2015,2016 Peter Cock
# Copyright (c) 2012 Edward Kirton
# Copyright (c) 2013 Nicola Soranzo
# Copyright (c) 2014 Bjoern Gruening
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
"""NCBI BLAST datatypes.
Covers the ``blastxml`` format and the BLAST databases.
"""

import logging
import os
from time import sleep

from galaxy.datatypes.data import get_file_peek
from galaxy.datatypes.data import Data, Text
from galaxy.datatypes.xml import GenericXml

log = logging.getLogger(__name__)


class BlastXml(GenericXml):
    """NCBI Blast XML Output data"""
    file_ext = "blastxml"
    edam_format = "format_3331"
    edam_data = "data_0857"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'NCBI Blast XML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Determines whether the file is blastxml
        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
        >>> BlastXml().sniff(fname)
        True
        >>> fname = get_test_fname('tblastn_four_human_vs_rhodopsin.xml')
        >>> BlastXml().sniff(fname)
        True
        >>> fname = get_test_fname('interval.interval')
        >>> BlastXml().sniff(fname)
        False
        """
        # TODO - Use a context manager on Python 2.5+ to close handle
        handle = open(filename)
        line = handle.readline()
        if line.strip() != '<?xml version="1.0"?>':
            handle.close()
            return False
        line = handle.readline()
        if line.strip() not in ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                                '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']:
            handle.close()
            return False
        line = handle.readline()
        if line.strip() != '<BlastOutput>':
            handle.close()
            return False
        handle.close()
        return True

    def merge(split_files, output_file):
        """Merging multiple XML files is non-trivial and must be done in subclasses."""
        if len(split_files) == 1:
            # For one file only, use base class method (move/copy)
            return Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("Given no BLAST XML files, %r, to merge into %s"
                             % (split_files, output_file))
        out = open(output_file, "w")
        h = None
        for f in split_files:
            if not os.path.isfile(f):
                log.warning("BLAST XML file %s missing, retry in 1s..." % f)
                sleep(1)
            if not os.path.isfile(f):
                log.error("BLAST XML file %s missing" % f)
                raise ValueError("BLAST XML file %s missing" % f)
            h = open(f)
            header = h.readline()
            if not header:
                out.close()
                h.close()
                # Retry, could be transient error with networked file system...
                log.warning("BLAST XML file %s empty, retry in 1s..." % f)
                sleep(1)
                h = open(f)
                header = h.readline()
                if not header:
                    log.error("BLAST XML file %s was empty" % f)
                    raise ValueError("BLAST XML file %s was empty" % f)
            if header.strip() != '<?xml version="1.0"?>':
                out.write(header) # for diagnosis
                out.close()
                h.close()
                raise ValueError("%s is not an XML file!" % f)
            line = h.readline()
            header += line
            if line.strip() not in ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                                    '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']:
                out.write(header) # for diagnosis
                out.close()
                h.close()
                raise ValueError("%s is not a BLAST XML file!" % f)
            while True:
                line = h.readline()
                if not line:
                    out.write(header) # for diagnosis
                    out.close()
                    h.close()
                    raise ValueError("BLAST XML file %s ended prematurely" % f)
                header += line
                if "<Iteration>" in line:
                    break
                if len(header) > 10000:
                    # Something has gone wrong, don't load too much into memory!
                    # Write what we have to the merged file for diagnostics
                    out.write(header)
                    out.close()
                    h.close()
                    raise ValueError("The header in BLAST XML file %s is too long" % f)
            if "<BlastOutput>" not in header:
                out.close()
                h.close()
                raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
            if f == split_files[0]:
                out.write(header)
                old_header = header
            elif old_header[:300] != header[:300]:
                # Enough to check <BlastOutput_program> and <BlastOutput_version> match
                out.close()
                h.close()
                raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                                 % (split_files[0], f, old_header[:300], header[:300]))
            else:
                out.write(" <Iteration>\n")
            for line in h:
                if "</BlastOutput_iterations>" in line:
                    break
                # TODO - Increment <Iteration_iter-num> and if required automatic query names
                # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                out.write(line)
            h.close()
        out.write(" </BlastOutput_iterations>\n")
        out.write("</BlastOutput>\n")
        out.close()
    merge = staticmethod(merge)


class _BlastDb(object):
    """Base class for BLAST database datatype."""

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text."""
        if not dataset.dataset.purged:
            dataset.peek = "BLAST database (multiple files)"
            dataset.blurb = "BLAST database (multiple files)"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Create HTML content, used for displaying peek."""
        try:
            return dataset.peek
        except Exception:
            return "BLAST database (multiple files)"

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """Documented as an old display method, but still gets called via tests etc
        This allows us to format the data shown in the central pane via the "eye" icon.
        """
        if filename is not None and filename != "index":
            # Change nothing - important for the unit tests to access child files:
            return Data.display_data(self, trans, data, preview, filename,
                                     to_ext, size, offset, **kwd)
        if self.file_ext == "blastdbn":
            title = "This is a nucleotide BLAST database"
        elif self.file_ext == "blastdbp":
            title = "This is a protein BLAST database"
        elif self.file_ext == "blastdbd":
            title = "This is a domain BLAST database"
        else:
            # Error?
            title = "This is a BLAST database."
        msg = ""
        try:
            # Try to use any text recorded in the dummy index file:
            handle = open(data.file_name, "rU")
            msg = handle.read().strip()
            handle.close()
        except Exception:
            pass
        if not msg:
            msg = title
        # Galaxy assumes HTML for the display of composite datatypes,
        return "<html><head><title>%s</title></head><body><pre>%s</pre></body></html>" % (title, msg)

    def merge(split_files, output_file):
        """Merge BLAST databases (not implemented for now)."""
        raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)")

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """Split a BLAST database (not implemented for now)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split BLAST databases")


class BlastNucDb(_BlastDb, Data):
    """Class for nucleotide BLAST database files."""
    file_ext = 'blastdbn'
    allow_datatype_change = False
    composite_type = 'basic'

    def __init__(self, **kwd):
        Data.__init__(self, **kwd)
        self.add_composite_file('blastdb.nhr', is_binary=True) # sequence headers
        self.add_composite_file('blastdb.nin', is_binary=True) # index file
        self.add_composite_file('blastdb.nsq', is_binary=True) # nucleotide sequences
        self.add_composite_file('blastdb.nal', is_binary=False, optional=True) # alias ( -gi_mask option of makeblastdb)
        self.add_composite_file('blastdb.nhd', is_binary=True, optional=True) # sorted sequence hash values ( -hash_index option of makeblastdb)
        self.add_composite_file('blastdb.nhi', is_binary=True, optional=True) # index of sequence hash values ( -hash_index option of makeblastdb)
        self.add_composite_file('blastdb.nnd', is_binary=True, optional=True) # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
        self.add_composite_file('blastdb.nni', is_binary=True, optional=True) # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
        self.add_composite_file('blastdb.nog', is_binary=True, optional=True) # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb)
        self.add_composite_file('blastdb.nsd', is_binary=True, optional=True) # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
        self.add_composite_file('blastdb.nsi', is_binary=True, optional=True) # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
        # self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True) # first volume of the MegaBLAST index generated by makembindex
        # The previous line should be repeated for each index volume, with filename extensions like '.01.idx', '.02.idx', etc.
        self.add_composite_file('blastdb.shd', is_binary=True, optional=True) # MegaBLAST index superheader (-old_style_index false option of makembindex)
        # self.add_composite_file('blastdb.naa', is_binary=True, optional=True) # index of a WriteDB column for e.g. mask data
        # self.add_composite_file('blastdb.nab', is_binary=True, optional=True) # data of a WriteDB column
        # self.add_composite_file('blastdb.nac', is_binary=True, optional=True) # multiple byte order for a WriteDB column
        # The previous 3 lines should be repeated for each WriteDB column, with filename extensions like ('.nba', '.nbb', '.nbc'), ('.nca', '.ncb', '.ncc'), etc.


class BlastProtDb(_BlastDb, Data):
    """Class for protein BLAST database files."""
    file_ext = 'blastdbp'
    allow_datatype_change = False
    composite_type = 'basic'

    def __init__(self, **kwd):
        Data.__init__(self, **kwd)
        # Component file comments are as in BlastNucDb except where noted
        self.add_composite_file('blastdb.phr', is_binary=True)
        self.add_composite_file('blastdb.pin', is_binary=True)
        self.add_composite_file('blastdb.psq', is_binary=True) # protein sequences
        self.add_composite_file('blastdb.phd', is_binary=True, optional=True)
        self.add_composite_file('blastdb.phi', is_binary=True, optional=True)
        self.add_composite_file('blastdb.pnd', is_binary=True, optional=True)
        self.add_composite_file('blastdb.pni', is_binary=True, optional=True)
        self.add_composite_file('blastdb.pog', is_binary=True, optional=True)
        self.add_composite_file('blastdb.psd', is_binary=True, optional=True)
        self.add_composite_file('blastdb.psi', is_binary=True, optional=True)
        # self.add_composite_file('blastdb.paa', is_binary=True, optional=True)
        # self.add_composite_file('blastdb.pab', is_binary=True, optional=True)
        # self.add_composite_file('blastdb.pac', is_binary=True, optional=True)
        # The last 3 lines should be repeated for each WriteDB column, with filename extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc.


class BlastDomainDb(_BlastDb, Data):
    """Class for domain BLAST database files."""
    file_ext = 'blastdbd'
    allow_datatype_change = False
    composite_type = 'basic'

    def __init__(self, **kwd):
        Data.__init__(self, **kwd)
        self.add_composite_file('blastdb.phr', is_binary=True)
        self.add_composite_file('blastdb.pin', is_binary=True)
        self.add_composite_file('blastdb.psq', is_binary=True)
        self.add_composite_file('blastdb.freq', is_binary=True, optional=True)
        self.add_composite_file('blastdb.loo', is_binary=True, optional=True)
        self.add_composite_file('blastdb.psd', is_binary=True, optional=True)
        self.add_composite_file('blastdb.psi', is_binary=True, optional=True)
        self.add_composite_file('blastdb.rps', is_binary=True, optional=True)
        self.add_composite_file('blastdb.aux', is_binary=True, optional=True)
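As a quick usage sketch (not part of the commit itself): merge is exposed as a staticmethod taking a list of per-chunk BLAST XML outputs plus an output path, so a split BLAST job could be stitched back together roughly like this, assuming part1.xml and part2.xml came from the same BLAST program and version (otherwise a ValueError is raised):

    from galaxy.datatypes.blast import BlastXml

    # Concatenates the <Iteration> blocks of the split outputs under the header
    # of the first file, then closes </BlastOutput_iterations> and </BlastOutput>.
    BlastXml.merge(["part1.xml", "part2.xml"], "merged.xml")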
4 changes: 2 additions & 2 deletions lib/galaxy/datatypes/sniff.py
@@ -261,14 +261,14 @@ def guess_ext( fname, sniff_order, is_multi_byte=False ):
    Returns an extension that can be used in the datatype factory to
    generate a data for the 'fname' file
    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> from galaxy.datatypes import registry
    >>> sample_conf = os.path.join(util.galaxy_directory(), "config", "datatypes_conf.xml.sample")
    >>> datatypes_registry = registry.Registry()
    >>> datatypes_registry.load_datatypes(root_dir=util.galaxy_directory(), config=sample_conf)
    >>> sniff_order = datatypes_registry.sniff_order
    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> guess_ext(fname, sniff_order)
    'xml'
    'blastxml'
    >>> fname = get_test_fname('interval.interval')
    >>> guess_ext(fname, sniff_order)
    'interval'
