Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Create an Index abstract base class #556

Merged
merged 42 commits into from Dec 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
dd1d351
Start moving LCA to Index
luizirber Oct 1, 2018
291beee
rename add_node to insert
luizirber Oct 13, 2018
a39a74a
simple test
luizirber Oct 13, 2018
2769ae9
first pass definition of search and gather - tests pass, at least!
ctb Jan 3, 2019
6e2e504
start adding code for LinearIndex.search
ctb Jan 5, 2019
412e101
an initial test of LinearIndex.search
ctb Jan 5, 2019
5f197d3
implement save & load for LinearIndex
ctb Sep 6, 2019
723d3df
add test for LinearIndex.load
ctb Sep 6, 2019
577c9fa
implemented & tested LinearIndex.gather
ctb Sep 6, 2019
eb41b93
implement LinearIndex in load_databases and search functions
ctb Sep 6, 2019
2a13428
implemented LinearIndex for gather, too
ctb Sep 6, 2019
0803af3
implemented search in LCA db
ctb Sep 6, 2019
9406e10
implemented gather on LCA DBs
ctb Sep 6, 2019
9137212
implemented gather on SBT
ctb Sep 6, 2019
100cbd9
implemented search on SBTs
ctb Sep 6, 2019
980a470
removed conditionals in search & gather in favor of Index interface
ctb Sep 6, 2019
622ddee
fix remaining tests for search & gather
ctb Sep 6, 2019
dd55de6
remove some debugging code
ctb Sep 6, 2019
6f2e4c2
fix my errant default parameter ways
ctb Sep 8, 2019
f5e622b
cleanup and simplification of gather code
ctb Sep 8, 2019
088395d
significant refactor of gather code
ctb Sep 8, 2019
c281e79
further refactoring and simplification
ctb Sep 8, 2019
e82cffc
rely on 'Index.gather' returning actual matches
ctb Sep 8, 2019
ef9b900
remove duplicate SearchResult, clean up & rationalize SearchResult an…
ctb Sep 8, 2019
df4b911
display full order of sigs in failed tests
ctb Sep 8, 2019
2540edd
Merge branch 'master' into refactor/index
luizirber Oct 23, 2019
95ddb7f
fix heisenbug in tests
luizirber Oct 23, 2019
5e4fff9
Merge branch 'master' of github.com:dib-lab/sourmash into refactor/index
ctb Dec 12, 2019
5984d3d
add signatures() iterator to Index objects
ctb Dec 12, 2019
931737e
move search, gather functions into base Index class
ctb Dec 13, 2019
da7c979
fix lca search ignore abundance
ctb Dec 13, 2019
3fa8de3
add function doc
ctb Dec 13, 2019
d3dc2fb
add signatures() method to both LCA and SBT indices
ctb Dec 13, 2019
83ad1b9
Update tests/test_sbt.py
ctb Dec 14, 2019
c80ef46
SBT.insert now matches Index.insert, while SBT.add_node does what ins…
ctb Dec 14, 2019
c994294
Merge branch 'refactor/index_cleanup' of github.com:dib-lab/sourmash …
ctb Dec 14, 2019
b0af24d
clean up signature loading
ctb Dec 14, 2019
2e8c3ab
round out Index method tests, sort of :)
ctb Dec 15, 2019
c6c0213
[WIP] add signatures() method to both LCA and SBT indices (#796)
ctb Dec 15, 2019
2b3e447
Merge branch 'master' into refactor/index
ctb Dec 15, 2019
689dcf5
add more scaled relationship tests in lca DB
ctb Dec 15, 2019
48abd10
Merge branch 'refactor/index' of github.com:dib-lab/sourmash into ref…
ctb Dec 15, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion setup.py
Expand Up @@ -64,7 +64,8 @@
language="c++",
extra_compile_args=EXTRA_COMPILE_ARGS,
extra_link_args=EXTRA_LINK_ARGS)],
"install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy', 'matplotlib', 'scipy'],
"install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy',
'matplotlib', 'scipy', "deprecation>=2.0.6"],
"setup_requires": ['Cython>=0.25.2', "setuptools>=38.6.0",
'setuptools_scm', 'setuptools_scm_git_archive'],
"use_scm_version": {"write_to": "sourmash/version.py"},
Expand Down
25 changes: 14 additions & 11 deletions sourmash/commands.py
Expand Up @@ -450,8 +450,7 @@ def index(args):
ss.minhash = ss.minhash.downsample_scaled(args.scaled)
scaleds.add(ss.minhash.scaled)

leaf = SigLeaf(ss.md5sum(), ss)
tree.add_node(leaf)
tree.insert(ss)
n += 1

if not ss:
Expand Down Expand Up @@ -545,6 +544,10 @@ def search(args):
not args.containment,
args.traverse_directory)

# forcibly ignore abundances if query has no abundances
if not query.minhash.track_abundance:
args.ignore_abundance = True

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)
Expand All @@ -570,7 +573,7 @@ def search(args):
print_results("---------- -----")
for sr in results[:n_matches]:
pct = '{:.1f}%'.format(sr.similarity*100)
name = sr.match_sig._display_name(60)
name = sr.match._display_name(60)
print_results('{:>6} {}', pct, name)

if args.best_only:
Expand All @@ -583,14 +586,14 @@ def search(args):
w.writeheader()
for sr in results:
d = dict(sr._asdict())
del d['match_sig']
del d['match']
w.writerow(d)

# save matching signatures upon request
if args.save_matches:
outname = args.save_matches.name
notify('saving all matched signatures to "{}"', outname)
sig.save_signatures([ sr.match_sig for sr in results ],
sig.save_signatures([ sr.match for sr in results ],
args.save_matches)


Expand Down Expand Up @@ -758,7 +761,7 @@ def gather(args):
pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)
average_abund ='{:.1f}'.format(result.average_abund)
name = result.leaf._display_name(40)
name = result.match._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
print_results('{:9} {:>7} {:>7} {:>9} {}',
Expand Down Expand Up @@ -786,13 +789,13 @@ def gather(args):
w.writeheader()
for result in found:
d = dict(result._asdict())
del d['leaf'] # actual signature not in CSV.
del d['match'] # actual signature not in CSV.
w.writerow(d)

if found and args.save_matches:
outname = args.save_matches.name
notify('saving all matches to "{}"', outname)
sig.save_signatures([ r.leaf for r in found ], args.save_matches)
sig.save_signatures([ r.match for r in found ], args.save_matches)

if args.output_unassigned:
if not len(query.minhash):
Expand Down Expand Up @@ -906,7 +909,7 @@ def multigather(args):
pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)
average_abund ='{:.1f}'.format(result.average_abund)
name = result.leaf._display_name(40)
name = result.match._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
print_results('{:9} {:>7} {:>7} {:>9} {}',
Expand Down Expand Up @@ -941,14 +944,14 @@ def multigather(args):
w.writeheader()
for result in found:
d = dict(result._asdict())
del d['leaf'] # actual signature not in CSV.
del d['match'] # actual signature not in CSV.
w.writerow(d)

output_matches = output_base + '.matches.sig'
with open(output_matches, 'wt') as fp:
outname = output_matches
notify('saving all matches to "{}"', outname)
sig.save_signatures([ r.leaf for r in found ], fp)
sig.save_signatures([ r.match for r in found ], fp)

output_unassigned = output_base + '.unassigned.sig'
with open(output_unassigned, 'wt') as fp:
Expand Down
127 changes: 127 additions & 0 deletions sourmash/index.py
@@ -0,0 +1,127 @@
"An Abstract Base Class for collections of signatures."

from abc import ABCMeta, abstractmethod
from collections import namedtuple

# compatible with Python 2 *and* 3:
ABC = ABCMeta("ABC", (object,), {"__slots__": ()})


class Index(ABC):
@abstractmethod
def signatures(self):
"Return an iterator over all signatures in the Index object."

@abstractmethod
def insert(self, signature):
""" """

@abstractmethod
def save(self, path, storage=None, sparseness=0.0, structure_only=False):
""" """

@classmethod
@abstractmethod
def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True):
""" """

def find(self, search_fn, *args, **kwargs):
"""Use search_fn to find matching signatures in the index.

search_fn(other_sig, *args) should return a boolean that indicates
whether other_sig is a match.

Returns a list.
"""

matches = []

for node in self.signatures():
if search_fn(node, *args):
matches.append(node)
return matches

def search(self, query, *args, **kwargs):
"""Return set of matches with similarity above 'threshold'.

Results will be sorted by similarity, highest to lowest.

Optional arguments accepted by all Index subclasses:
* do_containment: default False. If True, use Jaccard containment.
* best_only: default False. If True, allow optimizations that
may. May discard matches better than threshold, but first match
is guaranteed to be best.
* ignore_abundance: default False. If True, and query signature
and database support k-mer abundances, ignore those abundances.

Note, the "best only" hint is ignored by LinearIndex.
"""

# check arguments
if 'threshold' not in kwargs:
raise TypeError("'search' requires 'threshold'")
threshold = kwargs['threshold']

do_containment = kwargs.get('do_containment', False)
ignore_abundance = kwargs.get('ignore_abundance', False)
ctb marked this conversation as resolved.
Show resolved Hide resolved

# configure search - containment? ignore abundance?
if do_containment:
query_match = lambda x: query.contained_by(x, downsample=True)
else:
query_match = lambda x: query.similarity(
x, downsample=True, ignore_abundance=ignore_abundance)

# do the actual search:
matches = []

for ss in self.signatures():
similarity = query_match(ss)
if similarity >= threshold:
matches.append((similarity, ss, self.filename))

# sort!
matches.sort(key=lambda x: -x[0])
return matches

def gather(self, query, *args, **kwargs):
"Return the match with the best Jaccard containment in the Index."
results = []
for ss in self.signatures():
cont = query.minhash.containment_ignore_maxhash(ss.minhash)
if cont:
results.append((cont, ss, self.filename))

results.sort(reverse=True, key=lambda x: (x[0], x[1].name()))

return results


class LinearIndex(Index):
    """An in-memory Index backed by a plain list of signatures."""

    def __init__(self, _signatures=None, filename=None):
        # copy the given signatures (if any) into our own list.
        self._signatures = list(_signatures) if _signatures else []
        self.filename = filename

    def signatures(self):
        "Return an iterator over the stored signatures."
        return iter(self._signatures)

    def __len__(self):
        return len(self._signatures)

    def insert(self, node):
        "Append a signature to the collection."
        self._signatures.append(node)

    def save(self, path):
        "Write all signatures to 'path' as a signature file."
        from .signature import save_signatures
        with open(path, 'wt') as fp:
            save_signatures(self.signatures(), fp)

    @classmethod
    def load(cls, location):
        "Load a LinearIndex from a signature file at 'location'."
        from .signature import load_signatures
        sigiter = load_signatures(location)
        return LinearIndex(sigiter, filename=location)
86 changes: 70 additions & 16 deletions sourmash/lca/lca_utils.py
Expand Up @@ -19,6 +19,7 @@

from .._minhash import get_max_hash_for_scaled
from ..logging import notify, error, debug
from ..index import Index

# type to store an element in a taxonomic lineage
LineagePair = namedtuple('LineagePair', ['rank', 'name'])
Expand Down Expand Up @@ -138,7 +139,7 @@ def find_lca(tree):
return tuple(lineage), len(node)


class LCA_Database(object):
class LCA_Database(Index):
"""
Wrapper class for taxonomic database.

Expand All @@ -163,6 +164,12 @@ def __init__(self):
def __repr__(self):
return "LCA_Database('{}')".format(self.filename)

def signatures(self):
from .. import SourmashSignature
self._create_signatures()
for v in self._signatures.values():
yield SourmashSignature(v)

def load(self, db_name):
"Load from a JSON file."
xopen = open
Expand Down Expand Up @@ -261,10 +268,48 @@ def save(self, db_name):

json.dump(save_d, fp)

def search(self, query, *args, **kwargs):
# check arguments
if 'threshold' not in kwargs:
raise TypeError("'search' requires 'threshold'")
threshold = kwargs['threshold']
do_containment = kwargs.get('do_containment', False)
ignore_abundance = kwargs.get('ignore_abundance', True)
if not ignore_abundance:
raise TypeError("'search' on LCA databases does not use abundance")

results = []
for x in self.find_signatures(query.minhash, threshold, do_containment):
(score, match, filename) = x
results.append((score, match, filename))

results.sort(key=lambda x: -x[0])
return results

def gather(self, query, *args, **kwargs):
results = []
for x in self.find_signatures(query.minhash, 0.0,
containment=True, ignore_scaled=True):
(score, match, filename) = x
if score:
results.append((score, match, filename))

return results

    def insert(self, node):
        # inserting individual signatures into an LCA database is not
        # supported.
        raise NotImplementedError

    def find(self, search_fn, *args, **kwargs):
        # the generic Index.find protocol is not supported here; use
        # find_signatures instead.
        raise NotImplementedError

def downsample_scaled(self, scaled):
"""
Downsample to the provided scaled value, i.e. eliminate all hashes
that don't fall in the required range.

NOTE: we probably need to invalidate some of the dynamically
calculated members of this object, like _signatures, when we do this.
But we aren't going to right now.
"""
if scaled == self.scaled:
return
Expand Down Expand Up @@ -294,27 +339,37 @@ def get_lineage_assignments(self, hashval):

return x

def find(self, minhash, threshold, containment=False, ignore_scaled=False):
"""
Do a Jaccard similarity or containment search.
"""
# make sure we're looking at the same scaled value as database
if self.scaled > minhash.scaled:
minhash = minhash.downsample_scaled(self.scaled)
elif self.scaled < minhash.scaled and not ignore_scaled:
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled))
def _create_signatures(self):
"Create a _signatures member dictionary that contains {idx: minhash}."
from .. import MinHash

if not hasattr(self, '_signatures'):
minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled)

if not hasattr(self, 'signatures'):
debug('creating signatures for LCA DB...')
sigd = defaultdict(minhash.copy_and_clear)

for (k, v) in self.hashval_to_idx.items():
for vv in v:
sigd[vv].add_hash(k)

self.signatures = sigd
self._signatures = sigd

debug('=> {} signatures!', len(self._signatures))

def find_signatures(self, minhash, threshold, containment=False,
ignore_scaled=False):
"""
Do a Jaccard similarity or containment search.
"""
# make sure we're looking at the same scaled value as database
if self.scaled > minhash.scaled:
minhash = minhash.downsample_scaled(self.scaled)
elif self.scaled < minhash.scaled and not ignore_scaled:
# note that containment can be calculated w/o matching scaled.
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled))

debug('=> {} signatures!', len(self.signatures))
self._create_signatures()

# build idx_to_ident from ident_to_idx
if not hasattr(self, 'idx_to_ident'):
Expand All @@ -340,7 +395,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False):
name = self.ident_to_name[ident]
debug('looking at {} ({})', ident, name)

match_mh = self.signatures[idx]
match_mh = self._signatures[idx]
match_size = len(match_mh)

debug('count: {}; query_mins: {}; match size: {}',
Expand All @@ -354,11 +409,10 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False):
debug('score: {} (containment? {})', score, containment)

if score >= threshold:
# reconstruct signature... ugh.
from .. import SourmashSignature
match_sig = SourmashSignature(match_mh, name=name)

yield score, match_sig, match_sig.md5sum(), self.filename, name
yield score, match_sig, self.filename


def load_single_database(filename, verbose=False):
Expand Down