Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Create an Index abstract base class #556

Merged
merged 42 commits into from Dec 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
dd1d351
Start moving LCA to Index
luizirber Oct 1, 2018
291beee
rename add_node to insert
luizirber Oct 13, 2018
a39a74a
simple test
luizirber Oct 13, 2018
2769ae9
first pass definition of search and gather - tests pass, at least!
ctb Jan 3, 2019
6e2e504
start adding code for LinearIndex.search
ctb Jan 5, 2019
412e101
an initial test of LinearIndex.search
ctb Jan 5, 2019
5f197d3
implement save & load for LinearIndex
ctb Sep 6, 2019
723d3df
add test for LinearIndex.load
ctb Sep 6, 2019
577c9fa
implemented & tested LinearIndex.gather
ctb Sep 6, 2019
eb41b93
implement LinearIndex in load_databases and search functions
ctb Sep 6, 2019
2a13428
implemented LinearIndex for gather, too
ctb Sep 6, 2019
0803af3
implemented search in LCA db
ctb Sep 6, 2019
9406e10
implemented gather on LCA DBs
ctb Sep 6, 2019
9137212
implemented gather on SBT
ctb Sep 6, 2019
100cbd9
implemented search on SBTs
ctb Sep 6, 2019
980a470
removed conditionals in search & gather in favor of Index interface
ctb Sep 6, 2019
622ddee
fix remaining tests for search & gather
ctb Sep 6, 2019
dd55de6
remove some debugging code
ctb Sep 6, 2019
6f2e4c2
fix my errant default parameter ways
ctb Sep 8, 2019
f5e622b
cleanup and simplification of gather code
ctb Sep 8, 2019
088395d
significant refactor of gather code
ctb Sep 8, 2019
c281e79
further refactoring and simplification
ctb Sep 8, 2019
e82cffc
rely on 'Index.gather' returning actual matches
ctb Sep 8, 2019
ef9b900
remove duplicate SearchResult, clean up & rationalize SearchResult an…
ctb Sep 8, 2019
df4b911
display full order of sigs in failed tests
ctb Sep 8, 2019
2540edd
Merge branch 'master' into refactor/index
luizirber Oct 23, 2019
95ddb7f
fix heisenbug in tests
luizirber Oct 23, 2019
5e4fff9
Merge branch 'master' of github.com:dib-lab/sourmash into refactor/index
ctb Dec 12, 2019
5984d3d
add signatures() iterator to Index objects
ctb Dec 12, 2019
931737e
move search, gather functions into base Index class
ctb Dec 13, 2019
da7c979
fix lca search ignore abundance
ctb Dec 13, 2019
3fa8de3
add function doc
ctb Dec 13, 2019
d3dc2fb
add signatures() method to both LCA and SBT indices
ctb Dec 13, 2019
83ad1b9
Update tests/test_sbt.py
ctb Dec 14, 2019
c80ef46
SBT.insert now matches Index.insert, while SBT.add_node does what ins…
ctb Dec 14, 2019
c994294
Merge branch 'refactor/index_cleanup' of github.com:dib-lab/sourmash …
ctb Dec 14, 2019
b0af24d
clean up signature loading
ctb Dec 14, 2019
2e8c3ab
round out Index method tests, sort of :)
ctb Dec 15, 2019
c6c0213
[WIP] add signatures() method to both LCA and SBT indices (#796)
ctb Dec 15, 2019
2b3e447
Merge branch 'master' into refactor/index
ctb Dec 15, 2019
689dcf5
add more scaled relationship tests in lca DB
ctb Dec 15, 2019
48abd10
Merge branch 'refactor/index' of github.com:dib-lab/sourmash into ref…
ctb Dec 15, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion setup.py
Expand Up @@ -64,7 +64,8 @@
language="c++",
extra_compile_args=EXTRA_COMPILE_ARGS,
extra_link_args=EXTRA_LINK_ARGS)],
"install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy', 'matplotlib', 'scipy'],
"install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy',
'matplotlib', 'scipy', "deprecation>=2.0.6"],
"setup_requires": ['Cython>=0.25.2', "setuptools>=38.6.0",
'setuptools_scm', 'setuptools_scm_git_archive'],
"use_scm_version": {"write_to": "sourmash/version.py"},
Expand Down
25 changes: 14 additions & 11 deletions sourmash/commands.py
Expand Up @@ -450,8 +450,7 @@ def index(args):
ss.minhash = ss.minhash.downsample_scaled(args.scaled)
scaleds.add(ss.minhash.scaled)

leaf = SigLeaf(ss.md5sum(), ss)
tree.add_node(leaf)
tree.insert(ss)
n += 1

if not ss:
Expand Down Expand Up @@ -545,6 +544,10 @@ def search(args):
not args.containment,
args.traverse_directory)

# forcibly ignore abundances if query has no abundances
if not query.minhash.track_abundance:
args.ignore_abundance = True

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)
Expand All @@ -570,7 +573,7 @@ def search(args):
print_results("---------- -----")
for sr in results[:n_matches]:
pct = '{:.1f}%'.format(sr.similarity*100)
name = sr.match_sig._display_name(60)
name = sr.match._display_name(60)
print_results('{:>6} {}', pct, name)

if args.best_only:
Expand All @@ -583,14 +586,14 @@ def search(args):
w.writeheader()
for sr in results:
d = dict(sr._asdict())
del d['match_sig']
del d['match']
w.writerow(d)

# save matching signatures upon request
if args.save_matches:
outname = args.save_matches.name
notify('saving all matched signatures to "{}"', outname)
sig.save_signatures([ sr.match_sig for sr in results ],
sig.save_signatures([ sr.match for sr in results ],
args.save_matches)


Expand Down Expand Up @@ -758,7 +761,7 @@ def gather(args):
pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)
average_abund ='{:.1f}'.format(result.average_abund)
name = result.leaf._display_name(40)
name = result.match._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
print_results('{:9} {:>7} {:>7} {:>9} {}',
Expand Down Expand Up @@ -786,13 +789,13 @@ def gather(args):
w.writeheader()
for result in found:
d = dict(result._asdict())
del d['leaf'] # actual signature not in CSV.
del d['match'] # actual signature not in CSV.
w.writerow(d)

if found and args.save_matches:
outname = args.save_matches.name
notify('saving all matches to "{}"', outname)
sig.save_signatures([ r.leaf for r in found ], args.save_matches)
sig.save_signatures([ r.match for r in found ], args.save_matches)

if args.output_unassigned:
if not len(query.minhash):
Expand Down Expand Up @@ -906,7 +909,7 @@ def multigather(args):
pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)
average_abund ='{:.1f}'.format(result.average_abund)
name = result.leaf._display_name(40)
name = result.match._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
print_results('{:9} {:>7} {:>7} {:>9} {}',
Expand Down Expand Up @@ -941,14 +944,14 @@ def multigather(args):
w.writeheader()
for result in found:
d = dict(result._asdict())
del d['leaf'] # actual signature not in CSV.
del d['match'] # actual signature not in CSV.
w.writerow(d)

output_matches = output_base + '.matches.sig'
with open(output_matches, 'wt') as fp:
outname = output_matches
notify('saving all matches to "{}"', outname)
sig.save_signatures([ r.leaf for r in found ], fp)
sig.save_signatures([ r.match for r in found ], fp)

output_unassigned = output_base + '.unassigned.sig'
with open(output_unassigned, 'wt') as fp:
Expand Down
127 changes: 127 additions & 0 deletions sourmash/index.py
@@ -0,0 +1,127 @@
"An Abstract Base Class for collections of signatures."

from abc import ABCMeta, abstractmethod
from collections import namedtuple

# compatible with Python 2 *and* 3:
ABC = ABCMeta("ABC", (object,), {"__slots__": ()})


class Index(ABC):
@abstractmethod
def signatures(self):
"Return an iterator over all signatures in the Index object."

@abstractmethod
def insert(self, signature):
""" """

@abstractmethod
def save(self, path, storage=None, sparseness=0.0, structure_only=False):
""" """

@classmethod
@abstractmethod
def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True):
""" """

def find(self, search_fn, *args, **kwargs):
"""Use search_fn to find matching signatures in the index.

search_fn(other_sig, *args) should return a boolean that indicates
whether other_sig is a match.

Returns a list.
"""

matches = []

for node in self.signatures():
if search_fn(node, *args):
matches.append(node)
return matches

def search(self, query, *args, **kwargs):
"""Return set of matches with similarity above 'threshold'.

Results will be sorted by similarity, highest to lowest.

Optional arguments accepted by all Index subclasses:
* do_containment: default False. If True, use Jaccard containment.
* best_only: default False. If True, allow optimizations that
may. May discard matches better than threshold, but first match
is guaranteed to be best.
* ignore_abundance: default False. If True, and query signature
and database support k-mer abundances, ignore those abundances.

Note, the "best only" hint is ignored by LinearIndex.
"""

# check arguments
if 'threshold' not in kwargs:
raise TypeError("'search' requires 'threshold'")
threshold = kwargs['threshold']

do_containment = kwargs.get('do_containment', False)
ignore_abundance = kwargs.get('ignore_abundance', False)
ctb marked this conversation as resolved.
Show resolved Hide resolved

# configure search - containment? ignore abundance?
if do_containment:
query_match = lambda x: query.contained_by(x, downsample=True)
else:
query_match = lambda x: query.similarity(
x, downsample=True, ignore_abundance=ignore_abundance)

# do the actual search:
matches = []

for ss in self.signatures():
similarity = query_match(ss)
if similarity >= threshold:
matches.append((similarity, ss, self.filename))

# sort!
matches.sort(key=lambda x: -x[0])
return matches

def gather(self, query, *args, **kwargs):
"Return the match with the best Jaccard containment in the Index."
results = []
for ss in self.signatures():
cont = query.minhash.containment_ignore_maxhash(ss.minhash)
if cont:
results.append((cont, ss, self.filename))

results.sort(reverse=True, key=lambda x: (x[0], x[1].name()))

return results


class LinearIndex(Index):
    """An in-memory Index backed by a plain list of signatures."""

    def __init__(self, _signatures=None, filename=None):
        # copy the given signatures (if any) into our own list.
        self._signatures = list(_signatures) if _signatures else []
        self.filename = filename

    def signatures(self):
        "Return an iterator over the stored signatures."
        return iter(self._signatures)

    def __len__(self):
        return len(self._signatures)

    def insert(self, node):
        "Append a signature to the collection."
        self._signatures.append(node)

    def save(self, path):
        "Write all signatures to 'path' as a signature file."
        from .signature import save_signatures
        with open(path, 'wt') as fp:
            save_signatures(self.signatures(), fp)

    @classmethod
    def load(cls, location):
        "Load a LinearIndex from a signature file at 'location'."
        from .signature import load_signatures
        sigiter = load_signatures(location)
        return LinearIndex(sigiter, filename=location)
86 changes: 70 additions & 16 deletions sourmash/lca/lca_utils.py
Expand Up @@ -19,6 +19,7 @@

from .._minhash import get_max_hash_for_scaled
from ..logging import notify, error, debug
from ..index import Index

# type to store an element in a taxonomic lineage
LineagePair = namedtuple('LineagePair', ['rank', 'name'])
Expand Down Expand Up @@ -138,7 +139,7 @@ def find_lca(tree):
return tuple(lineage), len(node)


class LCA_Database(object):
class LCA_Database(Index):
"""
Wrapper class for taxonomic database.

Expand All @@ -163,6 +164,12 @@ def __init__(self):
def __repr__(self):
return "LCA_Database('{}')".format(self.filename)

def signatures(self):
from .. import SourmashSignature
self._create_signatures()
for v in self._signatures.values():
yield SourmashSignature(v)

def load(self, db_name):
"Load from a JSON file."
xopen = open
Expand Down Expand Up @@ -261,10 +268,48 @@ def save(self, db_name):

json.dump(save_d, fp)

def search(self, query, *args, **kwargs):
# check arguments
if 'threshold' not in kwargs:
raise TypeError("'search' requires 'threshold'")
threshold = kwargs['threshold']
do_containment = kwargs.get('do_containment', False)
ignore_abundance = kwargs.get('ignore_abundance', True)
if not ignore_abundance:
raise TypeError("'search' on LCA databases does not use abundance")

results = []
for x in self.find_signatures(query.minhash, threshold, do_containment):
(score, match, filename) = x
results.append((score, match, filename))

results.sort(key=lambda x: -x[0])
return results

def gather(self, query, *args, **kwargs):
results = []
for x in self.find_signatures(query.minhash, 0.0,
containment=True, ignore_scaled=True):
(score, match, filename) = x
if score:
results.append((score, match, filename))

return results

    def insert(self, node):
        # inserting individual signatures into an LCA database is not
        # supported.
        raise NotImplementedError

    def find(self, search_fn, *args, **kwargs):
        # the generic Index.find protocol is not supported here; use
        # find_signatures instead.
        raise NotImplementedError

def downsample_scaled(self, scaled):
"""
Downsample to the provided scaled value, i.e. eliminate all hashes
that don't fall in the required range.

NOTE: we probably need to invalidate some of the dynamically
calculated members of this object, like _signatures, when we do this.
But we aren't going to right now.
"""
if scaled == self.scaled:
return
Expand Down Expand Up @@ -294,27 +339,37 @@ def get_lineage_assignments(self, hashval):

return x

def find(self, minhash, threshold, containment=False, ignore_scaled=False):
"""
Do a Jaccard similarity or containment search.
"""
# make sure we're looking at the same scaled value as database
if self.scaled > minhash.scaled:
minhash = minhash.downsample_scaled(self.scaled)
elif self.scaled < minhash.scaled and not ignore_scaled:
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled))
def _create_signatures(self):
"Create a _signatures member dictionary that contains {idx: minhash}."
from .. import MinHash

if not hasattr(self, '_signatures'):
minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled)

if not hasattr(self, 'signatures'):
debug('creating signatures for LCA DB...')
sigd = defaultdict(minhash.copy_and_clear)

for (k, v) in self.hashval_to_idx.items():
for vv in v:
sigd[vv].add_hash(k)

self.signatures = sigd
self._signatures = sigd

debug('=> {} signatures!', len(self._signatures))

def find_signatures(self, minhash, threshold, containment=False,
ignore_scaled=False):
"""
Do a Jaccard similarity or containment search.
"""
# make sure we're looking at the same scaled value as database
if self.scaled > minhash.scaled:
minhash = minhash.downsample_scaled(self.scaled)
elif self.scaled < minhash.scaled and not ignore_scaled:
# note that containment can be calculated w/o matching scaled.
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled))

debug('=> {} signatures!', len(self.signatures))
self._create_signatures()

# build idx_to_ident from ident_to_idx
if not hasattr(self, 'idx_to_ident'):
Expand All @@ -340,7 +395,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False):
name = self.ident_to_name[ident]
debug('looking at {} ({})', ident, name)

match_mh = self.signatures[idx]
match_mh = self._signatures[idx]
match_size = len(match_mh)

debug('count: {}; query_mins: {}; match size: {}',
Expand All @@ -354,11 +409,10 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False):
debug('score: {} (containment? {})', score, containment)

if score >= threshold:
# reconstruct signature... ugh.
from .. import SourmashSignature
match_sig = SourmashSignature(match_mh, name=name)

yield score, match_sig, match_sig.md5sum(), self.filename, name
yield score, match_sig, self.filename


def load_single_database(filename, verbose=False):
Expand Down