Updates to localize & alac modules (#301)

This update discards the old `localize` module and renames the brand-spanking-new `cutout` module to `localize`. The `alac` module has been updated and simplified to support the new streaming conventions. Closes #297.
kevlar-dev · Nov 15, 2018 · 0e95536 · 0e95536
1 parent 0a13ca0
commit 0e95536
Show file tree

Hide file tree

Showing 17 changed files with 361 additions and 596 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,14 +10,14 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 - New flags for filtering gDNA cutouts or calls from specified sequences (see #285).
 - New filter that discards any contig/gDNA alignment with more than 4 mismatches (see #288).
 - A new feature that generates a Nodetable containing only variant-spanning k-mers to support re-counting k-mers and computing likelihood scores in low memory (see #289, #292).
-- A new `kevlar cutout` command that mimics the behavior of `kevlar localize` but does so much more efficiently (see #294).
 - A new `ProgressIndicator` class that provides gradually less frequent updates over time (see #299).
 
 ### Changed
 - Ported augfastx handling from `kevlar.seqio` module to a new Cython module (see #279).
 - Dynamic error model for likelihood calculations is now an configurable option (see #286).
 - Cleaned up overlap-related code with a new `ReadPair` class (see #283).
 - Updated `kevlar assemble`, `kevlar localize`, and `kevlar call` to accept streams of partitioned reads; previously, only reads for a single partition were permitted (see #294).
+- Overhauled the `kevlar localize` command to compute seed locations for all seeds in all partitions with a single BWA call, massively improving efficiency (see #294 and #301).
 
 ### Fixed
 - Minor bug with .gml output due to a change in the networkx package (see #278).

diff --git a/docs/cli.rst b/docs/cli.rst
@@ -47,16 +47,6 @@ kevlar assemble
    :prog: kevlar
    :path: assemble
 
-kevlar cutout
--------------
-
-.. argparse::
-   :module: kevlar.cli.__init__
-   :func: parser
-   :nodefault:
-   :prog: kevlar
-   :path: cutout
-
 kevlar localize
 ---------------
 

diff --git a/docs/install.rst b/docs/install.rst
@@ -35,7 +35,7 @@ The kevlar package **requires Python 3** and has several dependencies that are n
 - the `scipy library <https://www.scipy.org/>`_
 - the `khmer package <http://khmer.readthedocs.io/>`_
 
-Also, kevlar requires the `bwa <https://github.com/lh3/bwa>`_ and `samtools <https://github.com/samtools/samtools>`_ commands to be callable from your ``$PATH`` environmental variable.
+Also, kevlar requires the `bwa <https://github.com/lh3/bwa>`_ command to be callable from your ``$PATH`` environmental variable.
 
 When kevlar is installed from PyPI, most Python dependencies *should* handled automatically.
 But since kevlar currently relies on an unreleased version of khmer this last dependency must be installed manually.

diff --git a/kevlar/__init__.py b/kevlar/__init__.py
@@ -51,7 +51,6 @@
 from kevlar import effcount
 from kevlar import partition
 from kevlar import localize
-from kevlar import cutout
 from kevlar import call
 from kevlar import alac
 from kevlar import simplex

diff --git a/kevlar/alac.py b/kevlar/alac.py
@@ -7,123 +7,59 @@
 # licensed under the MIT license: see LICENSE.
 # -----------------------------------------------------------------------------
 
-from itertools import chain
-from queue import Queue
-import re
-import sys
-from threading import Thread
-from time import sleep
-
+from collections import defaultdict
 import kevlar
 from kevlar.assemble import assemble_fml_asm
 from kevlar.localize import localize
 from kevlar.call import call
 import khmer
-
-
-def make_call_from_reads(queue, idx, calls, refrfile, ksize=31, delta=50,
-                         seedsize=31, maxdiff=None, exclpattern=None,
-                         inclpattern=None, match=1, mismatch=2, gapopen=5,
-                         gapextend=0, min_ikmers=None, refrseqs=None,
-                         logstream=sys.stderr):
-        while True:
-            if queue.empty():
-                sleep(1)
-                continue
-            reads = queue.get()
-            cc = kevlar.seqio.partition_id(reads[0].name)
-            if cc is not None and int(cc) % 1000 == 0:  # pragma: no cover
-                message = '[kevlar::alac::make_call_from_reads'
-                message += ' (thread={:d})]'.format(idx)
-                message += ' grabbed partition={} from queue,'.format(cc)
-                message += ' queue size now {:d}'.format(queue.qsize())
-                print(message, file=logstream)
-
-            # Assemble partitioned reads into contig(s)
-            contigs = list(assemble_fml_asm(reads, logstream=logstream))
-            if min_ikmers is not None:
-                # Apply min ikmer filter if it's set
-                contigs = [
-                    c for c in contigs if len(c.annotations) >= min_ikmers
-                ]
-            if len(contigs) == 0:
-                queue.task_done()
-                continue
-
-            # Identify the genomic region(s) associated with each contig
-            localizer = localize(
-                contigs, refrfile, seedsize, delta=delta, maxdiff=maxdiff,
-                inclpattern=inclpattern, exclpattern=exclpattern,
-                refrseqs=refrseqs, logstream=logstream
-            )
-            targets = list(localizer)
-            if len(targets) == 0:
-                queue.task_done()
-                continue
-
-            # Align contigs to genomic targets to make variant calls
-            caller = call(targets, contigs, match, mismatch, gapopen,
-                          gapextend, ksize, refrfile)
-            for varcall in caller:
-                if cc:
-                    varcall.annotate('PART', cc)
-                calls.append(varcall)
-            queue.task_done()
+import re
+import sys
 
 
 def alac(pstream, refrfile, threads=1, ksize=31, maxreads=10000, delta=50,
          seedsize=31, maxdiff=None, inclpattern=None, exclpattern=None,
          match=1, mismatch=2, gapopen=5, gapextend=0, min_ikmers=None,
          maskfile=None, maskmem=1e6, maskmaxfpr=0.01, logstream=sys.stderr):
-    part_queue = Queue(maxsize=max(32, 12 * threads))
-
-    message = 'loading reference genome into memory'
-    print('[kevlar::alac]', message, file=logstream)
-    refrstream = kevlar.open(refrfile, 'r')
-    refrseqs = kevlar.seqio.parse_seq_dict(refrstream)
-    message = 'done! Loading partitioned reads...'
-    print('[kevlar::alac]', message, file=logstream)
-
-    call_lists = list()
-    for idx in range(threads):
-        thread_calls = list()
-        call_lists.append(thread_calls)
-        worker = Thread(
-            target=make_call_from_reads,
-            args=(
-                part_queue, idx, thread_calls, refrfile, ksize, delta,
-                seedsize, maxdiff, inclpattern, exclpattern, match, mismatch,
-                gapopen, gapextend, min_ikmers, refrseqs, logstream,
-            )
+    assembler = kevlar.assemble.assemble(pstream, maxreads=maxreads)
+    contigs_by_partition = defaultdict(list)
+    for partid, contig in assembler:
+        contigs_by_partition[partid].append(contig)
+
+    contigstream = [(pid, ctgs) for pid, ctgs in contigs_by_partition.items()]
+    targeter = kevlar.localize.localize(
+        contigstream, refrfile, seedsize=seedsize, delta=delta,
+        maxdiff=maxdiff, inclpattern=inclpattern, exclpattern=exclpattern,
+        logstream=logstream
+    )
+    targets_by_partition = defaultdict(list)
+    for partid, gdna in targeter:
+        targets_by_partition[partid].append(gdna)
+
+    calls = list()
+    for partid in sorted(targets_by_partition):
+        gdnalist = targets_by_partition[partid]
+        contigs = contigs_by_partition[partid]
+        caller = kevlar.call.call(
+            gdnalist, contigs, partid, match=match, mismatch=mismatch,
+            gapopen=gapopen, gapextend=gapextend, ksize=ksize,
+            refrfile=refrfile,
         )
-        worker.setDaemon(True)
-        worker.start()
-
-    for partid, partition in pstream:
-        reads = list(partition)
-        if len(reads) > maxreads:
-            message = 'skipping partition with {:d} reads'.format(len(reads))
-            print('[kevlar::alac] WARNING:', message, file=logstream)
-            continue
-        if partid is not None and int(partid) % 1000 == 0:  # pragma: no cover
-            message = '{} partitions scheduled for processing'.format(partid)
-            print('[kevlar::alac]', message, file=logstream)
-        part_queue.put(reads)
-
-    part_queue.join()
-    allcalls = sorted(chain(*call_lists), key=lambda c: (c.seqid, c.position))
+        partcalls = list(caller)
+        calls.extend(partcalls)
+    calls = sorted(calls, key=lambda c: (c.seqid, c.position))
     if maskfile:
         message = 'generating mask of variant-spanning k-mers'
         print('[kevlar::alac]', message, file=logstream)
         numtables = 4
         buckets = maskmem * khmer._buckets_per_byte['nodegraph'] / numtables
         mask = khmer.Nodetable(ksize, buckets, numtables)
-        for call in allcalls:
+        for call in calls:
             window = call.attribute('ALTWINDOW')
             if window is not None and len(window) >= ksize:
                 mask.consume(window)
         mask.save(maskfile)
-    for call in allcalls:
+    for call in calls:
         yield call
 
 
@@ -141,7 +77,7 @@ def main(args):
         exclpattern=args.exclude, match=args.match, mismatch=args.mismatch,
         gapopen=args.open, gapextend=args.extend, min_ikmers=args.min_ikmers,
         maskfile=args.gen_mask, maskmem=args.mask_mem,
-        maskmaxfpr=args.mask_max_fpr, logstream=args.logfile
+        maskmaxfpr=args.mask_max_fpr, logstream=args.logfile,
     )
 
     writer = kevlar.vcf.VCFWriter(

diff --git a/kevlar/assemble.py b/kevlar/assemble.py
@@ -43,10 +43,10 @@ def assemble(partstream, maxreads=10000, logstream=sys.stderr):
             if partid is not None:
                 newname += ' kvcc={}'.format(partid)
             contig.name = newname
-            yield contig
+            yield partid, contig
     message = 'processed {} partitions'.format(pn)
     message += ' and assembled {} contigs'.format(n)
-    print('[kevlar::assemble] ', message, file=logstream)
+    print('[kevlar::assemble]', message, file=logstream)
 
 
 def main(args):
@@ -58,5 +58,5 @@ def main(args):
     outstream = kevlar.open(args.out, 'w')
     assembler = assemble(pstream, maxreads=args.max_reads,
                          logstream=args.logfile)
-    for contig in assembler:
+    for partid, contig in assembler:
         kevlar.print_augmented_fastx(contig, outstream)
diff --git a/kevlar/call.py b/kevlar/call.py
@@ -81,9 +81,9 @@ def dedup(callstream):
             yield sortedcalls[0]
 
 
-def prelim_call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
-                gapextend=0, ksize=31, refrfile=None, debug=False, mindist=5,
-                logstream=sys.stderr):
+def prelim_call(targetlist, querylist, partid=None, match=1, mismatch=2,
+                gapopen=5, gapextend=0, ksize=31, refrfile=None, debug=False,
+                mindist=5, logstream=sys.stderr):
     """Implement the `kevlar call` procedure as a generator function."""
     for query in sorted(querylist, reverse=True, key=len):
         alignments = list()
@@ -108,6 +108,8 @@ def prelim_call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
                       alignment.contig.name, '\n', str(alignment), sep='',
                       end='\n\n', file=logstream)
             for varcall in alignment.call_variants(ksize, mindist, logstream):
+                if partid is not None:
+                    varcall.annotate('PART', partid)
                 if alignment.matedist:
                     varcall.annotate('MATEDIST', alignment.matedist)
                 yield varcall
@@ -162,7 +164,7 @@ def main(args):
         progress_indicator.update()
         contigs = contigs_by_partition[partid]
         caller = call(
-            gdnas, contigs, match=args.match, mismatch=args.mismatch,
+            gdnas, contigs, partid, match=args.match, mismatch=args.mismatch,
             gapopen=args.open, gapextend=args.extend, ksize=args.ksize,
             refrfile=args.refr, debug=args.debug, mindist=5,
             logstream=args.logfile

diff --git a/kevlar/cli/__init__.py b/kevlar/cli/__init__.py
@@ -22,7 +22,6 @@
 from . import gentrio
 from . import partition
 from . import localize
-from . import cutout
 from . import call
 from . import alac
 from . import simplex
@@ -41,7 +40,6 @@
     'mutate': kevlar.mutate.main,
     'gentrio': kevlar.gentrio.main,
     'partition': kevlar.partition.main,
-    'cutout': kevlar.cutout.main,
     'localize': kevlar.localize.main,
     'call': kevlar.call.main,
     'alac': kevlar.alac.main,
@@ -63,7 +61,6 @@
     'gentrio': gentrio.subparser,
     'partition': partition.subparser,
     'localize': localize.subparser,
-    'cutout': cutout.subparser,
     'call': call.subparser,
     'alac': alac.subparser,
     'simplex': simplex.subparser,

diff --git a/kevlar/cli/cutout.py b/kevlar/cli/cutout.py
diff --git a/kevlar/cli/localize.py b/kevlar/cli/localize.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 #
 # -----------------------------------------------------------------------------
-# Copyright (c) 2017 The Regents of the University of California
+# Copyright (c) 2018 The Regents of the University of California
 #
 # This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
 # licensed under the MIT license: see LICENSE.
@@ -14,17 +14,18 @@ def subparser(subparsers):
     """Define the `kevlar localize` command-line interface."""
 
     desc = """\
-    Given a reference genome and a contig (or set of contigs) assembled from
-    variant-related reads, retrieve the portion of the reference genome
-    corresponding to the variant. NOTE: this command relies on the `bwa`
-    program being in the PATH environmental variable.
+    For each partition, compute the reference target sequence to use for
+    variant calling. NOTE: this command relies on the `bwa` program being in
+    the PATH environmental variable.
     """
 
     subparser = subparsers.add_parser('localize', description=desc)
     subparser.add_argument('-d', '--delta', type=int, metavar='D',
                            default=50, help='retrieve the genomic interval '
                            'from the reference by extending beyond the span '
                            'of all k-mer starting positions by D bp')
+    subparser.add_argument('-p', '--part-id', type=str, metavar='ID',
+                           help='only localize partition "ID" in the input')
     subparser.add_argument('-o', '--out', metavar='FILE', default='-',
                            help='output file; default is terminal (stdout)')
     subparser.add_argument('-z', '--seed-size', type=int, metavar='Z',