Merge pull request #80 from dib-lab/feature/partition

Introduce "kevlar partition", fix assert bug
kevlar-dev · May 31, 2017 · 3375f8a · 3375f8a
2 parents 3130636 + e366fd5
commit 3375f8a
Show file tree

Hide file tree

Showing 35 changed files with 872 additions and 514 deletions.
diff --git a/kevlar/__init__.py b/kevlar/__init__.py
@@ -3,19 +3,35 @@
 # -----------------------------------------------------------------------------
 # Copyright (c) 2016 The Regents of the University of California
 #
-# This file is part of kevlar (http://github.com/standage/kevlar) and is
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
 # licensed under the MIT license: see LICENSE.
 # -----------------------------------------------------------------------------
 
+# Core libraries
+from __future__ import print_function
 try:
     import __builtin__ as builtins
-except:  # pragma: no cover
+except:
     import builtins
 from collections import namedtuple
+from gzip import open as gzopen
+import re
 import sys
+
+# Third-party libraries
+import khmer
+import screed
+
+# Internal modules
 from kevlar import seqio
 from kevlar import overlap
+from kevlar import counting
+from kevlar import sketch
 from kevlar.seqio import parse_augmented_fastq, print_augmented_fastq
+from kevlar.variantset import VariantSet
+from kevlar.timer import Timer
+
+# Subcommands and command-line interface
 from kevlar import dump
 from kevlar import novel
 from kevlar import collect
@@ -24,12 +40,8 @@
 from kevlar import mutate
 from kevlar import assemble
 from kevlar import count
+from kevlar import partition
 from kevlar import cli
-from kevlar.variantset import VariantSet
-from kevlar.timer import Timer
-from gzip import open as gzopen
-import khmer
-import screed
 
 from kevlar._version import get_versions
 __version__ = get_versions()['version']
@@ -49,54 +61,6 @@ def open(filename, mode):
     return openfunc(filename, mode)
 
 
-def sketch_autoload(infile, count=True, graph=False,
-                    ksize=31, table_size=1e4, num_tables=4,
-                    num_bands=0, band=0):
-    """
-    Use file extension to conditionally load sketch into memory.
-
-    If the file extension is one of the following, treat the file as a sketch
-    that has been written to disk and load it with `kevlar.load_sketch`.
-    Sketch attributes such as ksize, table size, and number of tables will
-    be set automatically.
-
-    - `.ct` or `.counttable`: `Counttable`
-    - `.nt` or `.nodetable`: `Nodetable`
-    - `.cg` or `.countgraph`: `Countgraph`
-    - `.ng` or `.nodegraph`: `Nodegraph`
-
-    Otherwise, a sketch will be created using the specified arguments and the
-    input file will be treated as a Fasta/Fastq file to be loaded with
-    `.consume_seqfile` or `.consume_seqfile_banding`.
-    """
-    sketch_extensions = (
-        '.ct', '.counttable', '.nt', '.nodetable',
-        '.cg', '.countgraph', '.ng', '.nodegraph',
-    )
-
-    if infile.endswith(sketch_extensions):
-        return load_sketch(infile, count=count, graph=graph, smallcount=False)
-    else:
-        sketch = allocate_sketch(ksize, table_size, num_tables, count=count,
-                                 graph=graph, smallcount=False)
-        if num_bands > 1:
-            sketch.consume_seqfile_banding(infile, num_bands, band)
-        else:
-            sketch.consume_seqfile(infile)
-        return sketch
-
-
-def calc_fpr(table):
-    """Stolen shamelessly from khmer/__init__.py"""
-    sizes = table.hashsizes()
-    n_ht = float(len(sizes))
-    occupancy = float(table.n_occupied())
-    min_size = min(sizes)
-    fp_one = occupancy / min_size
-    fp_all = fp_one ** n_ht
-    return fp_all
-
-
 def revcom(seq):
     return screed.dna.reverse_complement(str(seq))
 
@@ -113,48 +77,32 @@ def same_seq(seq1, seq2, seq2revcom=None):
     return seq1 == seq2 or seq1 == seq2revcom
 
 
-def load_sketch(filename, count=False, graph=False, smallcount=False):
-    """Convenience function for loading a sketch from the specified file."""
-    if count and graph:
-        if smallcount:
-            createfunc = khmer._SmallCountgraph
-        else:
-            createfunc = khmer._Countgraph
-    elif count and not graph:
-        if smallcount:
-            createfunc = khmer._SmallCounttable
-        else:
-            createfunc = khmer._Counttable
-    elif not count and graph:
-        createfunc = khmer._Nodegraph
-    elif not count and not graph:
-        createfunc = khmer._Nodetable
-
-    sketch = createfunc(1, [1])
-    sketch.load(filename)
-    return sketch
-
-
-def allocate_sketch(ksize, target_tablesize, num_tables=4, count=False,
-                    graph=False, smallcount=False):
-    """Convenience function for allocating memory for a new sketch."""
-    if count and graph:
-        if smallcount:
-            createfunc = khmer.SmallCountgraph
-        else:
-            createfunc = khmer.Countgraph
-    elif count and not graph:
-        if smallcount:
-            createfunc = khmer.SmallCounttable
-        else:
-            createfunc = khmer.Counttable
-    elif not count and graph:
-        createfunc = khmer.Nodegraph
-    elif not count and not graph:
-        createfunc = khmer.Nodetable
-
-    sketch = createfunc(ksize, target_tablesize, num_tables)
-    return sketch
+def to_gml(graph, outfilename, logfile=sys.stderr):
+    """Write the given read graph to a GML file."""
+    if not outfilename.endswith('.gml'):
+        print('[kevlar] WARNING: GML files usually need extension .gml',
+              file=logfile)
+    networkx.write_gml(graph, outfilename)
+    message = '[kevlar] graph written to {}'.format(args.gml)
+    print(message, file=logfile)
+
+
+def multi_file_iter_screed(filenames):
+    for filename in filenames:
+        for record in screed.open(filename):
+            yield record
+
+
+def multi_file_iter_khmer(filenames):
+    for filename in filenames:
+        for record in khmer.ReadParser(filename):
+            yield record
+
+
+def clean_subseqs(sequence, ksize):
+    for subseq in re.split('[^ACGT]', sequence):
+        if len(subseq) >= ksize:
+            yield subseq
 
 
 KmerOfInterest = namedtuple('KmerOfInterest', 'sequence offset abund')
diff --git a/kevlar/__main__.py b/kevlar/__main__.py
@@ -3,7 +3,7 @@
 # -----------------------------------------------------------------------------
 # Copyright (c) 2017 The Regents of the University of California
 #
-# This file is part of kevlar (http://github.com/standage/kevlar) and is
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
 # licensed under the MIT license: see LICENSE.
 # -----------------------------------------------------------------------------
 

diff --git a/kevlar/assemble.py b/kevlar/assemble.py
@@ -3,7 +3,7 @@
 # -----------------------------------------------------------------------------
 # Copyright (c) 2017 The Regents of the University of California
 #
-# This file is part of kevlar (http://github.com/standage/kevlar) and is
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
 # licensed under the MIT license: see LICENSE.
 # -----------------------------------------------------------------------------
 
@@ -124,14 +124,14 @@ def fetch_largest_overlapping_pair(graph, reads):
     )
 
 
-def assemble_with_greed(reads, kmers, graph, debugout=None):
+def assemble_with_greed(reads, kmers, graph, ccindex, debugout=None):
     """Find shortest common superstring using a greedy assembly algorithm."""
     count = 0
     while len(graph.edges()) > 0:
         count += 1
 
         pair = fetch_largest_overlapping_pair(graph, reads)
-        newname = 'contig{:d}'.format(count)
+        newname = 'contig{:d};cc={:d}'.format(count, ccindex)
         newrecord = merge_and_reannotate(pair, newname)
         if debugout:
             print('### DEBUG', pair.tail.name, pair.head.name, pair.offset,
@@ -176,31 +176,36 @@ def main(args):
     reads, kmers = load_reads_and_kmers(kevlar.open(args.augfastq, 'r'),
                                         debugout)
     inputreads = list(reads)
-    graph = kevlar.overlap.graph_init(reads, kmers, args.min_abund,
-                                      args.max_abund, debugout)
+    graph = kevlar.overlap.graph_init_strict(reads, kmers, args.min_abund,
+                                             args.max_abund, debugout)
     if args.gml:
         networkx.write_gml(graph, args.gml)
         message = '[kevlar::assemble] graph written to {}'.format(args.gml)
         print(message, file=args.logfile)
 
-    ccs = list(networkx.connected_components(graph))
-    assert len(ccs) == 1
-
-    assemble_with_greed(reads, kmers, graph, debugout)
+    num_ccs = networkx.number_connected_components(graph)
+    if num_ccs > 1:
+        message = 'assembly graph contains {:d}'.format(num_ccs)
+        message += ' connected components; designated in the output by cc=N'
+        print('[kevlar::assemble] WARNING:', message, file=args.logfile)
 
     contigcount = 0
     unassembledcount = 0
     outstream = kevlar.open(args.out, 'w')
-    for seqname in graph.nodes():
-        if seqname in inputreads:
-            unassembledcount += 1
-            continue
-        contigcount += 1
-        contigrecord = reads[seqname]
-        kevlar.print_augmented_fastq(contigrecord, outstream)
+    cc_stream = networkx.connected_component_subgraphs(graph)
+    for n, cc in enumerate(cc_stream, 1):
+        assemble_with_greed(reads, kmers, cc, n, debugout)
+        for seqname in cc.nodes():
+            if seqname in inputreads:
+                unassembledcount += 1
+                continue
+            contigcount += 1
+            contigrecord = reads[seqname]
+            kevlar.print_augmented_fastq(contigrecord, outstream)
 
     assembledcount = len(inputreads) - unassembledcount
     message = '[kevlar::assemble] assembled'
     message += ' {:d}/{:d} reads'.format(assembledcount, len(inputreads))
+    message += ' from {:d} connected component(s)'.format(num_ccs)
     message += ' into {:d} contig(s)'.format(contigcount)
     print(message, file=args.logfile)
diff --git a/kevlar/cli/__init__.py b/kevlar/cli/__init__.py
@@ -22,6 +22,7 @@
     'reaugment': kevlar.reaugment.main,
     'assemble': kevlar.assemble.main,
     'mutate': kevlar.mutate.main,
+    'partition': kevlar.partition.main,
 }
 
 
@@ -58,5 +59,6 @@ def parser():
     kevlar.reaugment.subparser(subparsers)
     kevlar.assemble.subparser(subparsers)
     kevlar.mutate.subparser(subparsers)
+    kevlar.partition.subparser(subparsers)
 
     return parser
diff --git a/kevlar/collect.py b/kevlar/collect.py
@@ -3,7 +3,7 @@
 # -----------------------------------------------------------------------------
 # Copyright (c) 2017 The Regents of the University of California
 #
-# This file is part of kevlar (http://github.com/standage/kevlar) and is
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
 # licensed under the MIT license: see LICENSE.
 # -----------------------------------------------------------------------------
 
@@ -94,7 +94,7 @@ def recalc_abund(filelist, ksize, memory, maxfpr=0.1, logfile=sys.stderr):
                     # Ignore the novel k-mers in the first pass
                     continue
 
-    fpr = kevlar.calc_fpr(countgraph)
+    fpr = kevlar.sketch.estimate_fpr(countgraph)
     message = '    {:d} reads'.format(reads_consumed)
     message += ' and {:d} k-mers consumed'.format(kmers_consumed)
     message += '; estimated false positive rate is {:1.3f}'.format(fpr)