Skip to content

Commit

Permalink
Merge pull request #80 from dib-lab/feature/partition
Browse files Browse the repository at this point in the history
Introduce "kevlar partition", fix assert bug
  • Loading branch information
standage committed May 31, 2017
2 parents 3130636 + e366fd5 commit 3375f8a
Show file tree
Hide file tree
Showing 35 changed files with 872 additions and 514 deletions.
142 changes: 45 additions & 97 deletions kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,35 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2016 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/standage/kevlar) and is
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

# Core libraries
from __future__ import print_function
try:
import __builtin__ as builtins
except: # pragma: no cover
except:
import builtins
from collections import namedtuple
from gzip import open as gzopen
import re
import sys

# Third-party libraries
import khmer
import screed

# Internal modules
from kevlar import seqio
from kevlar import overlap
from kevlar import counting
from kevlar import sketch
from kevlar.seqio import parse_augmented_fastq, print_augmented_fastq
from kevlar.variantset import VariantSet
from kevlar.timer import Timer

# Subcommands and command-line interface
from kevlar import dump
from kevlar import novel
from kevlar import collect
Expand All @@ -24,12 +40,8 @@
from kevlar import mutate
from kevlar import assemble
from kevlar import count
from kevlar import partition
from kevlar import cli
from kevlar.variantset import VariantSet
from kevlar.timer import Timer
from gzip import open as gzopen
import khmer
import screed

from kevlar._version import get_versions
__version__ = get_versions()['version']
Expand All @@ -49,54 +61,6 @@ def open(filename, mode):
return openfunc(filename, mode)


def sketch_autoload(infile, count=True, graph=False,
ksize=31, table_size=1e4, num_tables=4,
num_bands=0, band=0):
"""
Use file extension to conditionally load sketch into memory.
If the file extension is one of the following, treat the file as a sketch
that has been written to disk and load it with `kevlar.load_sketch`.
Sketch attributes such as ksize, table size, and number of tables will
be set automatically.
- `.ct` or `.counttable`: `Counttable`
- `.nt` or `.nodetable`: `Nodetable`
- `.cg` or `.countgraph`: `Countgraph`
- `.ng` or `.nodegraph`: `Nodegraph`
Otherwise, a sketch will be created using the specified arguments and the
input file will be treated as a Fasta/Fastq file to be loaded with
`.consume_seqfile` or `.consume_seqfile_banding`.
"""
sketch_extensions = (
'.ct', '.counttable', '.nt', '.nodetable',
'.cg', '.countgraph', '.ng', '.nodegraph',
)

if infile.endswith(sketch_extensions):
return load_sketch(infile, count=count, graph=graph, smallcount=False)
else:
sketch = allocate_sketch(ksize, table_size, num_tables, count=count,
graph=graph, smallcount=False)
if num_bands > 1:
sketch.consume_seqfile_banding(infile, num_bands, band)
else:
sketch.consume_seqfile(infile)
return sketch


def calc_fpr(table):
"""Stolen shamelessly from khmer/__init__.py"""
sizes = table.hashsizes()
n_ht = float(len(sizes))
occupancy = float(table.n_occupied())
min_size = min(sizes)
fp_one = occupancy / min_size
fp_all = fp_one ** n_ht
return fp_all


def revcom(seq):
return screed.dna.reverse_complement(str(seq))

Expand All @@ -113,48 +77,32 @@ def same_seq(seq1, seq2, seq2revcom=None):
return seq1 == seq2 or seq1 == seq2revcom


def load_sketch(filename, count=False, graph=False, smallcount=False):
"""Convenience function for loading a sketch from the specified file."""
if count and graph:
if smallcount:
createfunc = khmer._SmallCountgraph
else:
createfunc = khmer._Countgraph
elif count and not graph:
if smallcount:
createfunc = khmer._SmallCounttable
else:
createfunc = khmer._Counttable
elif not count and graph:
createfunc = khmer._Nodegraph
elif not count and not graph:
createfunc = khmer._Nodetable

sketch = createfunc(1, [1])
sketch.load(filename)
return sketch


def allocate_sketch(ksize, target_tablesize, num_tables=4, count=False,
graph=False, smallcount=False):
"""Convenience function for allocating memory for a new sketch."""
if count and graph:
if smallcount:
createfunc = khmer.SmallCountgraph
else:
createfunc = khmer.Countgraph
elif count and not graph:
if smallcount:
createfunc = khmer.SmallCounttable
else:
createfunc = khmer.Counttable
elif not count and graph:
createfunc = khmer.Nodegraph
elif not count and not graph:
createfunc = khmer.Nodetable

sketch = createfunc(ksize, target_tablesize, num_tables)
return sketch
def to_gml(graph, outfilename, logfile=sys.stderr):
"""Write the given read graph to a GML file."""
if not outfilename.endswith('.gml'):
print('[kevlar] WARNING: GML files usually need extension .gml',
file=logfile)
networkx.write_gml(graph, outfilename)
message = '[kevlar] graph written to {}'.format(args.gml)
print(message, file=logfile)


def multi_file_iter_screed(filenames):
for filename in filenames:
for record in screed.open(filename):
yield record


def multi_file_iter_khmer(filenames):
for filename in filenames:
for record in khmer.ReadParser(filename):
yield record


def clean_subseqs(sequence, ksize):
for subseq in re.split('[^ACGT]', sequence):
if len(subseq) >= ksize:
yield subseq


KmerOfInterest = namedtuple('KmerOfInterest', 'sequence offset abund')
2 changes: 1 addition & 1 deletion kevlar/__main__.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2017 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/standage/kevlar) and is
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

Expand Down
37 changes: 21 additions & 16 deletions kevlar/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2017 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/standage/kevlar) and is
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

Expand Down Expand Up @@ -124,14 +124,14 @@ def fetch_largest_overlapping_pair(graph, reads):
)


def assemble_with_greed(reads, kmers, graph, debugout=None):
def assemble_with_greed(reads, kmers, graph, ccindex, debugout=None):
"""Find shortest common superstring using a greedy assembly algorithm."""
count = 0
while len(graph.edges()) > 0:
count += 1

pair = fetch_largest_overlapping_pair(graph, reads)
newname = 'contig{:d}'.format(count)
newname = 'contig{:d};cc={:d}'.format(count, ccindex)
newrecord = merge_and_reannotate(pair, newname)
if debugout:
print('### DEBUG', pair.tail.name, pair.head.name, pair.offset,
Expand Down Expand Up @@ -176,31 +176,36 @@ def main(args):
reads, kmers = load_reads_and_kmers(kevlar.open(args.augfastq, 'r'),
debugout)
inputreads = list(reads)
graph = kevlar.overlap.graph_init(reads, kmers, args.min_abund,
args.max_abund, debugout)
graph = kevlar.overlap.graph_init_strict(reads, kmers, args.min_abund,
args.max_abund, debugout)
if args.gml:
networkx.write_gml(graph, args.gml)
message = '[kevlar::assemble] graph written to {}'.format(args.gml)
print(message, file=args.logfile)

ccs = list(networkx.connected_components(graph))
assert len(ccs) == 1

assemble_with_greed(reads, kmers, graph, debugout)
num_ccs = networkx.number_connected_components(graph)
if num_ccs > 1:
message = 'assembly graph contains {:d}'.format(num_ccs)
message += ' connected components; designated in the output by cc=N'
print('[kevlar::assemble] WARNING:', message, file=args.logfile)

contigcount = 0
unassembledcount = 0
outstream = kevlar.open(args.out, 'w')
for seqname in graph.nodes():
if seqname in inputreads:
unassembledcount += 1
continue
contigcount += 1
contigrecord = reads[seqname]
kevlar.print_augmented_fastq(contigrecord, outstream)
cc_stream = networkx.connected_component_subgraphs(graph)
for n, cc in enumerate(cc_stream, 1):
assemble_with_greed(reads, kmers, cc, n, debugout)
for seqname in cc.nodes():
if seqname in inputreads:
unassembledcount += 1
continue
contigcount += 1
contigrecord = reads[seqname]
kevlar.print_augmented_fastq(contigrecord, outstream)

assembledcount = len(inputreads) - unassembledcount
message = '[kevlar::assemble] assembled'
message += ' {:d}/{:d} reads'.format(assembledcount, len(inputreads))
message += ' from {:d} connected component(s)'.format(num_ccs)
message += ' into {:d} contig(s)'.format(contigcount)
print(message, file=args.logfile)
2 changes: 2 additions & 0 deletions kevlar/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
'reaugment': kevlar.reaugment.main,
'assemble': kevlar.assemble.main,
'mutate': kevlar.mutate.main,
'partition': kevlar.partition.main,
}


Expand Down Expand Up @@ -58,5 +59,6 @@ def parser():
kevlar.reaugment.subparser(subparsers)
kevlar.assemble.subparser(subparsers)
kevlar.mutate.subparser(subparsers)
kevlar.partition.subparser(subparsers)

return parser
4 changes: 2 additions & 2 deletions kevlar/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2017 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/standage/kevlar) and is
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

Expand Down Expand Up @@ -94,7 +94,7 @@ def recalc_abund(filelist, ksize, memory, maxfpr=0.1, logfile=sys.stderr):
# Ignore the novel k-mers in the first pass
continue

fpr = kevlar.calc_fpr(countgraph)
fpr = kevlar.sketch.estimate_fpr(countgraph)
message = ' {:d} reads'.format(reads_consumed)
message += ' and {:d} k-mers consumed'.format(kmers_consumed)
message += '; estimated false positive rate is {:1.3f}'.format(fpr)
Expand Down

0 comments on commit 3375f8a

Please sign in to comment.