Skip to content

Commit

Permalink
Merge pull request #214 from dib-lab/feature/asmbl-2step
Browse files Browse the repository at this point in the history
Assemble with greedy assembler if fermi-lite fails
  • Loading branch information
standage committed Mar 13, 2018
2 parents c96a0d2 + 869521f commit d3dd400
Show file tree
Hide file tree
Showing 16 changed files with 1,391 additions and 60 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@ language: python
addons:
apt:
packages:
- bwa
- cloc
- git
branches:
only: # don't build "pushes" except on the master branch
- master
cache: pip
python:
- 3.5
- 3.6
before_install:
- git clone https://github.com/lh3/bwa.git && cd bwa && make -j 2 && sudo cp bwa /usr/local/bin && cd -
install:
- make devenv
- pip install wheel
Expand Down
15 changes: 12 additions & 3 deletions kevlar/alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import re
import sys
import kevlar
from kevlar.assemble import assemble_greedy, assemble_fml_asm
Expand All @@ -16,7 +17,7 @@

def alac(pstream, refrfile, ksize=31, bigpart=10000, delta=25, seedsize=31,
maxdiff=10000, match=1, mismatch=2, gapopen=5, gapextend=0,
greedy=False, min_ikmers=None, logstream=sys.stderr):
greedy=False, fallback=True, min_ikmers=None, logstream=sys.stderr):
assembler = assemble_greedy if greedy else assemble_fml_asm
for partition in pstream:
reads = list(partition)
Expand All @@ -27,6 +28,14 @@ def alac(pstream, refrfile, ksize=31, bigpart=10000, delta=25, seedsize=31,

# Assemble partitioned reads into contig(s)
contigs = list(assembler(reads, logstream=logstream))
if len(contigs) == 0 and assembler == assemble_fml_asm and fallback:
message = 'WARNING: no contig assembled by fermi-lite'
ccmatch = re.search('kvcc=(\d+)', reads[0].name)
if ccmatch:
message += ' for CC={:s}'.format(ccmatch.group(1))
message += '; attempting again with home-grown greedy assembler'
print('[kevlar::alac]', message, file=logstream)
contigs = list(assemble_greedy(reads, logstream=logstream))
if min_ikmers is not None:
# Apply min ikmer filter if it's set
contigs = [c for c in contigs if len(c.ikmers) >= min_ikmers]
Expand Down Expand Up @@ -58,8 +67,8 @@ def main(args):
pstream, args.refr, ksize=args.ksize, bigpart=args.bigpart,
delta=args.delta, seedsize=args.seed_size, maxdiff=args.max_diff,
match=args.match, mismatch=args.mismatch, gapopen=args.open,
gapextend=args.extend, greedy=args.greedy, min_ikmers=args.min_ikmers,
logstream=args.logfile
gapextend=args.extend, greedy=args.greedy, fallback=args.fallback,
min_ikmers=args.min_ikmers, logstream=args.logfile
)

for varcall in workflow:
Expand Down
73 changes: 36 additions & 37 deletions kevlar/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,6 @@ def main_jca(args):
# Greedy assembly mode
# =============================================================================

class KevlarEdgelessGraphError(ValueError):
"""Raised if shared k-mer graph has no edges."""
pass


def merge_pair(pair):
"""
Expand Down Expand Up @@ -216,7 +212,7 @@ def assemble_with_greed(graph, ccindex, debugout=None):
pair = fetch_largest_overlapping_pair(graph)
newname = 'contig{:d}:cc={:d}'.format(count, ccindex)
newrecord = merge_and_reannotate(pair, newname)
if debugout:
if debugout: # pragma: no cover
print('### DEBUG', pair.tail.name, pair.head.name, pair.offset,
pair.overlap, pair.sameorient, file=debugout)
kevlar.print_augmented_fastx(newrecord, debugout)
Expand Down Expand Up @@ -273,39 +269,29 @@ def prune_graph(graph, quant=0.1):
return len(edges_to_drop)


def assemble_greedy(readstream, gmlfilename=None, debug=False,
logstream=sys.stderr):
def assemble_greedy(readstream, compat=0.6, debug=False, logstream=sys.stderr):
debugout = None
if debug:
if debug: # pragma: no cover
debugout = logstream

graph = kevlar.ReadGraph()
graph.load(readstream)
inputreads = set(graph.nodes())
message = 'loaded {:d} reads'.format(graph.number_of_nodes())
orignumnodes = graph.number_of_nodes()
message = 'loaded {:d} reads'.format(orignumnodes)
message += ' and {:d} interesting k-mers'.format(len(graph.ikmers))
print('[kevlar::assemble::default]', message, file=logstream)
print('[kevlar::assemble::greedy]', message, file=logstream)

graph.populate_edges(strict=True)
message = 'populated "shared interesting k-mers" graph'
message += ' with {:d} edges'.format(graph.number_of_edges())
# If number of nodes is less than number of reads, it's probably because
# some reads have no valid overlaps with other reads.
print('[kevlar::assemble::default]', message, file=logstream)
print('[kevlar::assemble::greedy]', message, file=logstream)

if graph.number_of_edges() == 0:
message = 'nothing to be done, aborting'
raise KevlarEdgelessGraphError(message)

if gmlfilename:
tempgraph = graph.copy()
for n1, n2 in tempgraph.edges():
ikmerset = tempgraph[n1][n2]['ikmers']
ikmerstr = ','.join(ikmerset)
tempgraph[n1][n2]['ikmers'] = ikmerstr
networkx.write_gml(tempgraph, gmlfilename)
message = 'graph written to {:s}'.format(gmlfilename)
print('[kevlar::assemble::default]', message, file=logstream)
print('[kevlar::assemble::greedy] nothing to be done', file=logstream)
return

edges_dropped = prune_graph(graph)
cc_stream = networkx.connected_component_subgraphs(graph, copy=False)
Expand All @@ -315,40 +301,53 @@ def assemble_greedy(readstream, gmlfilename=None, debug=False,
message += ', graph now has {:d} connected component(s)'.format(len(ccs))
message += ', {:d} nodes'.format(ccnodes)
message += ', and {:d} edges'.format(graph.number_of_edges())
print('[kevlar::assemble::default]', message, file=logstream)
print('[kevlar::assemble::greedy]', message, file=logstream)
if ccnodes / orignumnodes < compat:
msg = 'only {:d} of {:d} reads '.format(ccnodes, orignumnodes)
msg += 'have compatible overlaps; discarding'
print('[kevlar::assemble::greedy]', msg, file=logstream)
return
if len(ccs) > 1:
message = 'multiple connected components designated by cc=N in output'
print('[kevlar::assemble::default] WARNING:', message, file=logstream)
print('[kevlar::assemble::greedy] WARNING:', message, file=logstream)

contigcount = 0
contigs2report = list()
unassembledcount = 0
for n, cc in enumerate(ccs, 1):
cc = graph.full_cc(cc)
ccreads = list()
for readname in cc.nodes():
if readname in inputreads:
ccreads.append(graph.get_record(readname))
assemble_with_greed(cc, n, debugout)
for seqname in cc.nodes():
if seqname in inputreads:
unassembledcount += 1
continue
contigcount += 1
contigrecord = cc.get_record(seqname)
yield contigrecord
contig = next(kevlar.augment.augment(ccreads, [contigrecord]))
contigs2report.append(contig)

assembledcount = ccnodes - unassembledcount
message = '[kevlar::assemble] assembled'
message += ' {:d}/{:d} reads'.format(assembledcount, ccnodes)
message = 'assembled {:d}/{:d} reads'.format(assembledcount, ccnodes)
message += ' from {:d} connected component(s)'.format(len(ccs))
message += ' into {:d} contig(s)'.format(contigcount)
print(message, file=logstream)
message += ' into {:d} contig(s)'.format(len(contigs2report))
print('[kevlar::assemble::greedy]', message, file=logstream)
if assembledcount / ccnodes < compat:
message = 'too few reads assembled; discarding'
print('[kevlar::assemble::greedy]', message, file=logstream)
return

for contig in contigs2report:
yield contig


def main_greedy(args):
readstream = kevlar.parse_augmented_fastx(kevlar.open(args.augfastq, 'r'))
outstream = None # Only create output file if there are contigs
contigstream = assemble_greedy(readstream, args.gml, args.debug,
args.logfile)
outstream = kevlar.open(args.out, 'w')
contigstream = assemble_greedy(readstream, debug=args.debug,
logstream=args.logfile)
for contig in contigstream:
if outstream is None:
outstream = kevlar.open(args.out, 'w')
kevlar.print_augmented_fastx(contig, outstream)


Expand Down
28 changes: 22 additions & 6 deletions kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def __init__(self, seqid, pos, refr, alt, **kwargs):
self._alt = alt
self.info = dict()
for key, value in kwargs.items():
self.info[key] = value
self.annotate(key, value)

@property
def seqid(self):
Expand Down Expand Up @@ -172,10 +172,23 @@ def refrwindow(self):
"""Similar to `window`, but encapsulating the reference allele."""
return self.attribute('RW')

def annotate(self, key, value):
if key in self.info:
if isinstance(self.info[key], set):
self.info[key].add(value)
else:
oldvalue = self.info[key]
self.info[key] = set((oldvalue, value))
else:
self.info[key] = value

def attribute(self, key, pair=False):
if key not in self.info:
return None
value = self.info[key].replace(';', ':')
value = self.info[key]
if isinstance(value, set):
value = ','.join(sorted(value))
value = value.replace(';', ':')
if pair:
keyvaluepair = '{:s}={:s}'.format(key, value)
return keyvaluepair
Expand Down Expand Up @@ -352,6 +365,8 @@ def pointdist(point):
continue
d = pointdist(pos)
distances.append(d)
if len(distances) == 0:
return float('Inf')
return sum(distances) / len(distances)


Expand Down Expand Up @@ -439,16 +454,17 @@ def call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
if len(aligns2report) > 1:
if refrfile and len(query.mateseqs) > 0:
mate_pos = list(align_mates(query, refrfile))
for aln in aligns2report:
aln.matedist = mate_distance(mate_pos, aln.interval)
aligns2report.sort(key=lambda aln: aln.matedist)
if len(mate_pos) > 0:
for aln in aligns2report:
aln.matedist = mate_distance(mate_pos, aln.interval)
aligns2report.sort(key=lambda aln: aln.matedist)

for n, alignment in enumerate(aligns2report):
for varcall in alignment.call_variants(ksize):
if alignment.matedist:
varcall.info['MD'] = '{:.2f}'.format(alignment.matedist)
if n > 0:
varcall.info['NC'] = 'matefail'
varcall.annotate('NC', 'matefail')
yield varcall


Expand Down
24 changes: 16 additions & 8 deletions kevlar/cli/alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,22 @@ def subparser(subparsers):
subparser = subparsers.add_parser('alac', description=desc, add_help=False)
subparser._positionals.title = 'Required inputs'

asmbl_args = subparser.add_argument_group('Read assembly')
asmbl_args.add_argument('-p', '--part-id', type=str, metavar='ID',
help='only process partition "PID" in the input')
asmbl_args.add_argument('--bigpart', type=int, metavar='N', default=10000,
help='do not attempt to assembly any partitions '
'with more than N reads (default: 10000)')
asmbl_args.add_argument('--greedy', action='store_true', help='Use a home-'
'grown greedy assembly algorithm instead of the '
'default fermi-lite algorithm')
asmbl_args.add_argument('--no-fallback', action='store_false',
dest='fallback', help='by default, if fermi-lite '
'fails to assemble a contig, `kevlar alac` will '
'attempt to assemble reads with a home-grown '
'greedy assembler; use this flag to deactivate '
'this behavior')

local_args = subparser.add_argument_group('Target extraction')
local_args.add_argument('-z', '--seed-size', type=int, default=31,
metavar='Z', help='seed size; default is 31')
Expand All @@ -43,19 +59,11 @@ def subparser(subparsers):
misc_args = subparser.add_argument_group('Miscellaneous settings')
misc_args.add_argument('-h', '--help', action='help',
help='show this help message and exit')
misc_args.add_argument('-p', '--part-id', type=str, metavar='ID',
help='only process partition "PID" in the input')
misc_args.add_argument('--bigpart', type=int, metavar='N', default=10000,
help='do not process any partitions with more than '
'N reads (default: 10000)')
misc_args.add_argument('-o', '--out', metavar='FILE',
help='output file; default is terminal (stdout)')
misc_args.add_argument('-i', '--min-ikmers', metavar='I', type=int,
default=None, help='do not report calls that a '
'supported by fewer than `I` interesting k-mers')
misc_args.add_argument('--greedy', action='store_true', help='Use the '
'greedy assembly algorithm instead of the default '
'fermi-lite algorithm')
misc_args.add_argument('-k', '--ksize', type=int, default=31, metavar='K',
help='k-mer size; default is 31')
subparser.add_argument('infile', help='partitioned reads in augmented '
Expand Down
1 change: 1 addition & 0 deletions kevlar/tests/data/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pico-trio-refr.fa.gz.*
fiveparts-refr.fa.gz.*
inf-mate-dist/*.genome.fa.gz.*

0 comments on commit d3dd400

Please sign in to comment.