Skip to content

Commit

Permalink
Merge pull request #79 from dib-lab/class/overlap
Browse files Browse the repository at this point in the history
Refactoring overlap code into its own module
  • Loading branch information
standage committed May 20, 2017
2 parents a3a5414 + e99ad12 commit acbefd1
Show file tree
Hide file tree
Showing 6 changed files with 539 additions and 267 deletions.
1 change: 1 addition & 0 deletions kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from collections import namedtuple
import sys
from kevlar import seqio
from kevlar import overlap
from kevlar.seqio import parse_augmented_fastq, print_augmented_fastq
from kevlar import dump
from kevlar import novel
Expand Down
259 changes: 66 additions & 193 deletions kevlar/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@
import screed
import khmer
import kevlar


OverlappingReadPair = namedtuple('OverlappingReadPair',
'tail head offset overlap sameorient')
IncompatiblePair = OverlappingReadPair(None, None, None, None, None)
from kevlar.seqio import load_reads_and_kmers


def subparser(subparsers):
Expand All @@ -34,95 +30,16 @@ def subparser(subparsers):
help='output file; default is terminal (stdout)')
subparser.add_argument('--gml', metavar='FILE',
help='write graph to .gml file')
subparser.add_argument('-n', '--min-abund', type=int, metavar='N',
default=2, help='discard interesting k-mers that '
'occur fewer than N times')
subparser.add_argument('-x', '--max-abund', type=int, metavar='X',
default=500, help='discard interesting k-mers that '
'occur more than X times')
subparser.add_argument('augfastq', help='annotated reads in augmented '
'Fastq format')


def print_read_pair(read1, pos1, read2, pos2, ksize, offset, overlap,
sameorient, outstream):
seq2 = read2.sequence if sameorient else kevlar.revcom(read2.sequence)
print('\nDEBUG shared interesting kmer: ', read1.name,
' --(overlap={:d})'.format(overlap),
'(offset={:d})({})--> '.format(offset, sameorient), read2.name, '\n',
read1.sequence, '\n', ' ' * pos1, '|' * ksize, '\n', ' ' * offset,
seq2, '\n', sep='', file=outstream)


def calc_offset(read1, read2, minkmer, debugstream=None):
"""
Calculate offset between reads that share an interesting k-mer.
Each read is annotated with its associated interesting k-mers. These are
used to bait pairs of reads sharing interesting k-mers to build a graph of
shared interesting k-mers. Given a pair of reads sharing an interesting,
k-mer calculate the offset between them and determine whether they are in
the same orientation. Any mismatches between the aligned reads (outside the
shared k-mer) will render the offset invalid.
"""
maxkmer = kevlar.revcom(minkmer)
kmer1 = [k for k in read1.ikmers
if kevlar.same_seq(k.sequence, minkmer, maxkmer)][0]
kmer2 = [k for k in read2.ikmers
if kevlar.same_seq(k.sequence, minkmer, maxkmer)][0]
ksize = len(kmer1.sequence)

pos1 = kmer1.offset
pos2 = kmer2.offset
sameorient = True
if kmer1.sequence != kmer2.sequence:
assert kmer1.sequence == kevlar.revcom(kmer2.sequence)
sameorient = False
pos2 = len(read2.sequence) - (kmer2.offset + ksize)

tail, head = read1, read2
tailpos, headpos = pos1, pos2
read1contained = pos1 == pos2 and len(read2.sequence) > len(read1.sequence)
if pos2 > pos1 or read1contained:
tail, head = read2, read1
tailpos, headpos = headpos, tailpos
offset = tailpos - headpos

headseq = head.sequence if sameorient else kevlar.revcom(head.sequence)
seg2offset = len(head.sequence) - len(tail.sequence) + offset
if offset + len(headseq) <= len(tail.sequence):
segment1 = tail.sequence[offset:offset+len(headseq)]
segment2 = headseq
seg2offset = None
else:
segment1 = tail.sequence[offset:]
segment2 = headseq[:-seg2offset]

overlap1 = len(segment1)
overlap2 = len(segment2)
if overlap1 != overlap2:
print(
'DEBUG '
'tail="{tail}" head="{head}" offset={offset} altoffset={altoffset}'
' tailoverlap={overlap} headoverlap={headover} tailolvp={tailseq}'
' headolvp={headseq} kmer={minkmer},{maxkmer} tailseq={tailread}'
' headseq={headread}'.format(
tail=read1.name, head=read2.name, offset=offset,
altoffset=seg2offset, overlap=overlap1,
headover=len(segment2), tailseq=segment1, headseq=segment2,
minkmer=minkmer, maxkmer=maxkmer, tailread=tail.sequence,
headread=head.sequence
), file=sys.stderr
)
assert overlap1 == overlap2
if segment1 != segment2:
return IncompatiblePair

if debugstream:
print_read_pair(tail, tailpos, head, headpos, ksize, offset, overlap1,
sameorient, debugstream)

return OverlappingReadPair(tail=tail, head=head, offset=offset,
overlap=overlap1, sameorient=sameorient)


def merge_pair(pair):
"""
Assemble a pair of overlapping reads.
Expand Down Expand Up @@ -176,129 +93,64 @@ def merge_and_reannotate(pair, newname):
return newrecord


def load_reads(instream, logstream=None):
def fetch_largest_overlapping_pair(graph, reads):
"""
Load reads into lookup tables for convenient access.
Grab the edge with the largest overlap in the graph.
The first table is a dictionary of reads indexed by read name, and the
second table is a dictionary of read sets indexed by an interesting k-mer.
Sort the edges using 3 criteria. The first is the primary criterion, the
other two ensure deterministic behavior.
- overlap (largest first)
- lexicographically smaller read name
- lexicographically larger read name
"""
reads = dict()
kmers = defaultdict(set)
for n, record in enumerate(kevlar.parse_augmented_fastq(instream), 1):
if logstream and n % 10000 == 0:
print('[kevlar::assemble] loaded {:d} reads'.format(n),
file=logstream)
reads[record.name] = record
for kmer in record.ikmers:
kmerseq = kevlar.revcommin(kmer.sequence)
kmers[kmerseq].add(record.name)
return reads, kmers


def graph_init(reads, kmers, maxabund=500, logstream=None):
graph = networkx.Graph()
nkmers = len(kmers)
for n, minkmer in enumerate(kmers, 1):
if n % 100 == 0:
msg = 'processed {:d}/{:d} shared novel k-mers'.format(n, nkmers)
print('[kevlar::assemble] ', msg, sep='', file=logstream)
readnames = kmers[minkmer]
if maxabund and len(readnames) > maxabund:
msg = ' skipping k-mer with abundance {:d}'.format(
len(readnames)
)
print(msg, file=logstream)
continue
if len(readnames) < 2:
message = '[kevlar::assemble] WARNING: k-mer {}'.format(minkmer)
message += ' (rev. comp. {})'.format(kevlar.revcom(minkmer))
message += ' only has abundance {:d}'.format(len(readnames))
out = logstream if logstream is not None else sys.stderr
print(message, file=out)
continue
readset = [reads[rn] for rn in readnames]
for read1, read2 in itertools.combinations(readset, 2):
pair = calc_offset(read1, read2, minkmer, logstream)
if pair is IncompatiblePair: # Shared k-mer but bad overlap
continue
tailname, headname = pair.tail.name, pair.head.name
if tailname in graph and headname in graph[tailname]:
assert graph[tailname][headname]['offset'] == pair.offset
if graph[tailname][headname]['tail'] == tailname:
assert graph[tailname][headname]['overlap'] == pair.overlap
else:
graph.add_edge(tailname, headname, offset=pair.offset,
overlap=pair.overlap, ikmer=minkmer,
orient=pair.sameorient, tail=tailname)
return graph


def assemble_with_greed(reads, kmers, graph):
pass


def main(args):
debugout = None
if args.debug:
debugout = args.logfile

reads, kmers = load_reads(kevlar.open(args.augfastq, 'r'), debugout)
inputreads = list(reads.keys())
graph = graph_init(reads, kmers, args.max_abund, debugout)
if args.gml:
networkx.write_gml(graph, args.gml)
message = '[kevlar::assemble] graph written to {}'.format(args.gml)
print(message, file=args.logfile)

ccs = list(networkx.connected_components(graph))
assert len(ccs) == 1

edges = sorted(
graph.edges(),
reverse=True,
key=lambda e: (
graph[e[0]][e[1]]['overlap'],
max(e),
min(e),
)
)
read1, read2 = edges[0] # biggest overlap (greedy algorithm)
if read2 == graph[read1][read2]['tail']:
read1, read2 = read2, read1
return kevlar.overlap.OverlappingReadPair(
tail=reads[read1],
head=reads[read2],
offset=graph[read1][read2]['offset'],
overlap=graph[read1][read2]['overlap'],
sameorient=graph[read1][read2]['orient'],
)


def assemble_with_greed(reads, kmers, graph, debugout=None):
"""Find shortest common superstring using a greedy assembly algorithm."""
count = 0
while len(graph.edges()) > 0:
count += 1
# Sort the edges using 3 criteria. The first is the primary criterion,
# the other two ensure deterministic behavior.
# - overlap (largest first)
# - lexicographically smaller read name
# - lexicographically larger read name
edges = sorted(
graph.edges(),
reverse=True,
key=lambda e: (
graph[e[0]][e[1]]['overlap'],
max(e),
min(e),
)
)
read1, read2 = edges[0] # biggest overlap (greedy algorithm)
if read2 == graph[read1][read2]['tail']:
read1, read2 = read2, read1
pair = OverlappingReadPair(
tail=reads[read1],
head=reads[read2],
offset=graph[read1][read2]['offset'],
overlap=graph[read1][read2]['overlap'],
sameorient=graph[read1][read2]['orient'],
)

pair = fetch_largest_overlapping_pair(graph, reads)
newname = 'contig{:d}'.format(count)
newrecord = merge_and_reannotate(pair, newname)
if debugout:
print('### DEBUG', read1, read2, pair.offset, pair.overlap,
pair.sameorient, file=debugout)
print('### DEBUG', pair.tail.name, pair.head.name, pair.offset,
pair.overlap, pair.sameorient, file=debugout)
kevlar.print_augmented_fastq(newrecord, debugout)
for kmer in newrecord.ikmers:
kmerseq = kevlar.revcommin(kmer.sequence)
for readname in kmers[kmerseq]:
already_merged = readname not in graph
current_contig = readname in [read1, read2, newname]
current_contig = readname in [
pair.tail.name, pair.head.name, newname
]
if already_merged or current_contig:
continue
otherrecord = reads[readname]
newpair = calc_offset(
newpair = kevlar.overlap.calc_offset(
newrecord, otherrecord, kmerseq, debugout
)
if newpair == IncompatiblePair:
if newpair == kevlar.overlap.INCOMPATIBLE_PAIR:
continue
tn, hn = newpair.tail.name, newpair.head.name
if tn in graph and hn in graph[tn]:
Expand All @@ -312,8 +164,29 @@ def main(args):
kmers[kmerseq].add(newrecord.name)
reads[newrecord.name] = newrecord
graph.add_node(newrecord.name)
graph.remove_node(read1)
graph.remove_node(read2)
graph.remove_node(pair.tail.name)
graph.remove_node(pair.head.name)


def main(args):
debugout = None
if args.debug:
debugout = args.logfile

reads, kmers = load_reads_and_kmers(kevlar.open(args.augfastq, 'r'),
debugout)
inputreads = list(reads)
graph = kevlar.overlap.graph_init(reads, kmers, args.min_abund,
args.max_abund, debugout)
if args.gml:
networkx.write_gml(graph, args.gml)
message = '[kevlar::assemble] graph written to {}'.format(args.gml)
print(message, file=args.logfile)

ccs = list(networkx.connected_components(graph))
assert len(ccs) == 1

assemble_with_greed(reads, kmers, graph, debugout)

contigcount = 0
unassembledcount = 0
Expand Down

0 comments on commit acbefd1

Please sign in to comment.