Skip to content

Commit

Permalink
Merge pull request #85 from dib-lab/fix/overlap
Browse files Browse the repository at this point in the history
Fix issue with overlap/swap/orientation
  • Loading branch information
standage committed Jul 6, 2017
2 parents 87db6b2 + 457963e commit f1a90dc
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 23 deletions.
34 changes: 27 additions & 7 deletions kevlar/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,24 @@ def merge_pair(pair):
Given a pair of compatible overlapping reads, collapse and merge them into
a single sequence.
"""
tailseq = pair.tail.sequence
headseq = pair.head.sequence
offset = pair.offset
if pair.sameorient is False:
headseq = kevlar.revcom(pair.head.sequence)
headindex = len(pair.tail.sequence) - pair.offset
if headseq in pair.tail.sequence:
return pair.tail.sequence
if pair.swapped:
tailseq, headseq = headseq, tailseq
offset += len(tailseq) - len(headseq)

headindex = len(tailseq) - offset
headsuffix = headseq[headindex:]
tailprefix = pair.tail.sequence[pair.offset:pair.offset+pair.overlap]
tailprefix = tailseq[offset:offset+pair.overlap]
assert tailprefix == headseq[:headindex], \
'error: attempted to assemble incompatible reads'

if headseq in pair.tail.sequence:
return pair.tail.sequence
return pair.tail.sequence + headsuffix
return tailseq + headsuffix


def merge_and_reannotate(pair, newname):
Expand Down Expand Up @@ -99,6 +105,7 @@ def fetch_largest_overlapping_pair(graph, reads):
offset=graph[read1][read2]['offset'],
overlap=graph[read1][read2]['overlap'],
sameorient=graph[read1][read2]['orient'],
swapped=graph[read1][read2]['swapped'],
)


Expand Down Expand Up @@ -138,7 +145,8 @@ def assemble_with_greed(reads, kmers, graph, ccindex, debugout=None):
else:
graph.add_edge(tn, hn, offset=newpair.offset,
overlap=newpair.overlap, ikmer=kmerseq,
orient=newpair.sameorient, tail=tn)
orient=newpair.sameorient, tail=tn,
swapped=newpair.swapped)
kmers[kmerseq].add(newrecord.name)
reads[newrecord.name] = newrecord
graph.add_node(newrecord.name)
Expand All @@ -160,8 +168,20 @@ def main(args):

graph = kevlar.overlap.graph_init_strict(reads, kmers, args.min_abund,
args.max_abund, debugout)
message = 'initialized "shared interesting k-mers" graph'
message += ' with {:d} nodes'.format(graph.number_of_nodes())
message += ' and {:d} edges'.format(graph.number_of_edges())
# If number of nodes is less than number of reads, it's probably because
# some reads have no valid overlaps with other reads.
print('[kevlar::assemble]', message, file=args.logfile)

if args.gml:
networkx.write_gml(graph, args.gml)
tempgraph = graph.copy()
for n1, n2 in tempgraph.edges():
ikmerset = tempgraph[n1][n2]['ikmers']
ikmerstr = ','.join(ikmerset)
tempgraph[n1][n2]['ikmers'] = ikmerstr
networkx.write_gml(tempgraph, args.gml)
message = '[kevlar::assemble] graph written to {}'.format(args.gml)
print(message, file=args.logfile)

Expand Down
19 changes: 13 additions & 6 deletions kevlar/overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@


OverlappingReadPair = namedtuple('OverlappingReadPair',
'tail head offset overlap sameorient')
INCOMPATIBLE_PAIR = OverlappingReadPair(None, None, None, None, None)
'tail head offset overlap sameorient swapped')
INCOMPATIBLE_PAIR = OverlappingReadPair(None, None, None, None, None, None)


def print_read_pair(pair, position, outstream=sys.stderr):
Expand Down Expand Up @@ -100,14 +100,17 @@ def determine_relative_orientation(read1, read2, kmer1, kmer2):
return tail, head, offset, sameorient, tailpos


def validate_read_overlap(tail, head, offset, sameorient, minkmer):
def validate_read_overlap(tail, head, offset, sameorient, minkmer, swapped):
"""Verify that the overlap between two reads is identical."""
headseq = head.sequence if sameorient else kevlar.revcom(head.sequence)
seg2offset = len(head.sequence) - len(tail.sequence) + offset
if offset + len(headseq) <= len(tail.sequence):
segment1 = tail.sequence[offset:offset+len(headseq)]
segment2 = headseq
seg2offset = None
elif swapped:
segment1 = tail.sequence[:-offset]
segment2 = headseq[seg2offset:]
else:
segment1 = tail.sequence[offset:]
segment2 = headseq[:-seg2offset]
Expand Down Expand Up @@ -157,12 +160,15 @@ def calc_offset(read1, read2, minkmer, debugstream=None):
read1, read2, kmer1, kmer2
)

overlap = validate_read_overlap(tail, head, offset, sameorient, minkmer)
swapped = read1.name != tail.name and sameorient is False
overlap = validate_read_overlap(tail, head, offset, sameorient, minkmer,
swapped)
if overlap is None:
return INCOMPATIBLE_PAIR

pair = OverlappingReadPair(tail=tail, head=head, offset=offset,
overlap=overlap, sameorient=sameorient)
overlap=overlap, sameorient=sameorient,
swapped=swapped)
if debugstream:
print_read_pair(pair, tailpos, debugstream)

Expand All @@ -185,7 +191,8 @@ def graph_add_edge(graph, pair, minkmer):
else:
graph.add_edge(tailname, headname, offset=pair.offset,
overlap=pair.overlap, ikmers=set([minkmer]),
orient=pair.sameorient, tail=tailname)
orient=pair.sameorient, tail=tailname,
swapped=pair.swapped)


def graph_init_abund_check_pass(numreads, minkmer, minabund=5, maxabund=500,
Expand Down
23 changes: 13 additions & 10 deletions kevlar/tests/test_assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,11 @@ def test_merge_pair(record1, record2, record4):
ACGCAAAGCTATTTAAAACC
"""
pair = OverlappingReadPair(tail=record1, head=record2, offset=13,
overlap=7, sameorient=True)
overlap=7, sameorient=True, swapped=False)
assert merge_pair(pair) == 'GCTGCACCGATGTACGCAAAGCTATTTAAAACC'

pair = OverlappingReadPair(tail=record1, head=record4, offset=13,
overlap=7, sameorient=True)
overlap=7, sameorient=True, swapped=False)
with pytest.raises(AssertionError) as ae:
contig = merge_pair(pair)
assert 'attempted to assemble incompatible reads' in str(ae)
Expand All @@ -196,7 +196,7 @@ def test_merge_and_reannotate_same_orientation(record1, record2):
ACGCAAAGCTATTTAAAACC ***** *****
"""
pair = OverlappingReadPair(tail=record1, head=record2, offset=13,
overlap=7, sameorient=True)
overlap=7, sameorient=True, swapped=False)
newrecord = merge_and_reannotate(pair, 'contig1')
assert newrecord.name == 'contig1'
assert newrecord.sequence == 'GCTGCACCGATGTACGCAAAGCTATTTAAAACC'
Expand All @@ -214,7 +214,7 @@ def test_merge_and_reannotate_opposite_orientation(record1, record3):
ACGCAAAGCTATTTAAAACC ***** *****
"""
pair = OverlappingReadPair(tail=record1, head=record3, offset=13,
overlap=7, sameorient=False)
overlap=7, sameorient=False, swapped=False)
newrecord = merge_and_reannotate(pair, 'contig1')
assert newrecord.name == 'contig1'
assert newrecord.sequence == 'GCTGCACCGATGTACGCAAAGCTATTTAAAACC'
Expand All @@ -241,7 +241,7 @@ def test_merge_and_reannotate_edge_case_same_orientation(record7, record8):
|||||||||||||||||
""" # noqa
pair = OverlappingReadPair(tail=record7, head=record8, offset=39,
overlap=21, sameorient=True)
overlap=21, sameorient=True, swapped=False)
newrecord = merge_and_reannotate(pair, 'SoMeCoNtIg')
assert newrecord.name == 'SoMeCoNtIg'
assert newrecord.sequence == ('CAGGTCCCCACCCGGATACTTGAAGCAGGCAGCCTCAAGGTAT'
Expand Down Expand Up @@ -277,7 +277,7 @@ def test_merge_and_reannotate_edge_case_opposite_orientation(record7, record9):
|||||||||||||||||
""" # noqa
pair = OverlappingReadPair(tail=record7, head=record9, offset=39,
overlap=21, sameorient=False)
overlap=21, sameorient=False, swapped=False)
newrecord = merge_and_reannotate(pair, 'SoMeCoNtIg')
assert newrecord.name == 'SoMeCoNtIg'
assert newrecord.sequence == ('CAGGTCCCCACCCGGATACTTGAAGCAGGCAGCCTCAAGGTAT'
Expand Down Expand Up @@ -312,7 +312,7 @@ def test_merge_and_reannotate_contained(record7, record10):
|||||||||||||||||
"""
pair = OverlappingReadPair(tail=record7, head=record10, offset=0,
overlap=35, sameorient=True)
overlap=35, sameorient=True, swapped=False)
newrecord = merge_and_reannotate(pair, 'ContainedAtOne')
assert newrecord.name == 'ContainedAtOne'
assert newrecord.sequence == record7.sequence
Expand All @@ -334,7 +334,7 @@ def test_merge_and_reannotate_contained_with_offset(record7, record11):
|||||||||||||||||
"""
pair = OverlappingReadPair(tail=record7, head=record11, offset=10,
overlap=23, sameorient=True)
overlap=23, sameorient=True, swapped=False)
newrecord = merge_and_reannotate(pair, 'ContainedAtOffset')
assert newrecord.sequence == record7.sequence
assert newrecord.ikmers == record7.ikmers
Expand All @@ -358,7 +358,7 @@ def test_merge_contained_with_offset_and_error(record7, record12):
assert pair == INCOMPATIBLE_PAIR

pair = OverlappingReadPair(tail=record7, head=record12, offset=10,
overlap=23, sameorient=True)
overlap=23, sameorient=True, swapped=False)
with pytest.raises(AssertionError) as ae:
newrecord = merge_and_reannotate(pair, 'WontWork')
assert 'attempted to assemble incompatible reads' in str(ae)
Expand Down Expand Up @@ -412,7 +412,10 @@ def test_graph_init():
r37name = 'read37f start=9,mutations=0'
assert graph[r37name][r8name]['offset'] == 1
assert graph[r37name][r8name]['overlap'] == 99
pair = OverlappingReadPair(reads[r8name], reads[r37name], 1, 99, True)
pair = OverlappingReadPair(
tail=reads[r8name], head=reads[r37name], offset=1, overlap=99,
sameorient=True, swapped=False
)
assert merge_pair(pair) == ('CACTGTCCTTACAGGTGGATAGTCGCTTTGTAATAAAAGAGTTAC'
'ACCCCGGTTTTTAGAAGTCTCGACTTTAAGGAAGTGGGCCTACGG'
'CGGAAGCCGTC')
Expand Down
67 changes: 67 additions & 0 deletions kevlar/tests/test_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,54 @@ def record6():
)


@pytest.fixture
def picorecord1():
return screed.Record(
name='seq1_901350_901788_1:0:0_0:0:0_21ca1/2',
sequence=('GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATATTTTCTCATAGTTTTGAC'
'AAAAACAAAGGAATCCTTAGTTATTAAACTCGGGAGTTTGA'),
ikmers=[
KmerOfInterest('TTTTTTGTTTCCCAAAGTAAGGCTG', 5, [19, 0, 0]),
KmerOfInterest('TTTTTGTTTCCCAAAGTAAGGCTGA', 6, [18, 1, 0]),
KmerOfInterest('TTTTGTTTCCCAAAGTAAGGCTGAG', 7, [18, 1, 0]),
KmerOfInterest('TTTGTTTCCCAAAGTAAGGCTGAGT', 8, [18, 0, 0]),
KmerOfInterest('TTGTTTCCCAAAGTAAGGCTGAGTG', 9, [17, 0, 0]),
],
)


@pytest.fixture
def picorecord2():
return screed.Record(
name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
sequence=('TTACATTTATTCGTTTGTGCAGGCTGAGACCTCACTTCCAACTGTAATCCAAAAGCTTA'
'GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATA'),
ikmers=[
KmerOfInterest('TTTTTTGTTTCCCAAAGTAAGGCTG', 64, [19, 0, 0]),
KmerOfInterest('TTTTTGTTTCCCAAAGTAAGGCTGA', 65, [18, 1, 0]),
KmerOfInterest('TTTTGTTTCCCAAAGTAAGGCTGAG', 66, [18, 1, 0]),
KmerOfInterest('TTTGTTTCCCAAAGTAAGGCTGAGT', 67, [18, 0, 0]),
KmerOfInterest('TTGTTTCCCAAAGTAAGGCTGAGTG', 68, [17, 0, 0]),
],
)


@pytest.fixture
def picorecord3():
return screed.Record(
name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
sequence=('TATTGTTCACTCAGCCTTACTTTGGGAAACAAAAAAAAAACTAAGCTTTTGGATTACAG'
'TTGGAAGTGAGGTCTCAGCCTGCACAAACGAATAAATGTAA'),
ikmers=[
KmerOfInterest('CAGCCTTACTTTGGGAAACAAAAAA', 11, [17, 0, 0]),
KmerOfInterest('TCAGCCTTACTTTGGGAAACAAAAA', 10, [18, 0, 0]),
KmerOfInterest('CTCAGCCTTACTTTGGGAAACAAAA', 9, [18, 1, 0]),
KmerOfInterest('ACTCAGCCTTACTTTGGGAAACAAA', 8, [18, 1, 0]),
KmerOfInterest('CACTCAGCCTTACTTTGGGAAACAA', 7, [19, 0, 0]),
],
)


def test_print_read_pair_same_orient(record1, record2, capsys):
pair = kevlar.overlap.calc_offset(record1, record2, 'CGCAA')
print_read_pair(pair, 14, sys.stderr)
Expand Down Expand Up @@ -202,3 +250,22 @@ def test_calc_offset_weirdness(record5, record6):
for ikmer in ['CTGTCAA', 'TGTCAAG']:
pair = kevlar.overlap.calc_offset(record5, record6, ikmer)
assert pair == INCOMPATIBLE_PAIR


def test_pico_offset(picorecord1, picorecord2, picorecord3):
pair = kevlar.overlap.calc_offset(picorecord1, picorecord2,
'TTTTTTGTTTCCCAAAGTAAGGCTG')
assert pair.offset == 59
assert pair.head.name == 'seq1_901350_901788_1:0:0_0:0:0_21ca1/2'
assert pair.swapped is False
contig = kevlar.assemble.merge_pair(pair)
print(contig)

pair = kevlar.overlap.calc_offset(picorecord1, picorecord3,
'TTTTTTGTTTCCCAAAGTAAGGCTG')
assert pair.offset == 59
assert pair.head.name == 'seq1_901350_901788_1:0:0_0:0:0_21ca1/2'
assert pair.swapped is True
newcontig = kevlar.assemble.merge_pair(pair)
print(newcontig)
assert kevlar.same_seq(contig, newcontig)

0 comments on commit f1a90dc

Please sign in to comment.