Skip to content

Commit

Permalink
Merge pull request #255 from dib-lab/bwa-intervals
Browse files Browse the repository at this point in the history
Improved mate distance calculations
  • Loading branch information
standage committed Apr 26, 2018
2 parents 9965925 + 1993b05 commit 4ff6cc2
Show file tree
Hide file tree
Showing 17 changed files with 78 additions and 64 deletions.
23 changes: 12 additions & 11 deletions kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from collections import defaultdict
import sys
import kevlar
from kevlar.reference import bwa_align
from kevlar.varmap import VariantMapping
from kevlar.vcf import VariantFilter as vf

Expand All @@ -20,26 +21,24 @@ def align_mates(record, refrfile):
fasta += '>mateseq{:d}\n{:s}\n'.format(n, mateseq)
cmd = 'bwa mem {:s} -'.format(refrfile)
cmdargs = cmd.split()
for seqid, pos in kevlar.reference.bwa_align(cmdargs, seqstring=fasta):
yield seqid, pos
for seqid, start, end in bwa_align(cmdargs, seqstring=fasta):
yield seqid, start, end


def mate_distance(mate_positions, gdna_position):
gdnaseq, startpos, endpos = gdna_position

def pointdist(point):
if point < startpos:
return startpos - point
elif point > endpos:
return point - endpos
else:
return 0
def intvldist(istart, iend):
x, y = sorted(((startpos, endpos), (istart, iend)))
if x[0] <= x[1] < y[0]:
return y[0] - x[1]
return 0

distances = list()
for seqid, pos in mate_positions:
for seqid, start, end in mate_positions:
if seqid != gdnaseq:
continue
d = pointdist(pos)
d = intvldist(start, end)
if d < 10000:
distances.append(d)
if len(distances) == 0:
Expand Down Expand Up @@ -112,6 +111,8 @@ def prelim_call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
if alignment.matedist:
avgdistance = '{:.2f}'.format(alignment.matedist)
varcall.annotate('MATEDIST', avgdistance)
if n > 0:
varcall.filter(vf.MateFail)
yield varcall


Expand Down
8 changes: 4 additions & 4 deletions kevlar/localize.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ def get_exact_matches(contigstream, bwaindexfile, seedsize=31):
k=seedsize, idx=bwaindexfile
)
cmdargs = cmd.split(' ')
for seqid, pos in bwa_align(cmdargs, seqstring=kmers):
yield seqid, pos
for seqid, start, end in bwa_align(cmdargs, seqstring=kmers):
yield seqid, start, end


def localize(contigstream, refrfile, seedsize=31, delta=50, maxdiff=None,
Expand All @@ -126,8 +126,8 @@ def localize(contigstream, refrfile, seedsize=31, delta=50, maxdiff=None,
autoindex(refrfile, logstream)
contigs = list(contigstream)
localizer = Localizer(seedsize, delta)
for seqid, pos in get_exact_matches(contigs, refrfile, seedsize):
localizer.add_seed_match(seqid, pos)
for seqid, start, end in get_exact_matches(contigs, refrfile, seedsize):
localizer.add_seed_match(seqid, start)
if len(localizer) == 0:
message = 'WARNING: no reference matches'
print('[kevlar::localize]', message, file=logstream)
Expand Down
7 changes: 1 addition & 6 deletions kevlar/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,7 @@ def bwa_align(cmdargs, seqstring):
if record.is_unmapped:
continue
seqid = sam.get_reference_name(record.reference_id)
yield seqid, record.pos
if record.has_tag('XA'):
alts = record.get_tag('XA').strip(';').split(';')
for alt in alts:
seqid, pos = re.search(r'^([^,]+),[+-](\d+)', alt).groups()
yield seqid, int(pos)
yield seqid, record.reference_start, record.reference_end


class ReferenceCutout(object):
Expand Down
1 change: 1 addition & 0 deletions kevlar/tests/data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ fiveparts-refr.fa.gz.*
inf-mate-dist/*.genome.fa.gz.*
calls-*.vcf
maxdiff-refr.fa.gz.*
mate-dist/cc130.refr.fa.gz.*
Binary file removed kevlar/tests/data/inf-mate-dist/cc138713.augfastq.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed kevlar/tests/data/inf-mate-dist/cc26849.augfastq.gz
Binary file not shown.
Binary file removed kevlar/tests/data/inf-mate-dist/cc26849.genome.fa.gz
Binary file not shown.
Binary file added kevlar/tests/data/mate-dist/cc130.augfastq.gz
Binary file not shown.
Binary file not shown.
Binary file added kevlar/tests/data/mate-dist/cc130.refr.fa.gz
Binary file not shown.
44 changes: 19 additions & 25 deletions kevlar/tests/test_alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,39 +147,33 @@ def test_alac_bigpart():
assert len(calls) == 3


@pytest.mark.parametrize('cc,numcalls,numdist,numinf', [
('26849', 4, 2, 0),
('138713', 14, 14, 10),
])
def test_alac_inf_mate_dist(cc, numcalls, numdist, numinf):
readfile = data_file('inf-mate-dist/cc{}.augfastq.gz'.format(cc))
refrfile = data_file('inf-mate-dist/cc{}.genome.fa.gz'.format(cc))
def test_alac_matedist():
readfile = data_file('mate-dist/cc130.augfastq.gz')
refrfile = data_file('mate-dist/cc130.refr.fa.gz')
readstream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
partstream = kevlar.parse_partitioned_reads(readstream)
caller = kevlar.alac.alac(partstream, refrfile, ksize=31, delta=50,
seedsize=51, fallback=True)
seedsize=51)
calls = list(caller)
withdist = [c for c in calls if c.attribute('MATEDIST') is not None]
infdist = [c for c in calls if c.attribute('MATEDIST') == 'inf']
print('DEBUG total withdist infdist', len(calls), len(withdist),
len(infdist), file=sys.stderr)
assert len(calls) == numcalls
assert len(withdist) == numdist
assert len(infdist) == numinf


def test_alac_no_mates():
readfile = data_file('inf-mate-dist/cc26849-nomates.augfastq.gz')
refrfile = data_file('inf-mate-dist/cc26849.genome.fa.gz')
assert len(calls) == 3
passed = [c for c in calls if c.filterstr == 'PASS']
assert len(passed) == 1
assert passed[0].position == 127541 - 1


def test_alac_nomates():
readfile = data_file('mate-dist/cc130.nomates.augfastq.gz')
refrfile = data_file('mate-dist/cc130.refr.fa.gz')
readstream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
partstream = kevlar.parse_partitioned_reads(readstream)
caller = kevlar.alac.alac(partstream, refrfile, ksize=31, delta=50,
seedsize=51, fallback=True)
seedsize=51)
calls = list(caller)
print(*[c.vcf for c in calls], sep='\n', file=sys.stderr)
assert len(calls) == 4
filtcalls = [c for c in calls if c.filterstr == 'PASS']
assert len(filtcalls) == 3
assert len(calls) == 3
passed = [c for c in calls if c.filterstr == 'PASS']
assert len(passed) == 3
coords = set([c.position for c in passed])
assert coords == set([1476 - 1, 115378 - 1, 127541 - 1])


@pytest.mark.parametrize('vcfposition,X,cigar', [
Expand Down
26 changes: 17 additions & 9 deletions kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,25 +197,33 @@ def test_align_mates():
refrfile = data_file('minitrio/refr.fa')
kevlar.reference.autoindex(refrfile)
positions = list(kevlar.call.align_mates(record, refrfile))
seqids = set([seqid for seqid, coord in positions])
coords = sorted([coord for seqid, coord in positions])
seqids = set([seqid for seqid, start, end in positions])
coords = sorted([(start, end) for seqid, start, end in positions])
print('DEBUG', coords, file=sys.stderr)
assert seqids == set(['seq1'])
assert coords == [
45332, 45377, 45393, 45428, 45440, 45447, 46092, 46093, 46099, 46127,
46131, 46146, 46148, 48025, 48035
(45332, 45432), (45377, 45477), (45393, 45493), (45428, 45528),
(45440, 45540), (45447, 45547), (46092, 46192), (46093, 46193),
(46099, 46199), (46127, 46227), (46131, 46231), (46146, 46246),
(46148, 46248), (48025, 48125), (48035, 48135),
]


def test_mate_distance():
coords = [
45332, 45377, 45393, 45428, 45440, 45447, 46092, 46093, 46099, 46127,
46131, 46146, 46148, 48025, 48035
(45332, 45432), (45377, 45477), (45393, 45493), (45428, 45528),
(45440, 45540), (45447, 45547), (46092, 46192), (46093, 46193),
(46099, 46199), (46127, 46227), (46131, 46231), (46146, 46246),
(46148, 46248), (48025, 48125), (48035, 48135),
]
positions = [('seq1', c) for c in coords]
positions = [('seq1', c[0], c[1]) for c in coords]
gdna_pos = ('seq1', 45727, 45916)
assert kevlar.call.mate_distance(positions, gdna_pos) == 506.46666666666664
assert kevlar.call.mate_distance(positions, gdna_pos) == 466.46666666666664

positions = [('seq2', 4000), ('seq2', 3000), ('seq2', 5100), ('seq3', 1)]
positions = [
('seq2', 3900, 4000), ('seq2', 2900, 3000), ('seq2', 5100, 5200),
('seq3', 1, 100)
]
gdna_pos = ('seq2', 5000, 5500)
assert kevlar.call.mate_distance(positions, gdna_pos) == 1000.0

Expand Down
10 changes: 10 additions & 0 deletions kevlar/tests/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ def test_autoindex():
rmtree(tmpdir)


def test_bwa_align_coords():
fasta = ('>seq1\nTGACGTGACCCCAAGAAAACACTGCACCCAACTTCTTTCTTTAAGCCTTCGTGTGTG'
'CAGGAGGAGGCCAGCCCTGGTTTCAAAATTGTTCCTCAGCATT\n')
args = ['bwa', 'mem', data_file('fiveparts-refr.fa.gz'), '-']
aligner = kevlar.reference.bwa_align(args, fasta)
mappings = list(aligner)
assert len(mappings) == 1
assert mappings[0] == ('seq1', 50, 150)


def test_bwa_failure():
args = ['bwa', 'mem', data_file('not-a-real-file.fa'), '-']
with pytest.raises(KevlarBWAError) as e:
Expand Down
16 changes: 8 additions & 8 deletions kevlar/tests/test_varmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,13 +216,13 @@ def test_call_indel_snv():
calls = list(aln.call_variants(31))
assert len(calls) == 2

assert calls[0]._refr == 'C'
assert calls[0]._alt == 'A'
assert calls[0]._pos == 474 - 1
assert calls[0]._refr == 'CA'
assert calls[0]._alt == 'C'
assert calls[0]._pos == 501 - 1

assert calls[1]._refr == 'CA'
assert calls[1]._alt == 'C'
assert calls[1]._pos == 501 - 1
assert calls[1]._refr == 'C'
assert calls[1]._alt == 'A'
assert calls[1]._pos == 474 - 1

calls = list(aln.call_variants(31, mindist=None))
assert len(calls) == 2
Expand Down Expand Up @@ -259,8 +259,8 @@ def test_passenger_screen():
aln = VariantMapping(contig, cutout)
calls = list(aln.call_variants(29))
assert len(calls) == 2
assert calls[0].filterstr == 'PassengerVariant'
assert calls[1].filterstr == 'PASS'
assert calls[0].filterstr == 'PASS'
assert calls[1].filterstr == 'PassengerVariant'


@pytest.mark.parametrize('query,target,refr,alt', [
Expand Down
7 changes: 6 additions & 1 deletion kevlar/varmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,11 @@ def call_variants(self, ksize, mindist=6, logstream=sys.stderr):
yield call
elif self.vartype == 'indel':
indelcaller = self.call_indel(ksize)
indel = next(indelcaller)
if self.is_passenger(indel):
indel.filter(vf.PassengerVariant)
yield indel

leftflankcaller = self.call_snv(
self.leftflank.query, self.leftflank.target, offset, ksize,
mindist, donocall=False
Expand All @@ -187,7 +192,7 @@ def call_variants(self, ksize, mindist=6, logstream=sys.stderr):
self.rightflank.query, self.rightflank.target, offset, ksize,
mindist, donocall=False
)
for call in chain(leftflankcaller, indelcaller, rightflankcaller):
for call in chain(leftflankcaller, rightflankcaller):
if self.is_passenger(call):
call.filter(vf.PassengerVariant)
yield call
Expand Down

0 comments on commit 4ff6cc2

Please sign in to comment.