Merge pull request #198 from dib-lab/refactor/ksize
Replace ksize with seed size in kevlar localize
standage committed Feb 3, 2018
2 parents 76f0fbb + b596f35 commit ae8cc63
Showing 7 changed files with 69 additions and 64 deletions.
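
In practical terms, callers of the localize API swap the old k-mer size argument for a seed size argument. A minimal sketch of the updated call, under the assumption that `bwa` is on the PATH (kevlar indexes the reference automatically via autoindex) and that the contigs come from `kevlar assemble`; the file names are placeholders:

    import kevlar
    from kevlar.localize import localize

    # Contigs assembled by `kevlar assemble`, in augmented Fasta format.
    contigstream = kevlar.parse_augmented_fastx(kevlar.open('contigs.augfasta', 'r'))

    # Before this commit the keyword was `ksize=`; it is now `seedsize=`.
    localizer = localize(contigstream, 'refr.fa', seedsize=31, delta=25)
    targets = list(localizer)   # reference cutouts spanning the clustered seed matches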
14 changes: 7 additions & 7 deletions kevlar/alac.py
@@ -21,9 +21,9 @@ def augment_and_mark(augseqs, nakedseqs):
yield seq


def alac(pstream, refrfile, ksize=31, delta=25, maxdiff=10000, match=1,
mismatch=2, gapopen=5, gapextend=0, greedy=False, min_ikmers=None,
logstream=sys.stderr):
def alac(pstream, refrfile, ksize=31, delta=25, seedsize=31, maxdiff=10000,
match=1, mismatch=2, gapopen=5, gapextend=0, greedy=False,
min_ikmers=None, logstream=sys.stderr):
assembler = assemble_greedy if greedy else assemble_fml_asm
for partition in pstream:
reads = list(partition)
@@ -38,7 +38,7 @@ def alac(pstream, refrfile, ksize=31, delta=25, maxdiff=10000, match=1,
continue

# Identify the genomic region(s) associated with each contig
localizer = localize(contigs, refrfile, ksize, delta=delta,
localizer = localize(contigs, refrfile, seedsize, delta=delta,
logstream=logstream)
targets = list(localizer)
if len(targets) == 0:
@@ -57,9 +57,9 @@ def main(args):
outstream = kevlar.open(args.out, 'w')
workflow = alac(
pstream, args.refr, ksize=args.ksize, delta=args.delta,
maxdiff=args.max_diff, match=args.match, mismatch=args.mismatch,
gapopen=args.open, gapextend=args.extend, greedy=args.greedy,
min_ikmers=args.min_ikmers, logstream=args.logfile
seedsize=args.seed_size, maxdiff=args.max_diff, match=args.match,
mismatch=args.mismatch, gapopen=args.open, gapextend=args.extend,
greedy=args.greedy, min_ikmers=args.min_ikmers, logstream=args.logfile
)

for varcall in workflow:
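
The alac() entry point now takes a separate seedsize keyword alongside the existing ksize. A hedged sketch of how a caller might invoke the updated function; the partition stream and file name are hypothetical, and how the stream is built is outside the scope of this diff:

    import sys
    from kevlar.alac import alac

    def call_variants(pstream, refrfile='refr.fa'):
        # `pstream` is assumed to be an iterable of read partitions, each an
        # iterable of augmented sequence records, prepared upstream in kevlar.
        workflow = alac(pstream, refrfile, ksize=31, seedsize=31, delta=25,
                        maxdiff=10000, match=1, mismatch=2, gapopen=5,
                        gapextend=0, logstream=sys.stderr)
        for varcall in workflow:
            print(varcall.vcf)   # VCF-style record string, as printed by kevlar simplex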
2 changes: 2 additions & 0 deletions kevlar/cli/alac.py
@@ -19,6 +19,8 @@ def subparser(subparsers):
subparser._positionals.title = 'Required inputs'

local_args = subparser.add_argument_group('Target extraction')
local_args.add_argument('-z', '--seed-size', type=int, default=31,
metavar='Z', help='seed size; default is 31')
local_args.add_argument('-d', '--delta', type=int, default=25, metavar='D',
help='retrieve the genomic interval from the '
'reference by extending beyond the span of all '
4 changes: 2 additions & 2 deletions kevlar/cli/localize.py
@@ -29,7 +29,7 @@ def subparser(subparsers):
'of all k-mer starting positions by D bp')
subparser.add_argument('-o', '--out', metavar='FILE', default='-',
help='output file; default is terminal (stdout)')
subparser.add_argument('-k', '--ksize', type=int, metavar='K', default=31,
help='k-mer size; default is 31')
subparser.add_argument('-z', '--seed-size', type=int, metavar='Z',
default=31, help='seed size; default is 31')
subparser.add_argument('contigs', help='assembled reads in Fasta format')
subparser.add_argument('refr', help='BWA indexed reference genome')
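
At the command line, the change amounts to replacing --ksize with --seed-size (and -k with -z). A small sketch driving the subcommand through kevlar's own parser, mirroring the test changes further down; the input file names are placeholders:

    import kevlar
    import kevlar.cli
    import kevlar.localize

    # Old usage: ['localize', '--ksize', '23', '--delta', '50', contig, refr]
    arglist = ['localize', '--seed-size', '23', '--delta', '50',
               'contig.fa', 'refr.fa']       # placeholder Fasta inputs
    args = kevlar.cli.parser().parse_args(arglist)
    kevlar.localize.main(args)               # cutout regions go to stdout by default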
2 changes: 2 additions & 0 deletions kevlar/cli/simplex.py
@@ -128,6 +128,8 @@ def subparser(subparsers):
'contig is aligned to a cutout of the reference genome, and the '
'variant is called from the alignment.'
)
alac_args.add_argument('-z', '--seed-size', type=int, default=31,
metavar='Z', help='seed size; default is 31')
alac_args.add_argument(
'-d', '--delta', type=int, default=50, metavar='D', help='extend the '
'genomic cutout by D bp; default is 50'
46 changes: 23 additions & 23 deletions kevlar/localize.py
@@ -29,11 +29,11 @@ class KevlarRefrSeqNotFoundError(ValueError):
pass


class KmerMatchSet(object):
"""Store a set exact k-mer matches, indexed by sequence ID."""
def __init__(self, ksize):
class SeedMatchSet(object):
"""Store a set exact seed matches, indexed by sequence ID."""
def __init__(self, seedsize):
self._positions = defaultdict(list)
self._ksize = ksize
self._seedsize = seedsize

def __len__(self):
return len(self._positions)
@@ -57,31 +57,31 @@ def get_spans(self, seqid, clusterdist=10000):
dist = nextpos - prevpos
if dist > clusterdist:
if len(cluster) > 0:
span = (cluster[0], cluster[-1] + self._ksize)
span = (cluster[0], cluster[-1] + self._seedsize)
clusterspans.append(span)
cluster = list()
cluster.append(nextpos)
prevpos = nextpos
if len(cluster) > 0:
span = (cluster[0], cluster[-1] + self._ksize)
span = (cluster[0], cluster[-1] + self._seedsize)
clusterspans.append(span)
return clusterspans

return [(positions[0], positions[-1] + self._ksize)]
return [(positions[0], positions[-1] + self._seedsize)]

@property
def seqids(self):
return set(list(self._positions.keys()))


def get_unique_kmers(recordstream, ksize=31):
def get_unique_seeds(recordstream, seedsize=31):
"""
Grab all unique k-mers from the specified sequence file.
Grab all unique seeds from the specified sequence file.
Input is expected to be an iterable containing screed or khmer sequence
records.
"""
ct = khmer.Counttable(ksize, 1, 1)
ct = khmer.Counttable(seedsize, 1, 1)
kmers = set()
for record in recordstream:
for kmer in ct.get_kmers(record.sequence):
@@ -91,31 +91,31 @@ def get_unique_kmers(recordstream, ksize=31):
yield kmer


def unique_kmer_string(recordstream, ksize=31):
def unique_seed_string(recordstream, seedsize=31):
"""
Convert contigs to k-mer Fasta for BWA input.
Convert contigs to Fasta records of seed sequences for BWA input.
Input is expected to be an iterable containing screed or khmer sequence
records.
"""
output = ''
for n, kmer in enumerate(get_unique_kmers(recordstream, ksize)):
for n, kmer in enumerate(get_unique_seeds(recordstream, seedsize)):
output += '>kmer{:d}\n{:s}\n'.format(n, kmer)
return output


def get_exact_matches(contigstream, bwaindexfile, ksize=31):
def get_exact_matches(contigstream, bwaindexfile, seedsize=31):
"""
Compute a list of exact k-mer matches using BWA MEM.
Compute a list of exact seed matches using BWA MEM.
Input should be an iterable containing contigs generated by
`kevlar assemble`. This function decomposes the contigs into their
constituent k-mers and searches for exact matches in the reference using
constituent seeds and searches for exact matches in the reference using
`bwa mem`. This function is a generator, and yields tuples of
(seqid, startpos) for each exact match found.
"""
kmers = unique_kmer_string(contigstream, ksize)
cmd = 'bwa mem -k {k} -T {k} {idx} -'.format(k=ksize, idx=bwaindexfile)
kmers = unique_seed_string(contigstream, seedsize)
cmd = 'bwa mem -k {k} -T {k} {idx} -'.format(k=seedsize, idx=bwaindexfile)
cmdargs = cmd.split(' ')
with TemporaryFile() as samfile:
bwaproc = Popen(cmdargs, stdin=PIPE, stdout=samfile, stderr=PIPE,
@@ -176,18 +176,18 @@ def autoindex(refrfile, logstream=sys.stderr):
raise KevlarBWAError('Could not run "bwa index"') from err


def localize(contigstream, refrfile, ksize=31, delta=25, maxdiff=10000,
def localize(contigstream, refrfile, seedsize=31, delta=25, maxdiff=10000,
logstream=sys.stderr):
"""
Wrap the `kevlar localize` task as a generator.
Input is an iterable containing contigs (assembled by `kevlar assemble`)
stored as khmer or screed sequence records, the filename of the reference
genome sequence, and the desired k-size.
genome sequence, and the desired seed size.
"""
autoindex(refrfile, logstream)
seedmatches = KmerMatchSet(ksize)
for seqid, pos in get_exact_matches(contigstream, refrfile, ksize):
seedmatches = SeedMatchSet(seedsize)
for seqid, pos in get_exact_matches(contigstream, refrfile, seedsize):
seedmatches.add(seqid, pos)
if len(seedmatches) == 0:
message = 'WARNING: no reference matches'
@@ -203,7 +203,7 @@ def main(args):
contigstream = kevlar.parse_augmented_fastx(kevlar.open(args.contigs, 'r'))
outstream = kevlar.open(args.out, 'w')
localizer = localize(
contigstream, args.refr, ksize=args.ksize, delta=args.delta,
contigstream, args.refr, seedsize=args.seed_size, delta=args.delta,
maxdiff=args.max_diff, logstream=args.logfile
)
for record in localizer:
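
To make the renamed class concrete: SeedMatchSet records exact seed-match start positions per reference sequence, and get_spans() collapses nearby positions (within clusterdist, 10 kb by default) into candidate intervals that extend to the end of the last seed. Upstream, get_exact_matches shells out to `bwa mem -k <seedsize> -T <seedsize>` to produce those exact matches. A small illustrative sketch; the positions are arbitrary, and the expected values follow from the logic shown above:

    from kevlar.localize import SeedMatchSet

    matches = SeedMatchSet(seedsize=25)
    print(matches.get_spans('chr1'))    # -> None (no matches recorded yet)

    matches.add('chr1', 100)            # exact 25 bp seed match starting at 100
    matches.add('chr1', 120)            # another match 20 bp downstream
    # Both matches fall in one cluster, so the span runs from the first start
    # position to the end of the last seed: 120 + 25 = 145.
    print(matches.get_spans('chr1'))    # -> [(100, 145)]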
13 changes: 7 additions & 6 deletions kevlar/simplex.py
@@ -19,7 +19,7 @@
def simplex(case, casecounts, controlcounts, refrfile, ctrlmax=0, casemin=5,
mask=None, filtermem=1e6, filterfpr=0.001,
partminabund=2, partmaxabund=200, dedup=True,
delta=50, match=1, mismatch=2, gapopen=5, gapextend=0,
delta=50, seedsize=31, match=1, mismatch=2, gapopen=5, gapextend=0,
ksize=31, logstream=sys.stderr):
"""
Execute the simplex germline variant discovery workflow.
Expand Down Expand Up @@ -49,6 +49,7 @@ def simplex(case, casecounts, controlcounts, refrfile, ctrlmax=0, casemin=5,
- dedup: boolean indicating whether PCR duplicate removal should be run
Parameters for assembling, aligning, and calling variants:
- seedsize: size of seeds to use for localizing contigs
- delta: number of bp to extend genomic cutout
- match: alignment match score
- mismatch: alignment mismatch penalty
@@ -69,8 +70,8 @@ def simplex(case, casecounts, controlcounts, refrfile, ctrlmax=0, casemin=5,
)

caller = alac(
partitioner, refrfile, ksize=ksize, delta=delta, match=match,
mismatch=mismatch, gapopen=gapopen, gapextend=gapextend,
partitioner, refrfile, ksize=ksize, delta=delta, seedsize=seedsize,
match=match, mismatch=mismatch, gapopen=gapopen, gapextend=gapextend,
logstream=logstream
)

@@ -101,9 +102,9 @@ def main(args):
ctrlmax=args.ctrl_max, casemin=args.case_min, mask=mask,
filtermem=args.filter_memory, filterfpr=args.filter_fpr,
partminabund=args.part_min_abund, partmaxabund=args.part_max_abund,
dedup=args.dedup, delta=args.delta, match=args.match,
mismatch=args.mismatch, gapopen=args.open, gapextend=args.extend,
logstream=args.logfile
dedup=args.dedup, delta=args.delta, seedsize=args.seed_size,
match=args.match, mismatch=args.mismatch, gapopen=args.open,
gapextend=args.extend, logstream=args.logfile
)
for variant in workflow:
print(variant.vcf, file=outstream)
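
In the top-level simplex workflow, the new seed size now travels alongside, but independent of, the k-mer size used for novel k-mer detection. A hedged sketch of a call site, with the wrapper function hypothetical and its inputs assumed to be prepared elsewhere in the workflow:

    from kevlar.simplex import simplex

    def run_simplex(case, casecounts, controlcounts, refrfile):
        workflow = simplex(
            case, casecounts, controlcounts, refrfile,
            ctrlmax=0, casemin=5,
            ksize=31,       # k-mer size for detecting novel k-mers
            seedsize=31,    # seed size for localizing contigs (this commit)
            delta=50,       # bp of padding on each genomic cutout
        )
        for variant in workflow:
            print(variant.vcf)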
52 changes: 26 additions & 26 deletions kevlar/tests/test_localize.py
@@ -11,15 +11,15 @@
import pytest
import screed
import kevlar
from kevlar.localize import KmerMatchSet
from kevlar.localize import SeedMatchSet
from kevlar.localize import KevlarRefrSeqNotFoundError
from kevlar.localize import (extract_regions, get_unique_kmers,
unique_kmer_string)
from kevlar.localize import (extract_regions, get_unique_seeds,
unique_seed_string)
from kevlar.tests import data_file


def test_kmer_match_set_simple():
intervals = KmerMatchSet(ksize=25)
def test_seed_match_set_simple():
intervals = SeedMatchSet(seedsize=25)
assert intervals.get_spans('chr1') is None

intervals.add('chr1', 100)
@@ -39,36 +39,36 @@ def test_kmer_match_set_simple():
('smallseq.augfasta'),
('smallseq.fq'),
])
def test_get_unique_kmers(infile):
def test_get_unique_seeds(infile):
infile = data_file(infile)
instream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
kmers = set([k for k in get_unique_kmers(instream, ksize=9)])
testkmers = set(
seeds = set([k for k in get_unique_seeds(instream, seedsize=9)])
testseeds = set(
['TTAATTGGC', 'CTTAATTGG', 'TAATTGGCC', 'ATTACCGGT',
'TTACCGGTA', 'CCTTAATTG', 'GCCTTAATT', 'GGCCTTAAT']
)
print(sorted(kmers))
print(sorted(testkmers))
assert kmers == testkmers
print(sorted(seeds))
print(sorted(testseeds))
assert seeds == testseeds


def test_unique_kmer_string():
def test_unique_seed_string():
infile = data_file('smallseq.augfasta')
instream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
fastastring = unique_kmer_string(instream, ksize=9)
fastastring = unique_seed_string(instream, seedsize=9)
fastafile = StringIO(fastastring)
kmers = set([s for d, s in kevlar.seqio.parse_fasta(fastafile)])
testkmers = set(
seeds = set([s for d, s in kevlar.seqio.parse_fasta(fastafile)])
testseeds = set(
['TTAATTGGC', 'CTTAATTGG', 'TAATTGGCC', 'ATTACCGGT',
'TTACCGGTA', 'CCTTAATTG', 'GCCTTAATT', 'GGCCTTAAT']
)
print(sorted(kmers))
print(sorted(testkmers))
assert kmers == testkmers
print(sorted(seeds))
print(sorted(testseeds))
assert seeds == testseeds


def test_extract_regions_basic():
intervals = KmerMatchSet(ksize=10)
intervals = SeedMatchSet(seedsize=10)
intervals.add('bogus-genome-chr2', 10)
instream = open(data_file('bogus-genome/refr.fa'), 'r')
regions = [r for r in extract_regions(instream, intervals, delta=0)]
@@ -77,7 +77,7 @@ def test_extract_regions_basic():


def test_extract_regions_basic_2():
intervals = KmerMatchSet(ksize=21)
intervals = SeedMatchSet(seedsize=21)
intervals.add('simple', 49)
intervals.add('simple', 52)
intervals.add('simple', 59)
@@ -89,7 +89,7 @@ def test_extract_regions_basic_2():


def test_extract_regions_large_span():
intervals = KmerMatchSet(ksize=21)
intervals = SeedMatchSet(seedsize=21)
intervals.add('simple', 100)
intervals.add('simple', 200)
instream = open(data_file('simple-genome-ctrl1.fa'), 'r')
@@ -103,7 +103,7 @@ def test_extract_regions_large_span():


def test_extract_regions_missing_seq():
intervals = KmerMatchSet(ksize=21)
intervals = SeedMatchSet(seedsize=21)
intervals.add('simple', 100)
intervals.add('simple', 200)
intervals.add('TheCakeIsALie', 42)
@@ -116,14 +116,14 @@ def test_extract_regions_missing_seq():


def test_extract_regions_boundaries():
intervals = KmerMatchSet(ksize=31)
intervals = SeedMatchSet(seedsize=31)
intervals.add('simple', 15)
instream = open(data_file('simple-genome-ctrl1.fa'), 'r')
regions = [r for r in extract_regions(instream, intervals, delta=20)]
assert len(regions) == 1
assert regions[0][0] == 'simple_0-66'

intervals = KmerMatchSet(ksize=31)
intervals = SeedMatchSet(seedsize=31)
intervals.add('simple', 925)
intervals.add('simple', 955)
intervals.add('simple', 978)
@@ -136,7 +136,7 @@ def test_extract_regions_boundaries():
def test_main(capsys):
contig = data_file('localize-contig.fa')
refr = data_file('localize-refr.fa')
arglist = ['localize', '--ksize', '23', '--delta', '50', contig, refr]
arglist = ['localize', '--seed-size', '23', '--delta', '50', contig, refr]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.localize.main(args)
out, err = capsys.readouterr()
@@ -146,7 +146,7 @@ def test_main(capsys):
def test_main_no_matches(capsys):
contig = data_file('localize-contig-bad.fa')
refr = data_file('localize-refr.fa')
arglist = ['localize', '--ksize', '23', contig, refr]
arglist = ['localize', '--seed-size', '23', contig, refr]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.localize.main(args)
out, err = capsys.readouterr()
