Merge pull request #285 from dib-lab/filter/autosomes
Enable filtering cutouts/calls by sequence ID
standage committed Oct 12, 2018
2 parents 280872a + babff5d commit fb3ff51
Showing 8 changed files with 117 additions and 18 deletions.
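
This change adds --include and --exclude options (each taking a pattern, metavar REGEX) to the `kevlar alac` and `kevlar localize` commands and threads them through as `inclpattern`/`exclpattern` keyword arguments down to the Localizer class. A minimal sketch of the resulting behaviour, using only the Localizer API introduced in the diff below; the sequence IDs are invented for illustration:

    from kevlar.localize import Localizer

    # Drop seed matches on unplaced scaffolds; keep everything else.
    localizer = Localizer(seedsize=31, delta=50, excl=r'^Un')
    localizer.add_seed_match('12', 200)
    localizer.add_seed_match('Un', 13579)
    assert localizer.ignore_seqid('Un')        # matches the exclude pattern
    assert not localizer.ignore_seqid('12')    # no pattern rules it out
    assert len(localizer) == 1                 # only the match on '12' is counted
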
2 changes: 2 additions & 0 deletions .gitignore
@@ -16,3 +16,5 @@ kevlar/*.cpython-*.so
dist/
*.egg-info
.ipynb_checkpoints/
.eggs/
.pytest_cache/
2 changes: 1 addition & 1 deletion Makefile
@@ -9,7 +9,7 @@ help: Makefile

## devenv: install software development pre-requisites
devenv:
pip install --upgrade pip setuptools pytest pytest-cov pytest-xdist pycodestyle cython sphinx sphinx-argparse
pip install --upgrade pip setuptools pytest==3.6.4 pytest-cov pytest-xdist pycodestyle cython sphinx sphinx-argparse

## style: check Python code style against PEP8
style:
20 changes: 12 additions & 8 deletions kevlar/alac.py
@@ -21,9 +21,10 @@


def make_call_from_reads(queue, idx, calls, refrfile, ksize=31, delta=50,
seedsize=31, maxdiff=None, match=1, mismatch=2,
gapopen=5, gapextend=0, min_ikmers=None,
refrseqs=None, logstream=sys.stderr):
seedsize=31, maxdiff=None, exclpattern=None,
inclpattern=None, match=1, mismatch=2, gapopen=5,
gapextend=0, min_ikmers=None, refrseqs=None,
logstream=sys.stderr):
while True:
if queue.empty():
sleep(3)
@@ -51,6 +52,7 @@ def make_call_from_reads(queue, idx, calls, refrfile, ksize=31, delta=50,
# Identify the genomic region(s) associated with each contig
localizer = localize(
contigs, refrfile, seedsize, delta=delta, maxdiff=maxdiff,
inclpattern=inclpattern, exclpattern=exclpattern,
refrseqs=refrseqs, logstream=logstream
)
targets = list(localizer)
@@ -69,8 +71,9 @@ def make_call_from_reads(queue, idx, calls, refrfile, ksize=31, delta=50,


def alac(pstream, refrfile, threads=1, ksize=31, bigpart=10000, delta=50,
seedsize=31, maxdiff=None, match=1, mismatch=2, gapopen=5,
gapextend=0, min_ikmers=None, logstream=sys.stderr):
seedsize=31, maxdiff=None, inclpattern=None, exclpattern=None,
match=1, mismatch=2, gapopen=5, gapextend=0, min_ikmers=None,
logstream=sys.stderr):
part_queue = Queue(maxsize=max(32, 12 * threads))

refrstream = kevlar.open(refrfile, 'r')
@@ -84,8 +87,8 @@ def alac(pstream, refrfile, threads=1, ksize=31, bigpart=10000, delta=50,
target=make_call_from_reads,
args=(
part_queue, idx, thread_calls, refrfile, ksize, delta,
seedsize, maxdiff, match, mismatch, gapopen, gapextend,
min_ikmers, refrseqs, logstream,
seedsize, maxdiff, inclpattern, exclpattern, match, mismatch,
gapopen, gapextend, min_ikmers, refrseqs, logstream,
)
)
worker.setDaemon(True)
@@ -117,7 +120,8 @@ def main(args):
workflow = alac(
pstream, args.refr, threads=args.threads, ksize=args.ksize,
bigpart=args.bigpart, delta=args.delta, seedsize=args.seed_size,
maxdiff=args.max_diff, match=args.match, mismatch=args.mismatch,
maxdiff=args.max_diff, inclpattern=args.include,
exclpattern=args.exclude, match=args.match, mismatch=args.mismatch,
gapopen=args.open, gapextend=args.extend, min_ikmers=args.min_ikmers,
logstream=args.logfile
)
8 changes: 8 additions & 0 deletions kevlar/cli/alac.py
@@ -7,6 +7,8 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import re


def subparser(subparsers):

@@ -39,6 +41,12 @@ def subparser(subparsers):
'length of the longest contig; each bin specifies '
'a reference target sequence against which '
'assembled contigs will be aligned')
local_args.add_argument('--include', metavar='REGEX', type=re.escape,
help='discard alignments to any chromosomes whose '
'sequence IDs do not match the given pattern')
local_args.add_argument('--exclude', metavar='REGEX', type=re.escape,
help='discard alignments to any chromosomes whose '
'sequence IDs match the given pattern')

score_args = subparser.add_argument_group('Alignment scoring')
score_args.add_argument('-A', '--match', type=int, default=1, metavar='A',
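
The same pair of options is registered for `kevlar alac`; a sketch of driving them through the argument parser, in the same style as the test added further down (the input file names here are placeholders, not files shipped with kevlar):

    import kevlar
    import kevlar.alac
    import kevlar.cli

    arglist = ['alac', '--exclude', 'Un', 'reads.augfastq.gz', 'refr.fa']
    args = kevlar.cli.parser().parse_args(arglist)
    # Any calls that would localize to reference sequences whose IDs
    # contain 'Un' are dropped before alignment and calling.
    kevlar.alac.main(args)
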
8 changes: 8 additions & 0 deletions kevlar/cli/localize.py
@@ -7,6 +7,8 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import re


def subparser(subparsers):
"""Define the `kevlar localize` command-line interface."""
@@ -32,5 +34,11 @@ def subparser(subparsers):
'reference targets if the distance between two '
'seed matches is > X; by default, X is 3 times the '
'length of the longest contig')
subparser.add_argument('--include', metavar='REGEX', type=re.escape,
help='discard alignments to any chromosomes whose '
'sequence IDs do not match the given pattern')
subparser.add_argument('--exclude', metavar='REGEX', type=re.escape,
help='discard alignments to any chromosomes whose '
'sequence IDs match the given pattern')
subparser.add_argument('contigs', help='assembled reads in Fasta format')
subparser.add_argument('refr', help='BWA indexed reference genome')
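
Worth noting: both subcommands register the option with `type=re.escape`, so any value given on the command line has its regex metacharacters escaped before it reaches the localizer; in effect that value is searched for as a literal substring. The tests below exercise real regexes (e.g. `chr[XY]`) by assigning `args.include`/`args.exclude` after parsing, which bypasses that conversion. A quick illustration of the escaping:

    import re

    re.escape('Un')        # 'Un' — plain alphanumeric values pass through unchanged
    re.escape(r'chr[XY]')  # 'chr\\[XY\\]' — the character class no longer acts as one
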
33 changes: 26 additions & 7 deletions kevlar/localize.py
@@ -9,6 +9,7 @@

from collections import defaultdict
import os.path
import re
import sys

import kevlar
@@ -23,19 +24,35 @@ class KevlarRefrSeqNotFoundError(ValueError):


class Localizer(object):
def __init__(self, seedsize, delta=0):
def __init__(self, seedsize, delta=0, incl=None, excl=None):
self._positions = defaultdict(list)
self._seedsize = seedsize
self._delta = delta
self.inclpattern = incl
self.exclpattern = excl

def __len__(self):
return len(self._positions)
return sum([
len(self._positions[s]) for s in self._positions
if not self.ignore_seqid(s)
])

def ignore_seqid(self, seqid):
include = True
exclude = False
if self.inclpattern:
include = re.search(self.inclpattern, seqid) is not None
if self.exclpattern:
exclude = re.search(self.exclpattern, seqid) is not None
return exclude or not include

def add_seed_match(self, seqid, pos):
self._positions[seqid].append(pos)

def get_cutouts(self, refrseqs=None, clusterdist=1000):
for seqid in sorted(self._positions):
if self.ignore_seqid(seqid):
continue
matchpos = sorted(self._positions[seqid])
assert len(matchpos) > 0
if refrseqs:
@@ -66,8 +83,7 @@ def new_cutout(cluster):
cluster = list()
cluster.append(nextpos)
prevpos = nextpos
if len(cluster) > 0:
yield new_cutout(cluster)
yield new_cutout(cluster)


def get_unique_seeds(recordstream, seedsize):
@@ -109,15 +125,17 @@ def get_exact_matches(contigstream, bwaindexfile, seedsize=31):


def localize(contigstream, refrfile, seedsize=31, delta=50, maxdiff=None,
refrseqs=None, logstream=sys.stderr):
inclpattern=None, exclpattern=None, refrseqs=None,
logstream=sys.stderr):
"""Wrap the `kevlar localize` task as a generator.
Input is an iterable containing contigs (assembled by `kevlar assemble`),
the filename of the reference genome sequence, and the desired seed size.
"""
autoindex(refrfile, logstream)
contigs = list(contigstream)
localizer = Localizer(seedsize, delta)
localizer = Localizer(seedsize, delta=delta, incl=inclpattern,
excl=exclpattern)
for seqid, start, end in get_exact_matches(contigs, refrfile, seedsize):
localizer.add_seed_match(seqid, start)
if len(localizer) == 0:
@@ -141,7 +159,8 @@ def main(args):
outstream = kevlar.open(args.out, 'w')
localizer = localize(
contigstream, args.refr, seedsize=args.seed_size, delta=args.delta,
maxdiff=args.max_diff, logstream=args.logfile
maxdiff=args.max_diff, inclpattern=args.include,
exclpattern=args.exclude, logstream=args.logfile
)
for cutout in localizer:
record = Record(name=cutout.defline, sequence=cutout.sequence)
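
One detail of `ignore_seqid` worth spelling out: when both patterns are supplied, exclusion wins, since the method returns `exclude or not include`. A small sketch with made-up sequence IDs:

    from kevlar.localize import Localizer

    loc = Localizer(seedsize=31, incl=r'^chr', excl=r'^chrUn')
    loc.ignore_seqid('chr12')      # False: matches include, not exclude
    loc.ignore_seqid('chrUn_001')  # True: matches include but also exclude
    loc.ignore_seqid('scaffold7')  # True: fails the include pattern
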
13 changes: 13 additions & 0 deletions kevlar/tests/test_alac.py
@@ -144,6 +144,19 @@ def test_alac_single_partition_badlabel(capsys):
assert out == ''


def test_alac_exclude(capsys):
readfile = data_file('fiveparts.augfastq.gz')
refrfile = data_file('fiveparts-refr.fa.gz')
arglist = ['alac', '--exclude', '"^seq"', readfile, refrfile]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.alac.main(args)
out, err = capsys.readouterr()

# grep -v ^'#' out
out = '\n'.join([l for l in out.split('\n') if not l.startswith('#')])
assert out == ''


def test_alac_bigpart():
readfile = data_file('fiveparts.augfastq.gz')
refrfile = data_file('fiveparts-refr.fa.gz')
49 changes: 47 additions & 2 deletions kevlar/tests/test_localize.py
@@ -37,6 +37,42 @@ def test_localizer_simple():
]


def test_localizer_incl_excl():
intervals = Localizer(seedsize=25)
intervals.add_seed_match('1', 100)
intervals.add_seed_match('1', 120)
intervals.add_seed_match('12', 200)
intervals.add_seed_match('12', 209)
intervals.add_seed_match('12', 213)
intervals.add_seed_match('X', 1234)
intervals.add_seed_match('X', 1245)
intervals.add_seed_match('Un', 13579)
intervals.add_seed_match('Un', 13597)

testint = [c.interval for c in intervals.get_cutouts()]
assert sorted(testint) == [
('1', 100, 145),
('12', 200, 238),
('Un', 13579, 13622),
('X', 1234, 1270),
]

intervals.exclpattern = 'Un'
testint = [c.interval for c in intervals.get_cutouts()]
assert sorted(testint) == [
('1', 100, 145),
('12', 200, 238),
('X', 1234, 1270),
]

intervals.inclpattern = r'^\d+$'
testint = [c.interval for c in intervals.get_cutouts()]
assert sorted(testint) == [
('1', 100, 145),
('12', 200, 238),
]


@pytest.mark.parametrize('infile', [
('smallseq.fa.gz'),
('smallseq.augfasta'),
@@ -178,14 +214,23 @@ def test_maxdiff(X, numtargets):
assert len(targets) == numtargets


def test_main(capsys):
@pytest.mark.parametrize('incl,excl,output', [
(None, None, '>seq1_10-191'),
(r'seq1', None, '>seq1_10-191'),
(None, 'seq1', 'WARNING: no reference matches'),
(r'chr[XY]', None, 'WARNING: no reference matches'),
(None, r'b0Gu$', '>seq1_10-191'),
])
def test_main(incl, excl, output, capsys):
contig = data_file('localize-contig.fa')
refr = data_file('localize-refr.fa')
arglist = ['localize', '--seed-size', '23', '--delta', '50', contig, refr]
args = kevlar.cli.parser().parse_args(arglist)
args.include = incl
args.exclude = excl
kevlar.localize.main(args)
out, err = capsys.readouterr()
assert '>seq1_10-191' in out
assert output in out or output in err


def test_main_no_matches(capsys):
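
A sketch of running just the tests touched by this change from a development checkout, assuming the dev prerequisites from the Makefile are installed; the `-k` expression is only a convenience filter and the paths are relative to the repository root:

    import pytest

    pytest.main([
        '-v', '-k', 'exclude or incl_excl or main',
        'kevlar/tests/test_alac.py', 'kevlar/tests/test_localize.py',
    ])
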
