Skip to content

Commit

Permalink
Merge pull request #212 from dib-lab/refactor/reference
Browse files Browse the repository at this point in the history
Refactor reference-related code
  • Loading branch information
standage committed Feb 15, 2018
2 parents cd793d8 + 69ae799 commit be6be75
Show file tree
Hide file tree
Showing 5 changed files with 354 additions and 190 deletions.
3 changes: 2 additions & 1 deletion kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from kevlar import seqio
from kevlar import overlap
from kevlar import sketch
from kevlar import reference
from kevlar.mutablestring import MutableString
from kevlar.readgraph import ReadGraph
from kevlar.seqio import parse_augmented_fastx, print_augmented_fastx
Expand Down Expand Up @@ -122,7 +123,7 @@ def clean_subseqs(sequence, ksize):


def vcf_header(outstream, version='4.2', source='kevlar', infoheader=False):
print('##fileFormat=VCFv', version, sep='', file=outstream)
print('##fileformat=VCFv', version, sep='', file=outstream)
print('##source=', source, sep='', file=outstream)
if infoheader:
print('##INFO=<GT,Number=3,Type=String,Description="Genotypes of each '
Expand Down
163 changes: 50 additions & 113 deletions kevlar/localize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,74 +9,68 @@

from __future__ import print_function
from collections import defaultdict
from subprocess import Popen, PIPE, check_call
from tempfile import TemporaryFile
import os.path
import sys

import kevlar
from kevlar.reference import bwa_align, autoindex, ReferenceCutout
import khmer
import pysam


class KevlarBWAError(RuntimeError):
"""Raised if the delegated BWA call fails for any reason."""
pass


class KevlarRefrSeqNotFoundError(ValueError):
"""Raised if the reference sequence cannot be found."""
pass


class SeedMatchSet(object):
"""Store a set exact seed matches, indexed by sequence ID."""
def __init__(self, seedsize):
class Localizer(object):
def __init__(self, seedsize, delta=0):
self._positions = defaultdict(list)
self._seedsize = seedsize
self._delta = delta

def __len__(self):
return len(self._positions)

def add(self, seqid, pos):
def add_seed_match(self, seqid, pos):
self._positions[seqid].append(pos)

def get_spans(self, seqid, clusterdist=10000):
positions = sorted(self._positions[seqid])
if len(positions) == 0:
return None
def get_cutouts(self, refrseqs=None, clusterdist=10000):
for seqid in sorted(self._positions):
matchpos = sorted(self._positions[seqid])
assert len(matchpos) > 0
if refrseqs:
if seqid not in refrseqs:
raise KevlarRefrSeqNotFoundError(seqid)

def new_cutout(cluster):
startpos = max(cluster[0] - self._delta, 0)
endpos = cluster[-1] + self._seedsize + self._delta
subseq = None
if refrseqs:
endpos = min(endpos, len(refrseqs[seqid]))
subseq = refrseqs[seqid][startpos:endpos]
defline = '{:s}_{:d}-{:d}'.format(seqid, startpos, endpos)
return ReferenceCutout(defline, subseq)

if not clusterdist:
yield new_cutout(matchpos)
continue

clusterspans = list()
if clusterdist:
cluster = list()
for nextpos in positions:
if len(cluster) == 0:
cluster.append(nextpos)
prevpos = nextpos
continue
dist = nextpos - prevpos
if dist > clusterdist:
if len(cluster) > 0:
span = (cluster[0], cluster[-1] + self._seedsize)
clusterspans.append(span)
prevpos = None
for nextpos in matchpos:
if prevpos:
dist = nextpos - prevpos
if dist > clusterdist:
yield new_cutout(cluster)
cluster = list()
cluster.append(nextpos)
prevpos = nextpos
if len(cluster) > 0:
span = (cluster[0], cluster[-1] + self._seedsize)
clusterspans.append(span)
return clusterspans

return [(positions[0], positions[-1] + self._seedsize)]
yield new_cutout(cluster)

@property
def seqids(self):
return set(list(self._positions.keys()))


def get_unique_seeds(recordstream, seedsize=31):
"""
Grab all unique seeds from the specified sequence file.
def get_unique_seeds(recordstream, seedsize):
"""Grab all unique seeds from the specified sequence file.
Input is expected to be an iterable containing screed or khmer sequence
records.
Expand All @@ -91,9 +85,8 @@ def get_unique_seeds(recordstream, seedsize=31):
yield kmer


def unique_seed_string(recordstream, seedsize=31):
"""
Convert contigs to Fasta records of seed sequences for BWA input.
def unique_seed_string(recordstream, seedsize):
"""Convert contigs to Fasta records of seed sequences for BWA input.
Input is expected to be an iterable containing screed or khmer sequence
records.
Expand All @@ -105,8 +98,7 @@ def unique_seed_string(recordstream, seedsize=31):


def get_exact_matches(contigstream, bwaindexfile, seedsize=31):
"""
Compute a list of exact seed matches using BWA MEM.
"""Compute a list of exact seed matches using BWA MEM.
Input should be an iterable containing contigs generated by
`kevlar assemble`. This function decomposes the contigs into their
Expand All @@ -117,86 +109,31 @@ def get_exact_matches(contigstream, bwaindexfile, seedsize=31):
kmers = unique_seed_string(contigstream, seedsize)
cmd = 'bwa mem -k {k} -T {k} {idx} -'.format(k=seedsize, idx=bwaindexfile)
cmdargs = cmd.split(' ')
with TemporaryFile() as samfile:
bwaproc = Popen(cmdargs, stdin=PIPE, stdout=samfile, stderr=PIPE,
universal_newlines=True)
stdout, stderr = bwaproc.communicate(input=kmers)
if bwaproc.returncode != 0: # pragma: no cover
print(stderr, file=sys.stderr)
raise KevlarBWAError('problem running BWA')
samfile.seek(0)
sam = pysam.AlignmentFile(samfile, 'r')
for record in sam:
if record.is_unmapped:
continue
seqid = sam.get_reference_name(record.reference_id)
yield seqid, record.pos


def extract_regions(refr, seedmatches, delta=25, maxdiff=10000):
"""
Extract the specified genomic region from the provided file object.
The start and end parameters define a 0-based half-open genomic interval.
Bounds checking must be performed on the end parameter.
"""
observed_seqids = set()
for defline, sequence in kevlar.seqio.parse_fasta(refr):
seqid = defline[1:].split()[0]
observed_seqids.add(seqid)

regions = seedmatches.get_spans(seqid, maxdiff)
if regions is None:
continue

for start, end in regions:
newstart = max(start - delta, 0)
newend = min(end + delta, len(sequence))
subseqid = '{}_{}-{}'.format(seqid, newstart, newend)
subseq = sequence[newstart:newend]
yield subseqid, subseq

missing = [s for s in seedmatches.seqids if s not in observed_seqids]
if len(missing) > 0:
raise KevlarRefrSeqNotFoundError(','.join(missing))


def autoindex(refrfile, logstream=sys.stderr):
bwtfile = refrfile + '.bwt'
if os.path.isfile(bwtfile):
return

message = 'WARNING: BWA index not found for "{:s}"'.format(refrfile)
message += ', indexing now'
print('[kevlar::localize]', message, file=logstream)

try:
check_call(['bwa', 'index', refrfile])
except Exception as err: # pragma: no cover
raise KevlarBWAError('Could not run "bwa index"') from err
for seqid, pos in bwa_align(cmdargs, seqstring=kmers):
yield seqid, pos


def localize(contigstream, refrfile, seedsize=31, delta=25, maxdiff=10000,
logstream=sys.stderr):
"""
Wrap the `kevlar localize` task as a generator.
refrseqs=None, logstream=sys.stderr):
"""Wrap the `kevlar localize` task as a generator.
Input is an iterable containing contigs (assembled by `kevlar assemble`)
stored as khmer or screed sequence records, the filename of the reference
genome sequence, and the desired seed size.
"""
autoindex(refrfile, logstream)
seedmatches = SeedMatchSet(seedsize)
localizer = Localizer(seedsize, delta)
for seqid, pos in get_exact_matches(contigstream, refrfile, seedsize):
seedmatches.add(seqid, pos)
if len(seedmatches) == 0:
localizer.add_seed_match(seqid, pos)
if len(localizer) == 0:
message = 'WARNING: no reference matches'
print('[kevlar::localize]', message, file=logstream)
return
refrstream = kevlar.open(refrfile, 'r')
for subseqid, subseq in extract_regions(refrstream, seedmatches,
delta=delta, maxdiff=maxdiff):
yield khmer.Read(name=subseqid, sequence=subseq)
if not refrseqs:
refrstream = kevlar.open(refrfile, 'r')
seqs = kevlar.seqio.parse_seq_dict(refrstream)
for cutout in localizer.get_cutouts(refrseqs=seqs, clusterdist=maxdiff):
yield khmer.Read(name=cutout.defline, sequence=cutout.sequence)


def main(args):
Expand Down
133 changes: 133 additions & 0 deletions kevlar/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2018 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import os.path
import re
from subprocess import Popen, PIPE, check_call
import sys
from tempfile import TemporaryFile

import khmer
import kevlar
import pysam
import screed


class KevlarBWAError(RuntimeError):
"""Raised if a delegated BWA call fails for any reason."""
pass


class KevlarInvalidCutoutDeflineError(ValueError):
pass


class KevlarDeflineSequenceLengthMismatchError(RuntimeError):
pass


def autoindex(refrfile, logstream=sys.stderr):
if not os.path.isfile(refrfile):
message = 'reference file {:s} does not exist'.format(refrfile)
raise KevlarBWAError(message)

bwtfile = refrfile + '.bwt'
if os.path.isfile(bwtfile):
return

message = 'WARNING: BWA index not found for "{:s}"'.format(refrfile)
message += ', indexing now'
print('[kevlar::reference]', message, file=logstream)

try:
check_call(['bwa', 'index', refrfile])
except Exception as err: # pragma: no cover
raise KevlarBWAError('Could not run "bwa index"') from err


def bwa_align(cmdargs, seqstring=None):
with TemporaryFile() as samfile:
bwaproc = Popen(cmdargs, stdin=PIPE, stdout=samfile, stderr=PIPE,
universal_newlines=True)
if seqstring:
stdout, stderr = bwaproc.communicate(input=seqstring)
else:
stdout, stderr = bwaproc.communicate()
if bwaproc.returncode != 0: # pragma: no cover
print(stderr, file=sys.stderr)
raise KevlarBWAError('problem running BWA')
samfile.seek(0)
sam = pysam.AlignmentFile(samfile, 'r')
for record in sam:
if record.is_unmapped:
continue
seqid = sam.get_reference_name(record.reference_id)
yield seqid, record.pos


class ReferenceCutout(object):
"""An interval of the reference genome matched by a variant contig.
Kevlar identifies, filters, and partitions reads containing novel k-mers,
and then assembles these reads into contigs representing putative variants.
This class is for handling small regions of the reference genome that have
similarity to one of these contigs, to facilitate variant calling.
Reference cutouts are operationally defined as follows.
- decompose the contig into its constituent k-mers (seeds); note, the value
of k used here is not necessarily the same k that is used for variant
discovery
- find all perfect reference genome matches of all seeds
- sort each seed match by genomic position
- split two adjacent matches into distinct bins if they are on different
reference sequences (chromosomes) or if they are separated by more than X
nucleotides
- compute the smallest interval spanning all matches in a bin
- extend the interval by delta nucleotides and designate as a reference
cutout
"""
def __init__(self, defline=None, sequence=None):
self.defline = defline
self.sequence = sequence
self._seqid = None
self._startpos = None
self._endpos = None
if defline:
self.parse_defline(defline)

def __len__(self):
return self._endpos - self._startpos

def parse_defline(self, defline):
match = re.search('(\S+)_(\d+)-(\d+)', defline)
if not match:
raise KevlarInvalidCutoutDeflineError(defline)
self._seqid = match.group(1)
self._startpos = int(match.group(2))
self._endpos = int(match.group(3))
if not self.sequence:
return
if len(self) != len(self.sequence):
message = 'defline length: {:d}, sequence length: {:d}'.format(
len(self), len(self.sequence)
)
raise KevlarDeflineSequenceLengthMismatchError(message)

@property
def interval(self):
return self._seqid, self._startpos, self._endpos

def local_to_global(self, coordinate):
return self._startpos + coordinate


def load_refr_cutouts(instream):
for defline, sequence in kevlar.seqio.parse_fasta(instream):
yield ReferenceCutout(defline[1:], sequence)

0 comments on commit be6be75

Please sign in to comment.