Skip to content

Commit

Permalink
New partition filtering strategy (#283)
Browse files Browse the repository at this point in the history
The main motivation behind this PR was to discard unassemblable partitions *at the partitioning stage* to avoid wasting a lot of compute time trying to assemble many small poorly connected read sets. The intuition is that if it's a partition with a small number of reads and a small number of compatible overlaps, it can safely be discarded.

It turns out this intuition doesn't really pan out. There is no clear separation between assemblable and unassemblable partitions in terms of the number of edges in the read graph, the number of strict-mode edges in the read graph, or the ratio of the two.

So I've disabled the proposed filtering strategy, but left it commented out in the code for expository purposes. The refactoring in support of this filtering strategy was also valuable and has been retained.
  • Loading branch information
standage committed Oct 18, 2018
1 parent c602954 commit 3b45276
Show file tree
Hide file tree
Showing 10 changed files with 2,602 additions and 1,068 deletions.
6 changes: 2 additions & 4 deletions kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@
from kevlar import cigar
from kevlar import varmap
from kevlar import vcf
from kevlar.readpair import ReadPair
from kevlar.readgraph import ReadGraph
from kevlar.mutablestring import MutableString
from kevlar.sequence import parse_augmented_fastx, print_augmented_fastx
from kevlar.sequence import revcom
from kevlar.seqio import parse_partitioned_reads, parse_single_partition
from kevlar.timer import Timer

Expand Down Expand Up @@ -86,10 +88,6 @@ def mkdirp(path, trim=False):
return outdir


def revcom(seq):
return screed.dna.reverse_complement(str(seq))


def revcommin(seq):
rc = revcom(seq)
minseq = sorted((seq, rc))[0]
Expand Down
18 changes: 14 additions & 4 deletions kevlar/readgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ def check_edge(self, pair, minkmer):
else:
self.add_edge(tailname, headname, offset=pair.offset,
overlap=pair.overlap, ikmers=set([minkmer]),
orient=pair.sameorient, tail=tailname,
swapped=pair.swapped)
orient=pair.sameorient, tail=tailname)

def populate_edges(self, strict=False):
"""
Expand All @@ -113,10 +112,12 @@ def populate_edges(self, strict=False):
readset = self.ikmers[kmer]
for read1, read2 in itertools.combinations(readset, 2):
if strict:
if read1 in self and read2 in self[read1]:
continue
record1 = self.get_record(read1)
record2 = self.get_record(read2)
pair = kevlar.overlap.calc_offset(record1, record2, kmer)
if pair is kevlar.overlap.INCOMPATIBLE_PAIR:
pair = kevlar.ReadPair(record1, record2, kmer)
if pair.incompatible:
# Shared k-mer but bad overlap
continue
self.check_edge(pair, kmer)
Expand Down Expand Up @@ -146,6 +147,15 @@ def partitions(self, dedup=True, minabund=None, maxabund=None,
if abundfilt:
if minabund and partition.number_of_nodes() < minabund:
continue # Skip partitions that are too small
# # Ill-advised strategy for pre-emptively discarding
# # unassemblable partitions. It turns out that number of edges
# # in the read graph (even in strict mode) doesn't really
# # distinguish between what assembles and what doesn't.
# if partition.number_of_nodes() < 10:
# partition.populate_edges(strict=True)
# nedges = partition.number_of_edges()
# if minabund and nedges < minabund:
# continue
yield partition
else:
yield cc
179 changes: 179 additions & 0 deletions kevlar/readpair.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2018 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from collections import namedtuple
import sys
import kevlar
from kevlar.sequence import Record, KmerOfInterest


class ReadWithKmer(object):
def __init__(self, read, kmerseq):
self.read = read
self.kmer = read.ikmers[kmerseq]
self.kmerseq = self.read.ikmerseq(self.kmer) if self.kmer else None
self.num_occurrences = (
self.read.sequence.count(kmerseq) +
self.read.sequence.count(kevlar.revcom(kmerseq))
)

def __len__(self):
return len(self.read)

@property
def offset(self):
return self.kmer.offset

@property
def name(self):
return self.read.name

def revcom(self):
seq = kevlar.revcom(self.read.sequence)
kmerseqrc = kevlar.revcom(self.kmerseq)
newoffset = len(seq) - self.kmer.offset - self.kmer.ksize
kmer = KmerOfInterest(self.kmer.ksize, newoffset, self.kmer.abund)
kdict = {self.kmerseq: kmer, kmerseqrc: kmer}
newread = Record(self.read.name, seq, annotations=[kmer], ikmers=kdict)
return ReadWithKmer(newread, self.kmerseq)


class ReadPair(object):
"""Class for handling overlapping pairs of reads.
Each read pair is anchored by 1 share k-mer at a time. These shared k-mers
are used to compute relative orientation, offset, sequence compatibility/
identity, and to "assemble" (merge) the two sequences.
"""
def __init__(self, read1, read2, sharedkmer):
self._r1 = ReadWithKmer(read1, sharedkmer)
self._r2 = ReadWithKmer(read2, sharedkmer)
self._r1rc = self._r1.revcom()
self._r2rc = self._r2.revcom()
self._seedkmer = sharedkmer
self._merged = None

self.head = None
self.tail = None
self.headkmer = None
self.tailkmer = None
self.overlap = None
self.sameorient = None

self.validate()

def __str__(self):
return '{tailseq}\n{koffset}{match}\n{offset}{headseq}'.format(
tailseq=self.tail.read.sequence, koffset=' ' * self.tail.offset,
match='|' * self.tail.kmer.ksize, offset=' ' * self.offset,
headseq=self.head.read.sequence
)

@property
def incompatible(self):
return self._merged is None

def assign_by_largest_kmer_offset(self):
"""Assign head and tail based on largest k-mer offset."""
# Enumerate all possible arragements of the read pair based on the
# relative orientation of the reads.
if self.sameorient:
arrangements = [(self._r1, self._r2), (self._r1rc, self._r2rc)]
else:
arrangements = [(self._r1, self._r2rc), (self._r1rc, self._r2)]

# If both arrangements have the same largest k-mer offset, no assigment
# can be made.
offsets = [
max([read.kmer.offset for read in arr]) for arr in arrangements
]
if len(set(offsets)) == 1:
return

# Determine the optimal arrangement and assign head and tail.
if offsets[0] > offsets[1]:
optimal_arrangement = arrangements[0]
else:
optimal_arrangement = arrangements[1]
self.tail = max(optimal_arrangement, key=lambda r: r.kmer.offset)
self.head = min(optimal_arrangement, key=lambda r: r.kmer.offset)

def assign_by_read_length(self):
if len(self._r1) == len(self._r2):
return
elif len(self._r1) > len(self._r2):
self.tail = self._r1
self.head = self._r2 if self.sameorient else self._r2rc
else:
self.tail = self._r2 if self.sameorient else self._r2rc
self.head = self._r1

def assign_by_read_name(self):
if self._r1.read.name < self._r2.read.name:
self.tail = self._r1
self.head = self._r2 if self.sameorient else self._r2rc
else:
self.tail = self._r2 if self.sameorient else self._r2rc
self.head = self._r1

def set_head_and_tail(self):
"""Determine which read is `head` and which is `tail`.
The `tail` read is situated to the left and always retains its
orientation. The `head` read is situated to the right and will be
reverse complemented if needed to match the `tail` read's orientation.
The criteria for determining heads and tails is as follows.
- the read with the higher k-mer offset is the tail; all possible
arrangements are considered
- if all arrangements have the same k-mer offset, the longer read is
the tail
- if the reads are of equal length, the read whose name/ID is
lexicographically smaller is the tail
"""
self.assign_by_largest_kmer_offset()
if self.tail is None:
self.assign_by_read_length()
if self.tail is None:
self.assign_by_read_name()
assert self.tail is not None

def calc_offset(self):
if self.tail.offset < self.head.offset:
self.head, self.tail = self.tail, self.head
self.offset = self.tail.offset - self.head.offset
self.overlap = len(self.tail) - self.offset

@property
def mergedseq(self):
return self._merged

def _merge(self):
tailseq = self.tail.read.sequence
headseq = self.head.read.sequence
if headseq in tailseq:
self._merged = tailseq
return
elif tailseq in headseq:
self._merged = tailseq
return

headindex = len(tailseq) - self.offset
headsuffix = headseq[headindex:]
tailprefix = tailseq[self.offset:self.offset+self.overlap]
if tailprefix == headseq[:headindex]:
self._merged = tailseq + headsuffix

def validate(self):
if self._r1.num_occurrences != 1 or self._r2.num_occurrences != 1:
return
self.sameorient = self._r1.kmerseq == self._r2.kmerseq
self.set_head_and_tail()
self.calc_offset()
self._merge()

0 comments on commit 3b45276

Please sign in to comment.