Skip to content

Commit

Permalink
Merge pull request #210 from dib-lab/feature/pairing
Browse files Browse the repository at this point in the history
Retain mates of paired-end novel reads
  • Loading branch information
standage committed Feb 23, 2018
2 parents be6be75 + 2b0b3e8 commit d11fa78
Show file tree
Hide file tree
Showing 26 changed files with 37,222 additions and 34,354 deletions.
34 changes: 34 additions & 0 deletions kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@
from os import makedirs
from os.path import dirname
import re
from subprocess import Popen, PIPE, check_call
import sys
from tempfile import TemporaryFile

# Third-party libraries
import khmer
from khmer.utils import broken_paired_reader
import pysam
import screed

# Internal modules
Expand Down Expand Up @@ -116,6 +120,16 @@ def multi_file_iter_khmer(filenames):
yield record


def paired_reader(readstream):
i = 0
for n, ispaired, read1, read2 in broken_paired_reader(readstream):
i += 1
yield i, read1, read2
if ispaired:
i += 1
yield i, read2, read1


def clean_subseqs(sequence, ksize):
for subseq in re.split('[^ACGT]', sequence):
if len(subseq) >= ksize:
Expand All @@ -139,4 +153,24 @@ def vcf_header(outstream, version='4.2', source='kevlar', infoheader=False):
sep='\t', file=outstream)


def bwa_align(cmd, seqstring=None):
with TemporaryFile() as samfile:
bwaproc = Popen(cmd, stdin=PIPE, stdout=samfile, stderr=PIPE,
universal_newlines=True)
if seqstring:
stdout, stderr = bwaproc.communicate(input=seqstring)
else:
stdout, stderr = bwaproc.communicate()
if bwaproc.returncode != 0: # pragma: no cover
print(stderr, file=sys.stderr)
raise KevlarBWAError('problem running BWA')
samfile.seek(0)
sam = pysam.AlignmentFile(samfile, 'r')
for record in sam:
if record.is_unmapped:
continue
seqid = sam.get_reference_name(record.reference_id)
yield seqid, record.pos


KmerOfInterest = namedtuple('KmerOfInterest', 'sequence offset abund')
10 changes: 1 addition & 9 deletions kevlar/alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,10 @@
import sys
import kevlar
from kevlar.assemble import assemble_greedy, assemble_fml_asm
from kevlar.augment import augment
from kevlar.localize import localize
from kevlar.call import call


def augment_and_mark(augseqs, nakedseqs):
for seq in augment(augseqs, nakedseqs):
seq.name = '{:s} nikmers={:d}'.format(seq.name, len(seq.ikmers))
yield seq


def alac(pstream, refrfile, ksize=31, bigpart=10000, delta=25, seedsize=31,
maxdiff=10000, match=1, mismatch=2, gapopen=5, gapextend=0,
greedy=False, min_ikmers=None, logstream=sys.stderr):
Expand All @@ -34,7 +27,6 @@ def alac(pstream, refrfile, ksize=31, bigpart=10000, delta=25, seedsize=31,

# Assemble partitioned reads into contig(s)
contigs = list(assembler(reads, logstream=logstream))
contigs = list(augment_and_mark(reads, contigs))
if min_ikmers is not None:
# Apply min ikmer filter if it's set
contigs = [c for c in contigs if len(c.ikmers) >= min_ikmers]
Expand All @@ -50,7 +42,7 @@ def alac(pstream, refrfile, ksize=31, bigpart=10000, delta=25, seedsize=31,

# Align contigs to genomic targets to make variant calls
caller = call(targets, contigs, match, mismatch, gapopen, gapextend,
ksize)
ksize, refrfile)
for varcall in caller:
yield varcall

Expand Down
4 changes: 2 additions & 2 deletions kevlar/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ def assemble_fml_asm(readstream, logstream=sys.stderr):
for n, contig in enumerate(assembler, 1):
name = 'contig{:d}'.format(n)
record = screed.Record(name=name, sequence=contig)
yield record
yield next(kevlar.augment.augment(reads, [record]))


def main_fml_asm(args):
reads = kevlar.parse_augmented_fastx(kevlar.open(args.augfastq, 'r'))
outstream = kevlar.open(args.out, 'w')
for contig in assemble_fml_asm(reads):
khmer.utils.write_record(contig, outstream)
kevlar.print_augmented_fastx(contig, outstream)


# =============================================================================
Expand Down
8 changes: 6 additions & 2 deletions kevlar/augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@ def augment(augseqstream, nakedseqstream):
"""
ksize = None
ikmers = dict()
mateseqs = set()
for record in augseqstream:
for ikmer in record.ikmers:
ikmers[ikmer.sequence] = ikmer.abund
ikmers[kevlar.revcom(ikmer.sequence)] = ikmer.abund
ksize = len(ikmer.sequence)
mateseqs.update(record.mateseqs)
mateseqs = sorted(mateseqs)

for record in nakedseqstream:
newikmers = list()
Expand All @@ -40,11 +43,12 @@ def augment(augseqstream, nakedseqstream):
if hasattr(record, 'quality'):
newrecord = screed.Record(
name=record.name, sequence=record.sequence, ikmers=newikmers,
quality=record.quality
quality=record.quality, mateseqs=mateseqs
)
else:
newrecord = screed.Record(
name=record.name, sequence=record.sequence, ikmers=newikmers
name=record.name, sequence=record.sequence, ikmers=newikmers,
mateseqs=mateseqs
)
yield newrecord

Expand Down

0 comments on commit d11fa78

Please sign in to comment.