Skip to content

Commit

Permalink
Merge pull request #71 from dib-lab/feature/sketch_load
Browse files Browse the repository at this point in the history
New convenience functions, and sketch autoload. Closes #67.
  • Loading branch information
standage committed May 11, 2017
2 parents 7ee0e72 + 09fd954 commit f6ae65a
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 0 deletions.
82 changes: 82 additions & 0 deletions kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from kevlar.variantset import VariantSet
from kevlar.timer import Timer
from gzip import open as gzopen
import khmer
import screed

from kevlar._version import get_versions
Expand All @@ -46,6 +47,43 @@ def open(filename, mode):
return openfunc(filename, mode)


def sketch_autoload(infile, count=True, graph=False,
ksize=31, table_size=1e4, num_tables=4,
num_bands=0, band=0):
"""
Use file extension to conditionally load sketch into memory.
If the file extension is one of the following, treat the file as a sketch
that has been written to disk and load it with `kevlar.load_sketch`.
Sketch attributes such as ksize, table size, and number of tables will
be set automatically.
- `.ct` or `.counttable`: `Counttable`
- `.nt` or `.nodetable`: `Nodetable`
- `.cg` or `.countgraph`: `Countgraph`
- `.ng` or `.nodegraph`: `Nodegraph`
Otherwise, a sketch will be created using the specified arguments and the
input file will be treated as a Fasta/Fastq file to be loaded with
`.consume_seqfile` or `.consume_seqfile_banding`.
"""
sketch_extensions = (
'.ct', '.counttable', '.nt', '.nodetable',
'.cg', '.countgraph', '.ng', '.nodegraph',
)

if infile.endswith(sketch_extensions):
return load_sketch(infile, count=count, graph=graph, smallcount=False)
else:
sketch = allocate_sketch(ksize, table_size, num_tables, count=count,
graph=graph, smallcount=False)
if num_bands > 1:
sketch.consume_seqfile_banding(infile, num_bands, band)
else:
sketch.consume_seqfile(infile)
return sketch


def calc_fpr(table):
"""Stolen shamelessly from khmer/__init__.py"""
sizes = table.hashsizes()
Expand Down Expand Up @@ -73,4 +111,48 @@ def same_seq(seq1, seq2, seq2revcom=None):
return seq1 == seq2 or seq1 == seq2revcom


def load_sketch(filename, count=False, graph=False, smallcount=False):
"""Convenience function for loading a sketch from the specified file."""
if count and graph:
if smallcount:
createfunc = khmer._SmallCountgraph
else:
createfunc = khmer._Countgraph
elif count and not graph:
if smallcount:
createfunc = khmer._SmallCounttable
else:
createfunc = khmer._Counttable
elif not count and graph:
createfunc = khmer._Nodegraph
elif not count and not graph:
createfunc = khmer._Nodetable

sketch = createfunc(1, [1])
sketch.load(filename)
return sketch


def allocate_sketch(ksize, target_tablesize, num_tables=4, count=False,
graph=False, smallcount=False):
"""Convenience function for allocating memory for a new sketch."""
if count and graph:
if smallcount:
createfunc = khmer.SmallCountgraph
else:
createfunc = khmer.Countgraph
elif count and not graph:
if smallcount:
createfunc = khmer.SmallCounttable
else:
createfunc = khmer.Counttable
elif not count and graph:
createfunc = khmer.Nodegraph
elif not count and not graph:
createfunc = khmer.Nodetable

sketch = createfunc(ksize, target_tablesize, num_tables)
return sketch


KmerOfInterest = namedtuple('KmerOfInterest', 'sequence offset abund')
Binary file added kevlar/tests/data/test.countgraph
Binary file not shown.
Binary file added kevlar/tests/data/test.counttable
Binary file not shown.
Binary file added kevlar/tests/data/test.nodegraph
Binary file not shown.
Binary file added kevlar/tests/data/test.nodetable
Binary file not shown.
Binary file added kevlar/tests/data/test.smallcountgraph
Binary file not shown.
Binary file added kevlar/tests/data/test.smallcounttable
Binary file not shown.
61 changes: 61 additions & 0 deletions kevlar/tests/test_various.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2017 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/standage/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import pytest
import khmer
import kevlar


@pytest.mark.parametrize('filename,count,graph,smallcount,testkmer', [
('test.countgraph', True, True, False, 'TGGAACCGGCAACGACGAAAA'),
('test.smallcountgraph', True, True, True, 'CTGTACTACAGCTACTACAGT'),
('test.counttable', True, False, False, 'CCTGATATCCGGAATCTTAGC'),
('test.smallcounttable', True, False, True, 'GGGCCCCCATCTCTATCTTGC'),
('test.nodegraph', False, True, False, 'GGGAACTTACCTGGGGGTGCG'),
('test.nodetable', False, False, False, 'CTGTTCGATATGAGGAATCTG'),
])
def test_load_sketch(filename, count, graph, smallcount, testkmer):
infile = kevlar.tests.data_file(filename)
sketch = kevlar.load_sketch(infile, count, graph, smallcount)
assert sketch.get(testkmer) > 0
assert sketch.get('GATTACA' * 3) == 0


@pytest.mark.parametrize('count,smallcount', [
(True, True),
(True, False),
(False, False),
])
def test_allocate_sketch_graphy(count, smallcount):
sequence = 'AATCAACGCTTCTTAATAGGCATAGTGTCTCTGCTGCGCATGGACGTGCCATAGCCACTACT'
kmer = 'GCATAGTGTCTCTGCTGCGCA'

sketch = kevlar.allocate_sketch(21, 1e4, 4, count, True, smallcount)
sketch.consume(sequence)
sketch.get(kmer) == 1
kmer_hash = sketch.hash(kmer)
assert kevlar.same_seq(sketch.reverse_hash(kmer_hash), kmer)


@pytest.mark.parametrize('count,smallcount', [
(True, True),
(True, False),
(False, False),
])
def test_allocate_sketch_non_graphy(count, smallcount):
sequence = 'TGCCACGATCCGGCTATGGCGGAAGGGCACACCTAACCGCGATGACGGAGTAACTCGCAGCA'
kmer = 'CTATGGCGGAAGGGCACACCTAACCGCGATGACGG'

sketch = kevlar.allocate_sketch(35, 1e4, 4, count, False, smallcount)
sketch.consume(sequence)
sketch.get(kmer) == 1
kmer_hash = sketch.hash(kmer)
with pytest.raises(ValueError) as ve:
_ = sketch.reverse_hash(kmer_hash)
assert 'not implemented' in str(ve)

0 comments on commit f6ae65a

Please sign in to comment.