Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added mechanisms for retrieving tags by position within a sequence #749

Merged
merged 5 commits into from Feb 8, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions ChangeLog
@@ -1,3 +1,9 @@
2015-02-01 Titus Brown <titus@idyll.org>

* khmer/_khmermodule.cc: added functions hash_find_all_tags_list and
hash_get_tags_and_positions to CountingHash objects.
* tests/test_counting_hash.py: added tests for new functionality.

2015-01-25 Titus Brown <titus@idyll.org>

* sandbox/correct-errors.py: fixed sequence output so that quality
Expand Down
89 changes: 88 additions & 1 deletion khmer/_khmermodule.cc
Expand Up @@ -1326,6 +1326,91 @@ static PyObject * hash_consume_and_tag(PyObject * self, PyObject * args)
return Py_BuildValue("K", n_consumed);
}

static PyObject * hash_get_tags_and_positions(PyObject * self, PyObject * args)
{
khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
CountingHash * counting = me->counting;

const char * seq;

if (!PyArg_ParseTuple(args, "s", &seq)) {
return NULL;
}

// call the C++ function, and trap signals => Python

std::vector<unsigned int> posns;
std::vector<HashIntoType> tags;
try {
unsigned int pos = 1;
KMerIterator kmers(seq, counting->ksize());

while (!kmers.done()) {
HashIntoType kmer = kmers.next();
if (set_contains(counting->all_tags, kmer)) {
posns.push_back(pos);
tags.push_back(kmer);
}
pos++;
}
} catch (_khmer_signal &e) {
PyErr_SetString(PyExc_ValueError, e.get_message().c_str());
return NULL;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1356-1357 lack testing coverage and won't be called ever anyhow

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed!

On Tue, Feb 10, 2015 at 11:16:55AM -0800, Michael R. Crusoe wrote:

  • try {
  •    unsigned int pos = 1;
    
  •    KMerIterator kmers(seq, counting->ksize());
    
  •    while (!kmers.done()) {
    
  •        HashIntoType kmer = kmers.next();
    
  •        if (set_contains(counting->all_tags, kmer)) {
    
  •             posns.push_back(pos);
    
  •             tags.push_back(kmer);
    
  •        }
    
  •        pos++;
    
  •    }
    
  • } catch (_khmer_signal &e) {
  •    PyErr_SetString(PyExc_ValueError, e.get_message().c_str());
    
  •    return NULL;
    
  • }

1356-1357 lack testing coverage and won't be called ever anyhow


Reply to this email directly or view it on GitHub:

https://github.com/ged-lab/khmer/pull/749/files#r24441205

C. Titus Brown, ctbrown@ucdavis.edu

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks; making new pull request


PyObject * posns_list = PyList_New(posns.size());
for (size_t i = 0; i < posns.size(); i++) {
PyObject * tup = Py_BuildValue("IK", posns[i], tags[i]);
PyList_SET_ITEM(posns_list, i, tup);
}

return posns_list;
}

static PyObject * hash_find_all_tags_list(PyObject * self, PyObject *args)
{
khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
CountingHash * counting = me->counting;

const char * kmer_s = NULL;

if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
return NULL;
}

if (strlen(kmer_s) < counting->ksize()) {
PyErr_SetString( PyExc_ValueError,
"starting kmer is smaller than the K size of the counting table");
return NULL;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code branch is untested

}

SeenSet tags;

Py_BEGIN_ALLOW_THREADS

HashIntoType kmer_f, kmer_r;
_hash(kmer_s, counting->ksize(), kmer_f, kmer_r);

counting->partition->find_all_tags(kmer_f, kmer_r, tags,
counting->all_tags);

Py_END_ALLOW_THREADS

PyObject * x = PyList_New(tags.size());
if (x == NULL) {
return NULL;
}
SeenSet::iterator si;
unsigned long long i = 0;
for (si = tags.begin(); si != tags.end(); ++si) {
// type K for python unsigned long long
PyList_SET_ITEM(x, i, Py_BuildValue("K", *si));
i++;
}

return x;
}

static PyObject * hash_consume_fasta_and_tag(PyObject * self, PyObject * args)
{
khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
Expand Down Expand Up @@ -1485,6 +1570,8 @@ static PyMethodDef khmer_counting_methods[] = {
METH_VARARGS, ""
},
{ "consume_and_tag", hash_consume_and_tag, METH_VARARGS, "Consume a sequence and tag it" },
{ "get_tags_and_positions", hash_get_tags_and_positions, METH_VARARGS, "Retrieve tags and their positions in a sequence." },
{ "find_all_tags_list", hash_find_all_tags_list, METH_VARARGS, "Find all tags within range of the given k-mer, return as list" },
{ "consume_fasta_and_tag", hash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
{ "do_subset_partition_with_abundance", hash_do_subset_partition_with_abundance, METH_VARARGS, "" },
{ "find_all_tags_truncate_on_abundance", hash_find_all_tags_truncate_on_abundance, METH_VARARGS, "" },
Expand Down Expand Up @@ -2479,7 +2566,7 @@ static PyObject * hashbits_find_all_tags(PyObject * self, PyObject *args)
return NULL;
}

if (strlen(kmer_s) < hashbits->ksize()) { // @@
if (strlen(kmer_s) < hashbits->ksize()) {
PyErr_SetString( PyExc_ValueError,
"starting kmer is smaller than the K size of the hashbits");
return NULL;
Expand Down
48 changes: 48 additions & 0 deletions tests/test_counting_hash.py
Expand Up @@ -10,6 +10,7 @@
import khmer
import khmer_tst_utils as utils
from khmer import ReadParser
import screed

from nose.plugins.attrib import attr

Expand Down Expand Up @@ -965,3 +966,50 @@ def test_consume_fasta_and_tag():
except TypeError as err:
print str(err)
countingtable.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa"))


def test_consume_and_retrieve_tags_1():
ct = khmer.new_counting_hash(4, 4 ** 4, 4)

# first, for each sequence, build tags.
for record in screed.open(utils.get_test_data('test-graph2.fa')):
ct.consume_and_tag(record.sequence)

# check that all the tags in sequences are retrieved by iterating
# across the sequence and retrieving by neighborhood.

ss = set()
tt = set()
for record in screed.open(utils.get_test_data('test-graph2.fa')):
for p, tag in ct.get_tags_and_positions(record.sequence):
ss.add(tag)

for start in range(len(record.sequence) - 20):
kmer = record.sequence[start:start + 21]
tt.update(ct.find_all_tags_list(kmer))

assert ss == tt


def test_consume_and_retrieve_tags_empty():
ct = khmer.new_counting_hash(4, 4 ** 4, 4)

# load each sequence but do not build tags - everything should be empty.
for record in screed.open(utils.get_test_data('test-graph2.fa')):
ct.consume(record.sequence)

# check that all the tags in sequences are retrieved by iterating
# across the sequence and retrieving by neighborhood.

ss = set()
tt = set()
for record in screed.open(utils.get_test_data('test-graph2.fa')):
for p, tag in ct.get_tags_and_positions(record.sequence):
ss.add(tag)

for start in range(len(record.sequence) - 20):
kmer = record.sequence[start:start + 21]
tt.update(ct.find_all_tags_list(kmer))

assert not ss
assert not tt