dib-lab · luizirber · Feb 8, 2015 · Feb 1, 2015 · Feb 1, 2015 · Feb 1, 2015
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,9 @@
+2015-02-01  Titus Brown  <titus@idyll.org>
+
+   * khmer/_khmermodule.cc: added functions hash_find_all_tags_list and
+   hash_get_tags_and_positions to CountingHash objects.
+   * tests/test_counting_hash.py: added tests for new functionality.
+
 2015-01-25  Titus Brown  <titus@idyll.org>
 
    * sandbox/correct-errors.py: fixed sequence output so that quality

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
@@ -1326,6 +1326,91 @@ static PyObject * hash_consume_and_tag(PyObject * self, PyObject * args)
     return Py_BuildValue("K", n_consumed);
 }
 
+/*
+ * get_tags_and_positions(sequence) -> list of (position, tag) tuples
+ *
+ * Iterates over the k-mers of `sequence` (k = counting->ksize()) and
+ * reports each one that is present in counting->all_tags.  Positions are
+ * 1-based: the first k-mer produced by the iterator is position 1.
+ *
+ * Returns a new reference to a Python list, or NULL with an exception set:
+ *   - the argument is not a string (raised by PyArg_ParseTuple);
+ *   - a _khmer_signal was caught while iterating (reported as ValueError);
+ *   - building the result list or one of its tuples failed.
+ */
+static PyObject * hash_get_tags_and_positions(PyObject * self, PyObject * args)
+{
+    khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
+    CountingHash * counting = me->counting;
+
+    const char * seq;
+
+    if (!PyArg_ParseTuple(args, "s", &seq)) {
+        return NULL;
+    }
+
+    // Collect into C++ containers first so that no Python objects exist
+    // yet if the iteration raises a khmer signal.
+    std::vector<unsigned int> posns;
+    std::vector<HashIntoType> tags;
+    try {
+        unsigned int pos = 1;
+        KMerIterator kmers(seq, counting->ksize());
+
+        while (!kmers.done()) {
+            HashIntoType kmer = kmers.next();
+            if (set_contains(counting->all_tags, kmer)) {
+                posns.push_back(pos);
+                tags.push_back(kmer);
+            }
+            pos++;
+        }
+    } catch (_khmer_signal &e) {
+        PyErr_SetString(PyExc_ValueError, e.get_message().c_str());
+        return NULL;
+    }
+
+    PyObject * posns_list = PyList_New(posns.size());
+    if (posns_list == NULL) {
+        return NULL;
+    }
+
+    for (size_t i = 0; i < posns.size(); i++) {
+        // "I" = unsigned int position, "K" = unsigned long long tag
+        PyObject * tup = Py_BuildValue("IK", posns[i], tags[i]);
+        if (tup == NULL) {
+            // Releasing the partially filled list is safe: unfilled
+            // slots are still NULL and are skipped on deallocation.
+            Py_DECREF(posns_list);
+            return NULL;
+        }
+        // PyList_SET_ITEM steals the reference to tup.
+        PyList_SET_ITEM(posns_list, i, tup);
+    }
+
+    return posns_list;
+}
+
+/*
+ * find_all_tags_list(kmer) -> list of tags
+ *
+ * Hashes `kmer` with _hash() and asks counting->partition->find_all_tags()
+ * to collect, from counting->all_tags, the tags it finds starting from
+ * that k-mer.  The collected tags are returned as a Python list of
+ * integers (iteration order of the SeenSet).
+ *
+ * Returns a new reference, or NULL with an exception set:
+ *   - the argument is not a string (raised by PyArg_ParseTuple);
+ *   - ValueError if `kmer` is shorter than counting->ksize();
+ *   - building the result list or one of its items failed.
+ */
+static PyObject * hash_find_all_tags_list(PyObject * self, PyObject *args)
+{
+    khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
+    CountingHash * counting = me->counting;
+
+    const char * kmer_s = NULL;
+
+    if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
+        return NULL;
+    }
+
+    if (strlen(kmer_s) < counting->ksize()) {
+        PyErr_SetString( PyExc_ValueError,
+                         "starting kmer is smaller than the K size of the counting table");
+        return NULL;
+    }
+
+    SeenSet tags;
+
+    // The search touches no Python objects, so the GIL is released
+    // for its duration.
+    Py_BEGIN_ALLOW_THREADS
+
+    HashIntoType kmer_f, kmer_r;
+    _hash(kmer_s, counting->ksize(), kmer_f, kmer_r);
+
+    counting->partition->find_all_tags(kmer_f, kmer_r, tags,
+                                       counting->all_tags);
+
+    Py_END_ALLOW_THREADS
+
+    PyObject * x = PyList_New(tags.size());
+    if (x == NULL) {
+        return NULL;
+    }
+
+    Py_ssize_t i = 0;
+    for (SeenSet::iterator si = tags.begin(); si != tags.end(); ++si, ++i) {
+        // type K for python unsigned long long
+        PyObject * tag = Py_BuildValue("K", *si);
+        if (tag == NULL) {
+            // Releasing the partially filled list is safe: unfilled
+            // slots are still NULL and are skipped on deallocation.
+            Py_DECREF(x);
+            return NULL;
+        }
+        // PyList_SET_ITEM steals the reference to tag.
+        PyList_SET_ITEM(x, i, tag);
+    }
+
+    return x;
+}
+
 static PyObject * hash_consume_fasta_and_tag(PyObject * self, PyObject * args)
 {
     khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
@@ -1485,6 +1570,8 @@ static PyMethodDef khmer_counting_methods[] = {
         METH_VARARGS, ""
     },
     { "consume_and_tag", hash_consume_and_tag, METH_VARARGS, "Consume a sequence and tag it" },
+    { "get_tags_and_positions", hash_get_tags_and_positions, METH_VARARGS, "Retrieve tags and their positions in a sequence." },
+    { "find_all_tags_list", hash_find_all_tags_list, METH_VARARGS, "Find all tags within range of the given k-mer, return as list" },
     { "consume_fasta_and_tag", hash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
     { "do_subset_partition_with_abundance", hash_do_subset_partition_with_abundance, METH_VARARGS, "" },
     { "find_all_tags_truncate_on_abundance", hash_find_all_tags_truncate_on_abundance, METH_VARARGS, "" },
@@ -2479,7 +2566,7 @@ static PyObject * hashbits_find_all_tags(PyObject * self, PyObject *args)
         return NULL;
     }
 
-    if (strlen(kmer_s) < hashbits->ksize()) { // @@
+    if (strlen(kmer_s) < hashbits->ksize()) {
         PyErr_SetString( PyExc_ValueError,
                          "starting kmer is smaller than the K size of the hashbits");
         return NULL;

diff --git a/tests/test_counting_hash.py b/tests/test_counting_hash.py
@@ -10,6 +10,7 @@
 import khmer
 import khmer_tst_utils as utils
 from khmer import ReadParser
+import screed
 
 from nose.plugins.attrib import attr
 
@@ -965,3 +966,50 @@ def test_consume_fasta_and_tag():
     except TypeError as err:
         print str(err)
     countingtable.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa"))
+
+
+def test_consume_and_retrieve_tags_1():
+    ct = khmer.new_counting_hash(4, 4 ** 4, 4)
+
+    # first, for each sequence, build tags.
+    for record in screed.open(utils.get_test_data('test-graph2.fa')):
+        ct.consume_and_tag(record.sequence)
+
+    # check that all the tags in sequences are retrieved by iterating
+    # across the sequence and retrieving by neighborhood.
+
+    ss = set()
+    tt = set()
+    for record in screed.open(utils.get_test_data('test-graph2.fa')):
+        for p, tag in ct.get_tags_and_positions(record.sequence):
+            ss.add(tag)
+
+        for start in range(len(record.sequence) - 20):
+            kmer = record.sequence[start:start + 21]
+            tt.update(ct.find_all_tags_list(kmer))
+
+    assert ss == tt
+
+
+def test_consume_and_retrieve_tags_empty():
+    ct = khmer.new_counting_hash(4, 4 ** 4, 4)
+
+    # load each sequence but do not build tags - everything should be empty.
+    for record in screed.open(utils.get_test_data('test-graph2.fa')):
+        ct.consume(record.sequence)
+
+    # check that all the tags in sequences are retrieved by iterating
+    # across the sequence and retrieving by neighborhood.
+
+    ss = set()
+    tt = set()
+    for record in screed.open(utils.get_test_data('test-graph2.fa')):
+        for p, tag in ct.get_tags_and_positions(record.sequence):
+            ss.add(tag)
+
+        for start in range(len(record.sequence) - 20):
+            kmer = record.sequence[start:start + 21]
+            tt.update(ct.find_all_tags_list(kmer))
+
+    assert not ss
+    assert not tt