-
Notifications
You must be signed in to change notification settings - Fork 295
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG] Move HLLCounter into the glorious Cython future #1730
Changes from all commits
ed41ad3
52eac78
80d0b4f
fca8e36
f6977cf
c993318
6850b3d
9f45b2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,47 @@ | ||
from libcpp cimport bool | ||
from libcpp.memory cimport unique_ptr | ||
from libcpp.sting cimport string | ||
from libcpp.string cimport string | ||
from libcpp.vector cimport vector | ||
from libc.stdint cimport uint64_t | ||
|
||
from oxli_types cimport * | ||
from parsing cimport CpReadParser | ||
|
||
|
||
# Handler named in the `except +` clauses below: it converts C++ exceptions
# raised by the wrapped library into Python exceptions.
cdef extern from "oxli_exception_convert.hh":
    cdef void oxli_raise_py_error()


# C++ interface of oxli::HLLCounter as seen from Cython.  Calls marked
# `except +oxli_raise_py_error` may throw on the C++ side.
cdef extern from "oxli/hllcounter.hh" namespace "oxli":
    cdef cppclass CpHLLCounter "oxli::HLLCounter":
        CpHLLCounter(double, WordLength) except +oxli_raise_py_error
        CpHLLCounter(int, WordLength) except +oxli_raise_py_error

        void add(const string &)
        unsigned int consume_string(const string &)
        void consume_seqfile[SeqIO](const string &,
                                    bool,
                                    unsigned int &,
                                    uint64_t &) except +oxli_raise_py_error

        # The parser-based overload is commented out in this revision.
        # NOTE(review): a reviewer asked whether the approach in
        # https://github.com/dib-lab/khmer/pull/1675/files#diff-91092b4a16651bcd34d40d87061f3845R175
        # would help with not having to comment this out.
        # void consume_seqfile[SeqIO](unique_ptr[CpReadParser[SeqIO]]&,
        #                             bool,
        #                             unsigned int &,
        #                             unsigned long long &)

        unsigned int check_and_process_read(string &, bool &)
        bool check_and_normalize_read(string &) const
        uint64_t estimate_cardinality()
        void merge(CpHLLCounter &) except +oxli_raise_py_error
        double get_alpha()
        int get_p()
        int get_m()
        void set_ksize(WordLength) except +oxli_raise_py_error
        int get_ksize()
        vector[int] get_M()
        double get_erate()
        void set_erate(double) except +oxli_raise_py_error


# Python-visible extension type; owns the wrapped C++ counter.
cdef class HLLCounter:
    cdef unique_ptr[CpHLLCounter] _this
    cpdef tuple consume_seqfile(self, filename, bool stream_records=*)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
# cython: c_string_type=unicode, c_string_encoding=utf8 | ||
from cython.operator cimport dereference as deref, address | ||
|
||
from parsing cimport CpFastxReader | ||
from .utils cimport _bstring, _ustring | ||
|
||
cdef class HLLCounter:
    """HyperLogLog counter.

    A HyperLogLog counter is a probabilistic data structure specialized for
    cardinality estimation.
    There is a precision/memory consumption trade-off: error rate determines
    how much memory is consumed.

    # Creating a new HLLCounter:

    >>> khmer.HLLCounter(error_rate, ksize)

    where the default values are:
      - error_rate: 0.01
      - ksize: 20
    """

    def __cinit__(self, double error_rate=0.01, int ksize=20):
        # The unique_ptr takes ownership of the freshly allocated C++ counter.
        self._this.reset(new CpHLLCounter(error_rate, ksize))

    def __len__(self):
        """Return the cardinality estimate."""
        return self.estimate_cardinality()

    def add(self, kmer):
        """Add a k-mer to the counter."""
        deref(self._this).add(_bstring(kmer))

    def estimate_cardinality(self):
        """Return the current cardinality estimate."""
        return deref(self._this).estimate_cardinality()

    def consume_string(self, seq):
        """Break a sequence into k-mers and add each k-mer to the counter."""
        return deref(self._this).consume_string(_bstring(seq))

    cpdef tuple consume_seqfile(self, filename, bool stream_records=False):
        """Read sequences from file, break into k-mers, and add each k-mer
        to the counter.

        If the optional keyword 'stream_records' is True, also prints each
        sequence to stdout.

        Returns a ``(total_reads, n_consumed)`` tuple.
        """
        # Out-parameters filled in by the C++ call below.
        # NOTE(review): the .pxd declares the last out-parameter as
        # `uint64_t &`; confirm that matches `unsigned long long` on every
        # target platform.
        cdef unsigned long long n_consumed = 0
        cdef unsigned int total_reads = 0

        deref(self._this).consume_seqfile[CpFastxReader](
            _bstring(filename), stream_records,
            total_reads, n_consumed)

        return total_reads, n_consumed

    def merge(self, HLLCounter other):
        """Merge other counter into this one."""
        deref(self._this).merge(deref(other._this))

    @property
    def alpha(self):
        """alpha constant for this HLL counter."""
        return deref(self._this).get_alpha()

    @property
    def error_rate(self):
        """Error rate for this HLL counter.

        Can be changed prior to first counting, but becomes read-only after
        that (raising AttributeError).
        """
        return deref(self._this).get_erate()

    @error_rate.setter
    def error_rate(self, erate):
        deref(self._this).set_erate(float(erate))

    # Review note from the author: this deleter was added to reproduce the
    # behavior of the previous version; nobody is expected to delete a
    # property of HLLCounter in practice.
    @error_rate.deleter
    def error_rate(self):
        raise TypeError("Cannot delete attribute")

    # NOTE(review): Counttable exposes ksize as a method that must be called,
    # whereas for HLL it has always been a property.  Reviewers raised the
    # question of switching all of them over for consistency.
    @property
    def ksize(self):
        """k-mer size for this HLL counter.

        Can be changed prior to first counting, but becomes read-only after
        that (raising AttributeError).
        """
        return deref(self._this).get_ksize()

    @ksize.setter
    def ksize(self, object new_k):
        # Range is checked before type, so a non-positive float is reported
        # as a ValueError rather than a TypeError.
        if new_k <= 0:
            raise ValueError("Please set k-mer size to a value greater "
                             "than zero")
        if isinstance(new_k, float):
            raise TypeError("Please use an integer value for k-mer size")
        deref(self._this).set_ksize(<int>new_k)

    @ksize.deleter
    def ksize(self):
        raise TypeError("Cannot delete attribute")

    @property
    def counters(self):
        """Read-only internal counters."""
        return deref(self._this).get_M()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#include "Python.h" | ||
#include <exception> | ||
#include <string> | ||
#include "oxli/oxli_exception.hh" | ||
#include "oxli_exception_convert.hh" | ||
|
||
|
||
void oxli_raise_py_error() | ||
{ | ||
try { | ||
throw; | ||
} | ||
catch (oxli::ReadOnlyAttribute& e) { | ||
PyErr_SetString(PyExc_AttributeError, e.what()); | ||
} | ||
catch (oxli::InvalidValue& e) { | ||
PyErr_SetString(PyExc_ValueError, e.what()); | ||
} | ||
catch (oxli::InvalidStream& e) { | ||
PyErr_SetString(PyExc_OSError, e.what()); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
#ifndef OXLI_EXCEPTION_CONVERT_HH
#define OXLI_EXCEPTION_CONVERT_HH

// Converts the C++ exception currently being handled into the appropriate
// Python exception.  Call it only from within a catch block.
void oxli_raise_py_error();

#endif // OXLI_EXCEPTION_CONVERT_HH
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This allows capturing oxli exceptions in C++ and converting them to the appropriate Python exceptions. (But maybe we can use a shorter name...)