diff --git a/include/oxli/hashtable.hh b/include/oxli/hashtable.hh
index 97ce6ba461..7ef3e4c7cc 100644
--- a/include/oxli/hashtable.hh
+++ b/include/oxli/hashtable.hh
@@ -540,6 +540,148 @@ public:
     }
 };
 
+
+/**
+ * Iterates over the k-mers of a sequence, hashing each with _hash_fnv().
+ *
+ * Only the pointer is stored, so `seq` must be a NUL-terminated string
+ * that outlives the iterator.
+ */
+class FNVKmerHashIterator : public KmerHashIterator
+{
+    const char * _seq;
+    // Unsigned to match the constructor argument: plain `char` may be
+    // signed, so k > 127 would go negative and corrupt the arithmetic
+    // in done() and get_end_pos().
+    const unsigned char _ksize;
+    unsigned int index;     // offset of the next k-mer to hash
+    unsigned int length;    // strlen() of the sequence
+    bool _initialized;      // true once first() or next() has run
+public:
+    FNVKmerHashIterator(const char * seq, unsigned char k) :
+        _seq(seq), _ksize(k), index(0), length(strlen(seq)),
+        _initialized(false)
+    {
+    }
+
+    // Hash the k-mer at the current offset; delegates to next().
+    HashIntoType first()
+    {
+        _initialized = true;
+        return next();
+    }
+
+    // Hash the k-mer at the current offset and advance by one character.
+    // Throws oxli_exception when no complete k-mer is left.
+    HashIntoType next()
+    {
+        _initialized = true;
+
+        if (done()) {
+            throw oxli_exception("past end of iterator");
+        }
+
+        const std::string kmer(_seq + index, _ksize);
+        index += 1;
+        return _hash_fnv(kmer, _ksize);
+    }
+
+    // True when fewer than k characters remain at the current offset.
+    bool done() const
+    {
+        return (index + _ksize > length);
+    }
+
+    // Start offset of the most recently hashed k-mer (0 before any).
+    unsigned int get_start_pos() const
+    {
+        // index == 0 also covers a first()/next() call that threw, where
+        // `index - 1` would wrap around.
+        if (!_initialized || index == 0) {
+            return 0;
+        }
+        return index - 1;
+    }
+
+    // Offset one past the most recently hashed k-mer (k before any).
+    unsigned int get_end_pos() const
+    {
+        if (!_initialized || index == 0) {
+            return _ksize;
+        }
+        return index + _ksize - 1;
+    }
+};
+
+
+/**
+ * Hashtable that hashes k-mers with _hash_fnv().
+ *
+ * Strand-specific hashing and unhashing are not implemented and throw
+ * oxli_value_exception.
+ */
+class FNVHashtable : public oxli::Hashtable
+{
+public:
+    explicit FNVHashtable(WordLength ksize, Storage * s)
+        : Hashtable(ksize, s) { }
+
+    // Hash the first _ksize characters of `kmer`.
+    // Throws oxli_value_exception if `kmer` is shorter than _ksize.
+    inline
+    virtual
+    HashIntoType
+    hash_dna(const char * kmer) const
+    {
+        if (!(strlen(kmer) >= _ksize)) {
+            throw oxli_value_exception("Supplied kmer string doesn't match the underlying k-size.");
+        }
+        // Hand over exactly _ksize characters. Converting the whole C
+        // string would hash everything up to the terminator whenever
+        // the caller's buffer is longer than one k-mer.
+        return _hash_fnv(std::string(kmer, _ksize), _ksize);
+    }
+
+    inline virtual HashIntoType
+    hash_dna_top_strand(const char * kmer) const
+    {
+        throw oxli_value_exception("not implemented");
+    }
+
+    inline virtual HashIntoType
+    hash_dna_bottom_strand(const char * kmer) const
+    {
+        throw oxli_value_exception("not implemented");
+    }
+
+    inline virtual std::string
+    unhash_dna(HashIntoType hashval) const
+    {
+        throw oxli_value_exception("not implemented");
+    }
+
+    // Build an FNV iterator over `sp`, owned by the returned pointer.
+    virtual KmerHashIteratorPtr new_kmer_iterator(const char * sp) const
+    {
+        // Name the pointer type explicitly: a bare `unique_ptr(ptr)`
+        // cannot deduce its element type from a raw pointer.
+        // NOTE(review): assumes KmerHashIteratorPtr is a unique_ptr to
+        // KmerHashIterator -- confirm against its typedef.
+        return KmerHashIteratorPtr(new FNVKmerHashIterator(sp, _ksize));
+    }
+
+    virtual void save(std::string filename)
+    {
+        store->save(filename, _ksize);
+    }
+    virtual void load(std::string filename)
+    {
+        store->load(filename, _ksize);
+        _init_bitstuff();
+    }
+};
+
+
 // Hashtable-derived class with ByteStorage.
 class Counttable : public oxli::MurmurHashtable
 {
diff --git a/include/oxli/kmer_hash.hh b/include/oxli/kmer_hash.hh
index da1228d6df..93ce5d917e 100644
--- a/include/oxli/kmer_hash.hh
+++ b/include/oxli/kmer_hash.hh
@@ -115,6 +115,17 @@ HashIntoType _hash_murmur(const std::string& kmer, const WordLength k,
 HashIntoType _hash_murmur_forward(const std::string& kmer,
                                   const WordLength k);
 
+
+// FNV-1a k-mer hashing. The two-argument form returns the combined hash;
+// the four-argument form also stores the forward-strand hash in h and the
+// reverse-complement hash in r. _hash_fnv_forward() returns h only.
+HashIntoType _hash_fnv(const std::string& kmer, const WordLength k);
+HashIntoType _hash_fnv(const std::string& kmer, const WordLength k,
+                       HashIntoType& h, HashIntoType& r);
+HashIntoType _hash_fnv_forward(const std::string& kmer,
+                               const WordLength k);
+
+
 // Function to support k-mer banding.
 std::pair compute_band_interval(unsigned int num_bands,
                                 unsigned int band);
diff --git a/src/oxli/kmer_hash.cc b/src/oxli/kmer_hash.cc
index 28e19eeecd..060cb8f17a 100644
--- a/src/oxli/kmer_hash.cc
+++ b/src/oxli/kmer_hash.cc
@@ -214,6 +214,74 @@ HashIntoType _hash_murmur_forward(const std::string& kmer, const WordLength k)
     return h;
 }
 
+
+// 64-bit FNV-1a offset basis and prime.
+constexpr uint64_t val_64_const = 0xcbf29ce484222325;
+constexpr uint64_t prime_64_const = 0x100000001b3;
+
+// FNV-1a over exactly `len` bytes of `data`.
+inline uint64_t hash_64_fnv1a(const char* data, const uint64_t len) {
+    uint64_t hash = val_64_const;
+
+    for(uint64_t i = 0; i < len; ++i) {
+        // Go through unsigned char so that bytes >= 0x80 are not
+        // sign-extended into the upper 56 bits where char is signed.
+        hash ^= static_cast<unsigned char>(data[i]);
+        hash *= prime_64_const;
+    }
+
+    return hash;
+}
+
+// FNV-1a over a NUL-terminated string, usable in constant expressions.
+inline constexpr uint64_t hash_64_fnv1a_const(const char* const str, const uint64_t value = val_64_const) noexcept {
+    return (str[0] == '\0') ? value : hash_64_fnv1a_const(&str[1], (value ^ uint64_t(static_cast<unsigned char>(str[0]))) * prime_64_const);
+}
+
+// Combined FNV-1a hash of `kmer`; see the four-argument overload.
+HashIntoType _hash_fnv(const std::string& kmer, const WordLength k)
+{
+    HashIntoType h = 0;
+    HashIntoType r = 0;
+
+    return oxli::_hash_fnv(kmer, k, h, r);
+}
+
+// Stores the hash of `kmer` in h and of its reverse complement in r, and
+// returns h ^ r. XOR is symmetric, so a k-mer and its reverse complement
+// get the same return value.
+HashIntoType _hash_fnv(const std::string& kmer, const WordLength k,
+                       HashIntoType& h, HashIntoType& r) {
+    // Check the precondition before doing any work.
+    assert(kmer.length() == k);
+
+    // Hash by explicit length rather than up to the first NUL byte.
+    h = hash_64_fnv1a(kmer.data(), kmer.length());
+
+    const std::string rev = oxli::_revcomp(kmer);
+    if (rev == kmer) {
+        // self complement kmer, can't use bitwise XOR: h ^ h would be 0
+        // for every such k-mer
+        r = h;
+        return h;
+    }
+    r = hash_64_fnv1a(rev.data(), rev.length());
+
+    return h ^ r;
+}
+
+// Forward-strand hash only (the h output of _hash_fnv()).
+HashIntoType _hash_fnv_forward(const std::string& kmer, const WordLength k)
+{
+    HashIntoType h = 0;
+    HashIntoType r = 0;
+
+    oxli::_hash_fnv(kmer, k, h, r);
+
+    return h;
+}
+
+
 std::pair compute_band_interval(unsigned int num_bands,
                                 unsigned int band)
 {