Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move Rabin fingerprinting code to folly.
Summary: Also generate fingerprint tables every time, so the code doesn't rot. TODO(tudorb): move benchmark to folly TODO(tudorb): Include the program used to generate the polynomials (can't build as it requires NTL from http://www.shoup.net/ntl/) Test Plan: folly/test Reviewed By: andrei.alexandrescu@fb.com FB internal diff: D492455
- Loading branch information
Showing
5 changed files
with
838 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,266 @@ | |||
/* | |||
* Copyright 2012 Facebook, Inc. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
|
|||
/** | |||
* Compute 64-, 96-, and 128-bit Rabin fingerprints, as described in | |||
* Michael O. Rabin (1981) | |||
* Fingerprinting by Random Polynomials | |||
* Center for Research in Computing Technology, Harvard University | |||
* Tech Report TR-CSE-03-01 | |||
* | |||
* The implementation follows the optimization described in | |||
* Andrei Z. Broder (1993) | |||
* Some applications of Rabin's fingerprinting method | |||
* | |||
* extended for fingerprints larger than 64 bits, and modified to use | |||
* 64-bit instead of 32-bit integers for computation. | |||
* | |||
* The precomputed tables are in FingerprintTable.cpp, which is automatically | |||
* generated by ComputeFingerprintTable.cpp. | |||
* | |||
* Benchmarked on 10/13/2009 on a 2.5GHz quad-core Xeon L5420, | |||
* - Fingerprint<64>::update64() takes about 12ns | |||
* - Fingerprint<96>::update64() takes about 30ns | |||
* - Fingerprint<128>::update128() takes about 30ns | |||
* (unsurprisingly, Fingerprint<96> and Fingerprint<128> take the | |||
* same amount of time, as they both use 128-bit operations; the least | |||
* significant 32 bits of Fingerprint<96> will always be 0) | |||
* | |||
* @author Tudor Bosman (tudorb@facebook.com) | |||
*/ | |||
|
|||
#ifndef FOLLY_FINGERPRINT_H_ | |||
#define FOLLY_FINGERPRINT_H_ | |||
|
|||
#include <cstdint> | |||
|
|||
#include "folly/Range.h" | |||
|
|||
namespace folly { | |||
|
|||
namespace detail { | |||
template <int BITS> | |||
struct FingerprintTable { | |||
static const uint64_t poly[1 + (BITS-1)/64]; | |||
static const uint64_t table[8][256][1 + (BITS-1)/64]; | |||
}; | |||
} // namespace detail | |||
|
|||
/** | |||
* Compute the Rabin fingerprint. | |||
* | |||
* TODO(tudorb): Extend this to allow removing values from the computed | |||
* fingerprint (so we can fingerprint a sliding window, as in the Rabin-Karp | |||
* string matching algorithm) | |||
* | |||
* update* methods return *this, so you can chain them together: | |||
* Fingerprint<96>().update8(x).update(str).update64(val).write(output); | |||
*/ | |||
template <int BITS> | |||
class Fingerprint { | |||
public: | |||
Fingerprint() { | |||
// Use a non-zero starting value. We'll use (1 << (BITS-1)) | |||
fp_[0] = 1UL << 63; | |||
for (int i = 1; i < size(); i++) | |||
fp_[i] = 0; | |||
} | |||
|
|||
Fingerprint& update8(uint8_t v) { | |||
uint8_t out = shlor8(v); | |||
xortab(detail::FingerprintTable<BITS>::table[0][out]); | |||
return *this; | |||
} | |||
|
|||
// update32 and update64 are convenience functions to update the fingerprint | |||
// with 4 and 8 bytes at a time. They are faster than calling update8 | |||
// in a loop. They process the bytes in big-endian order. | |||
Fingerprint& update32(uint32_t v) { | |||
uint32_t out = shlor32(v); | |||
for (int i = 0; i < 4; i++) { | |||
xortab(detail::FingerprintTable<BITS>::table[i][out&0xff]); | |||
out >>= 8; | |||
} | |||
return *this; | |||
} | |||
|
|||
Fingerprint& update64(uint64_t v) { | |||
uint64_t out = shlor64(v); | |||
for (int i = 0; i < 8; i++) { | |||
xortab(detail::FingerprintTable<BITS>::table[i][out&0xff]); | |||
out >>= 8; | |||
} | |||
return *this; | |||
} | |||
|
|||
Fingerprint& update(StringPiece str) { | |||
// TODO(tudorb): We could be smart and do update64 or update32 if aligned | |||
for (auto c : str) { | |||
update8(uint8_t(c)); | |||
} | |||
return *this; | |||
} | |||
|
|||
/** | |||
* Return the number of uint64s needed to hold the fingerprint value. | |||
*/ | |||
static int size() { | |||
return 1 + (BITS-1)/64; | |||
} | |||
|
|||
/** | |||
* Write the computed fingeprint to an array of size() uint64_t's. | |||
* For Fingerprint<64>, size()==1; we write 64 bits in out[0] | |||
* For Fingerprint<96>, size()==2; we write 64 bits in out[0] and | |||
* the most significant 32 bits of out[1] | |||
* For Fingerprint<128>, size()==2; we write 64 bits in out[0] and | |||
* 64 bits in out[1]. | |||
*/ | |||
void write(uint64_t* out) const { | |||
for (int i = 0; i < size(); i++) { | |||
out[i] = fp_[i]; | |||
} | |||
} | |||
|
|||
private: | |||
// XOR the fingerprint with a value from one of the tables. | |||
void xortab(const uint64_t* tab) { | |||
for (int i = 0; i < size(); i++) { | |||
fp_[i] ^= tab[i]; | |||
} | |||
} | |||
|
|||
// Helper functions: shift the fingerprint value left by 8/32/64 bits, | |||
// return the "out" value (the bits that were shifted out), and add "v" | |||
// in the bits on the right. | |||
uint8_t shlor8(uint8_t v); | |||
uint32_t shlor32(uint32_t v); | |||
uint64_t shlor64(uint64_t v); | |||
|
|||
uint64_t fp_[1 + (BITS-1)/64]; | |||
}; | |||
|
|||
// Convenience functions | |||
|
|||
/** | |||
* Return the 64-bit Rabin fingerprint of a string. | |||
*/ | |||
inline uint64_t fingerprint64(StringPiece str) { | |||
uint64_t fp; | |||
Fingerprint<64>().update(str).write(&fp); | |||
return fp; | |||
} | |||
|
|||
/** | |||
* Compute the 96-bit Rabin fingerprint of a string. | |||
* Return the 64 most significant bits in *msb, and the 32 least significant | |||
* bits in *lsb. | |||
*/ | |||
inline void fingerprint96(StringPiece str, | |||
uint64_t* msb, uint32_t* lsb) { | |||
uint64_t fp[2]; | |||
Fingerprint<96>().update(str).write(fp); | |||
*msb = fp[0]; | |||
*lsb = (uint32_t)(fp[1] >> 32); | |||
} | |||
|
|||
/** | |||
* Compute the 128-bit Rabin fingerprint of a string. | |||
* Return the 64 most significant bits in *msb, and the 64 least significant | |||
* bits in *lsb. | |||
*/ | |||
inline void fingerprint128(StringPiece str, | |||
uint64_t* msb, uint64_t* lsb) { | |||
uint64_t fp[2]; | |||
Fingerprint<128>().update(str).write(fp); | |||
*msb = fp[0]; | |||
*lsb = fp[1]; | |||
} | |||
|
|||
|
|||
template <> | |||
inline uint8_t Fingerprint<64>::shlor8(uint8_t v) { | |||
uint8_t out = (uint8_t)(fp_[0] >> 56); | |||
fp_[0] = (fp_[0] << 8) | ((uint64_t)v); | |||
return out; | |||
} | |||
|
|||
template <> | |||
inline uint32_t Fingerprint<64>::shlor32(uint32_t v) { | |||
uint32_t out = (uint32_t)(fp_[0] >> 32); | |||
fp_[0] = (fp_[0] << 32) | ((uint64_t)v); | |||
return out; | |||
} | |||
|
|||
template <> | |||
inline uint64_t Fingerprint<64>::shlor64(uint64_t v) { | |||
uint64_t out = fp_[0]; | |||
fp_[0] = v; | |||
return out; | |||
} | |||
|
|||
template <> | |||
inline uint8_t Fingerprint<96>::shlor8(uint8_t v) { | |||
uint8_t out = (uint8_t)(fp_[0] >> 56); | |||
fp_[0] = (fp_[0] << 8) | (fp_[1] >> 56); | |||
fp_[1] = (fp_[1] << 8) | ((uint64_t)v << 32); | |||
return out; | |||
} | |||
|
|||
template <> | |||
inline uint32_t Fingerprint<96>::shlor32(uint32_t v) { | |||
uint32_t out = (uint32_t)(fp_[0] >> 32); | |||
fp_[0] = (fp_[0] << 32) | (fp_[1] >> 32); | |||
fp_[1] = ((uint64_t)v << 32); | |||
return out; | |||
} | |||
|
|||
template <> | |||
inline uint64_t Fingerprint<96>::shlor64(uint64_t v) { | |||
uint64_t out = fp_[0]; | |||
fp_[0] = fp_[1] | (v >> 32); | |||
fp_[1] = v << 32; | |||
return out; | |||
} | |||
|
|||
template <> | |||
inline uint8_t Fingerprint<128>::shlor8(uint8_t v) { | |||
uint8_t out = (uint8_t)(fp_[0] >> 56); | |||
fp_[0] = (fp_[0] << 8) | (fp_[1] >> 56); | |||
fp_[1] = (fp_[1] << 8) | ((uint64_t)v); | |||
return out; | |||
} | |||
|
|||
template <> | |||
inline uint32_t Fingerprint<128>::shlor32(uint32_t v) { | |||
uint32_t out = (uint32_t)(fp_[0] >> 32); | |||
fp_[0] = (fp_[0] << 32) | (fp_[1] >> 32); | |||
fp_[1] = (fp_[1] << 32) | ((uint64_t)v); | |||
return out; | |||
} | |||
|
|||
template <> | |||
inline uint64_t Fingerprint<128>::shlor64(uint64_t v) { | |||
uint64_t out = fp_[0]; | |||
fp_[0] = fp_[1]; | |||
fp_[1] = v; | |||
return out; | |||
} | |||
|
|||
} // namespace folly | |||
|
|||
#endif /* FOLLY_FINGERPRINT_H_ */ | |||
|
Oops, something went wrong.