2424#include < folly/portability/Builtins.h>
2525#include < thrift/lib/cpp2/type/Id.h>
2626
27+ #if defined(__cpp_lib_bitops) && __cpp_lib_bitops >= 201907L
28+ #include < bit>
29+ #endif
30+
2731// We need 64-bit for __mm_extra_epi64 and _pext_u64. MSVC support seems to be
2832// difficult to detect, so disable the BMI2 and SIMD versions entirely there.
2933#if defined(__BMI2__) && FOLLY_X64 && !defined(_MSC_VER)
@@ -222,9 +226,15 @@ inline size_t readContiguousVarintMediumSlowU64BMI2(
222226 // interested in for varint decoding purposes).
223227 uint64_t bits = folly::loadUnaligned<uint64_t >(p);
224228
225- const uint64_t kContinuationBitMask = 0x8080808080808080ULL ;
226- uint64_t continuationBits = bits & kContinuationBitMask ;
227- if (continuationBits == kContinuationBitMask ) {
229+ const uint64_t kDataBitMask = 0x7F7F7F7F7F7F7F7FULL ;
230+ const uint64_t allDataBitsSet = bits | kDataBitMask ;
231+ const uint64_t oneInLowestUnsetContinuationBitWithTrailingBitsZero =
232+ allDataBitsSet + 1 ;
233+ // NOTE: by hoisting the increment of allDataBitsSet above this
234+ // check, we can perform the check against zero rather than -1.
235+ // This accelerates the check because the INC instruction will set
236+ // ZF if the result is 0, netting a savings of one CMP instruction.
237+ if (oneInLowestUnsetContinuationBitWithTrailingBitsZero == 0 ) {
228238 // The maximum bytes of int64 would be ceil(64/7)=10, as we had used
229239 // tryReadFirstTwoBytesU64 to read 2 bytes, there would be 8 bytes leftover
230240 // in maximum.
@@ -234,15 +244,29 @@ inline size_t readContiguousVarintMediumSlowU64BMI2(
234244 }
235245 // By reset data bits and toggle the continuation bits, the tailing zeros
236246 // should be intBytes*8-1
237- size_t maskShift = __builtin_ctzll (continuationBits ^ kContinuationBitMask );
247+ #if defined(__cpp_lib_bitops) && __cpp_lib_bitsops >= 201907L
248+ // __builtin_ctzll(0) is defined as UB, even though TZCNT does what we want in
249+ // that case. std::countr_zero is the same thing without the UB.
250+ size_t maskShift =
251+ std::countr_zero (oneInLowestUnsetContinuationBitWithTrailingBitsZero);
252+ #else
253+ // We settle for folly::findFirstSet and fix up the result if we have to.
254+ size_t maskShift =
255+ folly::findFirstSet (oneInLowestUnsetContinuationBitWithTrailingBitsZero);
256+ if (maskShift == 0 ) {
257+ maskShift = 64 ;
258+ } else {
259+ maskShift--;
260+ }
261+ #endif
238262 size_t intBytes = (maskShift >> 3 ) + 1 ;
239263
240264 uint64_t mask = (1ULL << maskShift) - 1 ;
241265 // You might think it would make more sense to to the pext first and mask
242266 // afterwards (avoiding having two pexts in a single dependency chain at 3
243267 // cycles / pop); this seems not to be borne out in microbenchmarks. The
244268 // mask you need ends up being more complicated to compute.
245- uint64_t highBits = _pext_u64 ((bits & mask), 0x7F7F7F7F7F7F7F7FULL );
269+ uint64_t highBits = _pext_u64 ((bits & mask), kDataBitMask );
246270 result |= (highBits << 14 );
247271
248272 value = result;
0 commit comments