Permalink
Browse files

Detect popcnt instruction at runtime, use it if available.

Summary:
If compiled for a popcnt-supporting target (-march=corei7, for example),
use __builtin_popcount, as it's presumably inlined.  Otherwise, detect
on startup (in the same way as glibc dispatches to one of the many
flavors of memcpy): GCC allows us to add a resolver function which the
dynamic loader will call on startup to resolve a function to one of
various alternatives; we check (using the cpuid instruction) whether
popcnt is supported, and use it if available.

Test Plan: tests added

Reviewed By: soren@fb.com

FB internal diff: D542977
  • Loading branch information...
tudor committed Aug 8, 2012
1 parent 192cff5 commit 7fd87e7e86bb78dc693da2111bf915761346d540
Showing with 298 additions and 115 deletions.
  1. +77 −0 folly/Bits.cpp
  2. +28 −69 folly/Bits.h
  3. +112 −0 folly/CpuId.h
  4. +47 −0 folly/detail/BitsDetail.h
  5. +0 −25 folly/experimental/Bits.h
  6. +5 −21 folly/test/BitsTest.cpp
  7. +29 −0 folly/test/CpuIdTest.cpp
View
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2012 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "folly/Bits.h"
+
+#include "folly/CpuId.h"
+
+// None of this is necessary if we're compiling for a target that supports
+// popcnt
+#ifndef __POPCNT__
+
+namespace {
+
+// Population count of a 32-bit value using the popcnt instruction directly.
+// The ifunc resolver in this file only selects it when CPUID reports popcnt.
+int popcount_inst(unsigned int x) {
+  // "+r": x lives in one register that is both the source and the destination.
+  asm ("popcntl %0, %0" : "+r" (x));
+  return static_cast<int>(x);
+}
+
+// Population count of a 32-bit value via the compiler builtin; the ifunc
+// resolver in this file falls back to it when CPUID does not report popcnt.
+int popcount_builtin(unsigned int x) {
+  const int setBits = __builtin_popcount(x);
+  return setBits;
+}
+
+// Population count of a 64-bit value using the popcnt instruction directly.
+// The ifunc resolver in this file only selects it when CPUID reports popcnt.
+int popcountll_inst(unsigned long long x) {
+  // "+r": x lives in one register that is both the source and the destination.
+  asm ("popcntq %0, %0" : "+r" (x));
+  return static_cast<int>(x);
+}
+
+// Population count of a 64-bit value via the compiler builtin; the ifunc
+// resolver in this file falls back to it when CPUID does not report popcnt.
+int popcountll_builtin(unsigned long long x) {
+  const int setBits = __builtin_popcountll(x);
+  return setBits;
+}
+
+// Function types of the two popcount flavors; the resolvers below return
+// pointers to functions of these types.
+using Type_popcount = decltype(popcount_builtin);
+using Type_popcountll = decltype(popcountll_builtin);
+
+} // namespace
+
+// ifunc resolver for folly::detail::popcount, called on startup: picks the
+// popcnt-instruction implementation when CPUID reports popcnt support and the
+// builtin-based implementation otherwise.
+extern "C" Type_popcount* folly_popcount_ifunc() {
+  if (folly::CpuId().popcnt()) {
+    return popcount_inst;
+  }
+  return popcount_builtin;
+}
+
+// ifunc resolver for folly::detail::popcountll, called on startup: picks the
+// popcnt-instruction implementation when CPUID reports popcnt support and the
+// builtin-based implementation otherwise.
+extern "C" Type_popcountll* folly_popcountll_ifunc() {
+  if (folly::CpuId().popcnt()) {
+    return popcountll_inst;
+  }
+  return popcountll_builtin;
+}
+
+namespace folly {
+namespace detail {
+
+// ifunc symbol: folly_popcount_ifunc is called on startup and popcount is
+// bound to whichever of popcount_inst / popcount_builtin it returns
+int popcount(unsigned int x)
+ __attribute__((ifunc("folly_popcount_ifunc")));
+
+// ifunc symbol: folly_popcountll_ifunc is called on startup and popcountll is
+// bound to whichever of popcountll_inst / popcountll_builtin it returns
+int popcountll(unsigned long long x)
+ __attribute__((ifunc("folly_popcountll_ifunc")));
+
+} // namespace detail
+} // namespace folly
+
+#endif /* !__POPCNT__ */
+
View
@@ -26,6 +26,9 @@
* 1-based. 0 = no bits are set (x == 0)
* for x != 0, findLastSet(x) == 1 + floor(log2(x))
*
+ * popcount(x)
+ * return the number of 1 bits in x
+ *
* nextPowTwo(x)
* Finds the next power of two >= x.
*
@@ -55,6 +58,11 @@
#define _GNU_SOURCE 1
#endif
+#ifndef __GNUC__
+#error GCC required
+#endif
+
+#include "folly/detail/BitsDetail.h"
#include "folly/detail/BitIteratorDetail.h"
#include "folly/Likely.h"
@@ -125,21 +133,6 @@ typename std::enable_if<
return findFirstSet(static_cast<typename std::make_signed<T>::type>(x));
}
-namespace detail {
-
-// Portable, but likely slow...
-inline unsigned int findLastSetPortable(uint64_t x) {
- unsigned int r = (x != 0); // 1-based index, except for x==0
- while (x >>= 1) {
- ++r;
- }
- return r;
-}
-
-} // namespace detail
-
-#ifdef __GNUC__
-
// findLastSet: return the 1-based index of the highest bit set
// for x > 0, findLastSet(x) == 1 + floor(log2(x))
template <class T>
@@ -179,19 +172,6 @@ typename std::enable_if<
return x ? 8 * sizeof(unsigned long long) - __builtin_clzll(x) : 0;
}
-#else /* !__GNUC__ */
-
-template <class T>
-typename std::enable_if<
- (std::is_integral<T>::value &&
- std::is_unsigned<T>::value),
- unsigned int>::type
- findLastSet(T x) {
- return detail:findLastSetPortable(x);
-}
-
-#endif
-
template <class T>
typename std::enable_if<
(std::is_integral<T>::value &&
@@ -201,36 +181,6 @@ typename std::enable_if<
return findLastSet(static_cast<typename std::make_unsigned<T>::type>(x));
}
-namespace detail {
-
-template <class T>
-inline
-typename std::enable_if<
- std::is_integral<T>::value && std::is_unsigned<T>::value,
- T>::type
-nextPowTwoPortable(T v) {
- if (UNLIKELY(v == 0)) {
- return 1;
- }
-
- --v;
- for (uint32_t i = 1; i < sizeof(T) * 8; i <<= 8) {
- v |= (v >> i);
- v |= (v >> (i << 1));
- v |= (v >> (i << 2));
- v |= (v >> (i << 3));
- v |= (v >> (i << 4));
- v |= (v >> (i << 5));
- v |= (v >> (i << 6));
- v |= (v >> (i << 7));
- }
- return v + 1;
-}
-
-} // namespace detail
-
-#ifdef __GNUC__
-
template <class T>
inline
typename std::enable_if<
@@ -243,20 +193,29 @@ nextPowTwo(T v) {
return 1ul << findLastSet(v - 1);
}
-#else /* __GNUC__ */
-
+/**
+ * Population count
+ */
template <class T>
-inline
-typename std::enable_if<
- std::is_integral<T>::value && std::is_unsigned<T>::value,
- T>::type
-nextPowTwo(T v) {
- return detail::nextPowTwoPortable(v);
+inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ std::is_unsigned<T>::value &&
+ sizeof(T) <= sizeof(unsigned int)),
+ size_t>::type
+ popcount(T x) {
+ return detail::popcount(x);
}
-#endif /* __GNUC__ */
-
-
+template <class T>
+inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ std::is_unsigned<T>::value &&
+ sizeof(T) > sizeof(unsigned int) &&
+ sizeof(T) <= sizeof(unsigned long long)),
+ size_t>::type
+ popcount(T x) {
+ return detail::popcountll(x);
+}
/**
* Endianness detection and manipulation primitives.
View
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2012 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_CPUID_H_
+#define FOLLY_CPUID_H_
+
+#include <cstdint>
+
+namespace folly {
+
+/**
+ * Identification of an Intel CPU.
+ * Supports CPUID (EAX=1) feature flags.
+ * Values from http://www.intel.com/content/www/us/en/processors/processor-identification-cpuid-instruction-note.html
+ */
+class CpuId {
+ public:
+  /**
+   * Executes CPUID with EAX=1 and keeps the ECX and EDX feature words, which
+   * the accessors below test bit by bit.
+   */
+  CpuId() {
+    // CPUID writes EAX (and EBX, ECX, EDX). EAX must therefore be a
+    // read-write operand: declared as a plain input, the compiler would be
+    // free to assume the register still holds 1 after the instruction.
+    uint32_t a = 1;
+    __asm__("cpuid" : "+a"(a), "=c"(c_), "=d"(d_) : : "ebx");
+  }
+// X(name, r, bit) defines the accessor `bool name() const`, true when bit
+// `bit` of register word `r` is set; C and D bind r to the ECX / EDX word.
+#define X(name, r, bit) bool name() const { return r & (1U << bit); }
+#define C(name, bit) X(name, c_, bit)
+#define D(name, bit) X(name, d_, bit)
+  C(sse3, 0)
+  C(pclmuldq, 1)
+  C(dtes64, 2)
+  C(monitor, 3)
+  C(dscpl, 4)
+  C(vmx, 5)
+  C(smx, 6)
+  C(eist, 7)
+  C(tm2, 8)
+  C(ssse3, 9)
+  C(cnxtid, 10)
+  // 11 is reserved
+  C(fma, 12)
+  C(cx16, 13)
+  C(xtpr, 14)
+  C(pdcm, 15)
+  // 16 is reserved
+  C(pcid, 17)
+  C(dca, 18)
+  C(sse41, 19)
+  C(sse42, 20)
+  C(x2apic, 21)
+  C(movbe, 22)
+  C(popcnt, 23)
+  C(tscdeadline, 24)
+  C(aes, 25)
+  C(xsave, 26)
+  C(osxsave, 27)
+  C(avx, 28)
+  C(f16c, 29)
+  C(rdrand, 30)
+  // 31 is not used
+  D(fpu, 0)
+  D(vme, 1)
+  D(de, 2)
+  D(pse, 3)
+  D(tsc, 4)
+  D(msr, 5)
+  D(pae, 6)
+  D(mce, 7)
+  D(cx8, 8)
+  D(apic, 9)
+  // 10 is reserved
+  D(sep, 11)
+  D(mtrr, 12)
+  D(pge, 13)
+  D(mca, 14)
+  D(cmov, 15)
+  D(pat, 16)
+  D(pse36, 17)
+  D(psn, 18)
+  D(clfsh, 19)
+  // 20 is reserved
+  D(ds, 21)
+  D(acpi, 22)
+  D(mmx, 23)
+  D(fxsr, 24)
+  D(sse, 25)
+  D(sse2, 26)
+  D(ss, 27)
+  D(htt, 28)
+  D(tm, 29)
+  // 30 is reserved
+  D(pbe, 31)
+#undef D
+#undef C
+#undef X
+ private:
+  uint32_t c_; // ECX
+  uint32_t d_; // EDX
+};
+
+} // namespace folly
+
+#endif /* FOLLY_CPUID_H_ */
+
View
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2012 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_DETAIL_BITSDETAIL_H_
+#define FOLLY_DETAIL_BITSDETAIL_H_
+
+namespace folly {
+namespace detail {
+
+// If we're targeting an architecture with popcnt support, use
+// __builtin_popcount directly, as it's presumably inlined.
+// If not, use runtime detection using __attribute__((ifunc))
+// (see Bits.cpp)
+#ifdef __POPCNT__
+
+// Compiled only when __POPCNT__ is defined: forwards to the compiler builtin.
+inline int popcount(unsigned int x) {
+  const int setBits = __builtin_popcount(x);
+  return setBits;
+}
+// Compiled only when __POPCNT__ is defined: forwards to the compiler builtin.
+inline int popcountll(unsigned long long x) {
+  const int setBits = __builtin_popcountll(x);
+  return setBits;
+}
+
+#else /* !__POPCNT__ */
+
+int popcount(unsigned int x);
+int popcountll(unsigned long long x);
+
+#endif /* !__POPCNT__ */
+
+} // namespace detail
+} // namespace folly
+
+#endif /* FOLLY_DETAIL_BITSDETAIL_H_ */
+
Oops, something went wrong.

0 comments on commit 7fd87e7

Please sign in to comment.