Permalink
Browse files

qfind_first_byte_of may suffer from global initialization order

Summary: ##static## handling adds ~2 more ns overhead per call (and the first call is kinda slow), but now the logic is correct now. Also inlined ##qfind_first_byte_of##.

Test Plan: unittests

Reviewed By: tudorb@fb.com

FB internal diff: D687947
  • Loading branch information...
1 parent cc86cd3 commit 4988b28c014b35129a6e84a47a68b03ce1ed23ca @philippv philippv committed with jdelong Jan 25, 2013
Showing with 49 additions and 51 deletions.
  1. +31 −45 folly/Range.cpp
  2. +16 −2 folly/Range.h
  3. +2 −4 folly/test/RangeTest.cpp
View
@@ -14,14 +14,11 @@
* limitations under the License.
*/
-//
// @author Mark Rabkin (mrabkin@fb.com)
// @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
-//
#include "folly/Range.h"
-#include "folly/CpuId.h"
#include "folly/Likely.h"
namespace folly {
@@ -86,39 +83,6 @@ size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
return StringPiece::npos;
}
-size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
- const StringPiece& needles)
- __attribute__ ((__target__("sse4.2"), noinline));
-
-size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
- const StringPiece& needles) {
- if (UNLIKELY(needles.empty() || haystack.empty())) {
- return StringPiece::npos;
- } else if (needles.size() <= 16) {
- // we can save some unnecessary load instructions by optimizing for
- // the common case of needles.size() <= 16
- return qfind_first_byte_of_needles16(haystack, needles);
- }
-
- size_t index = haystack.size();
- for (size_t i = 0; i < haystack.size(); i += 16) {
- size_t b = 16;
- auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
- for (size_t j = 0; j < needles.size(); j += 16) {
- auto arr2 = __builtin_ia32_loaddqu(needles.data() + j);
- auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j,
- arr1, haystack.size() - i, 0);
- b = std::min<size_t>(index, b);
- }
- if (b < 16) {
- return i + b;
- }
- };
- return StringPiece::npos;
-}
-
-typedef decltype(qfind_first_byte_of_sse42) Type_qfind_first_byte_of;
-
// Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
// of Computer Algorithms" (1974), but the best description is here:
// http://research.swtch.com/sparse
@@ -163,6 +127,37 @@ size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
return StringPiece::npos;
}
+size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
+ const StringPiece& needles)
+ __attribute__ ((__target__("sse4.2"), noinline));
+
+size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
+ const StringPiece& needles) {
+ if (UNLIKELY(needles.empty() || haystack.empty())) {
+ return StringPiece::npos;
+ } else if (needles.size() <= 16) {
+ // we can save some unnecessary load instructions by optimizing for
+ // the common case of needles.size() <= 16
+ return qfind_first_byte_of_needles16(haystack, needles);
+ }
+
+ size_t index = haystack.size();
+ for (size_t i = 0; i < haystack.size(); i += 16) {
+ size_t b = 16;
+ auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
+ for (size_t j = 0; j < needles.size(); j += 16) {
+ auto arr2 = __builtin_ia32_loaddqu(needles.data() + j);
+ auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j,
+ arr1, haystack.size() - i, 0);
+ b = std::min<size_t>(index, b);
+ }
+ if (b < 16) {
+ return i + b;
+ }
+ };
+ return StringPiece::npos;
+}
+
size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
const StringPiece& needles) {
if (UNLIKELY(needles.empty() || haystack.empty())) {
@@ -183,14 +178,5 @@ size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
return qfind_first_byte_of_memchr(haystack, needles);
}
-auto const qfind_first_byte_of_fn =
- folly::CpuId().sse42() ? qfind_first_byte_of_sse42
- : qfind_first_byte_of_nosse;
-
-size_t qfind_first_byte_of(const StringPiece& haystack,
- const StringPiece& needles) {
- return qfind_first_byte_of_fn(haystack, needles);
-}
-
} // namespace detail
} // namespace folly
View
@@ -32,6 +32,7 @@
#include <boost/utility/enable_if.hpp>
#include <boost/type_traits.hpp>
#include <bits/c++config.h>
+#include "folly/CpuId.h"
#include "folly/Traits.h"
namespace folly {
@@ -593,8 +594,21 @@ size_t qfind(const Range<T>& haystack,
}
namespace detail {
-size_t qfind_first_byte_of(const StringPiece& haystack,
- const StringPiece& needles);
+
+size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
+ const StringPiece& needles);
+
+size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
+ const StringPiece& needles);
+
+inline size_t qfind_first_byte_of(const StringPiece& haystack,
+ const StringPiece& needles) {
+ static auto const qfind_first_byte_of_fn =
+ folly::CpuId().sse42() ? qfind_first_byte_of_sse42
+ : qfind_first_byte_of_nosse;
+ return qfind_first_byte_of_fn(haystack, needles);
+}
+
} // namespace detail
template <class T, class Comp>
View
@@ -14,7 +14,6 @@
* limitations under the License.
*/
-//
// @author Kristina Holst (kholst@fb.com)
// @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
@@ -25,16 +24,15 @@
#include "folly/Range.h"
namespace folly { namespace detail {
+
// declaration of functions in Range.cpp
size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
const StringPiece& needles);
size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
const StringPiece& needles);
-size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
- const StringPiece& needles);
-}}
+}} // namespaces
using namespace folly;
using namespace std;

0 comments on commit 4988b28

Please sign in to comment.