Skip to content
This repository
Browse code

qfind_first_byte_of may suffer from global initialization order problems

Summary: ##static## handling adds ~2 ns of extra overhead per call (and the first call is somewhat slow), but the logic is now correct. Also inlined ##qfind_first_byte_of##.

Test Plan: unittests

Reviewed By: tudorb@fb.com

FB internal diff: D687947
  • Loading branch information...
commit 4988b28c014b35129a6e84a47a68b03ce1ed23ca 1 parent cc86cd3
Philip Pronin authored January 24, 2013 jdelong committed February 04, 2013
76  folly/Range.cpp
@@ -14,14 +14,11 @@
14 14
  * limitations under the License.
15 15
  */
16 16
 
17  
-//
18 17
 // @author Mark Rabkin (mrabkin@fb.com)
19 18
 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
20  
-//
21 19
 
22 20
 #include "folly/Range.h"
23 21
 
24  
-#include "folly/CpuId.h"
25 22
 #include "folly/Likely.h"
26 23
 
27 24
 namespace folly {
@@ -86,39 +83,6 @@ size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
86 83
   return StringPiece::npos;
87 84
 }
88 85
 
89  
-size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
90  
-                                 const StringPiece& needles)
91  
-  __attribute__ ((__target__("sse4.2"), noinline));
92  
-
93  
-size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
94  
-                                 const StringPiece& needles) {
95  
-  if (UNLIKELY(needles.empty() || haystack.empty())) {
96  
-    return StringPiece::npos;
97  
-  } else if (needles.size() <= 16) {
98  
-    // we can save some unnecessary load instructions by optimizing for
99  
-    // the common case of needles.size() <= 16
100  
-    return qfind_first_byte_of_needles16(haystack, needles);
101  
-  }
102  
-
103  
-  size_t index = haystack.size();
104  
-  for (size_t i = 0; i < haystack.size(); i += 16) {
105  
-    size_t b = 16;
106  
-    auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
107  
-    for (size_t j = 0; j < needles.size(); j += 16) {
108  
-      auto arr2 = __builtin_ia32_loaddqu(needles.data() + j);
109  
-      auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j,
110  
-                                               arr1, haystack.size() - i, 0);
111  
-      b = std::min<size_t>(index, b);
112  
-    }
113  
-    if (b < 16) {
114  
-      return i + b;
115  
-    }
116  
-  };
117  
-  return StringPiece::npos;
118  
-}
119  
-
120  
-typedef decltype(qfind_first_byte_of_sse42) Type_qfind_first_byte_of;
121  
-
122 86
 // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
123 87
 // of Computer Algorithms" (1974), but the best description is here:
124 88
 // http://research.swtch.com/sparse
@@ -163,6 +127,37 @@ size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
163 127
   return StringPiece::npos;
164 128
 }
165 129
 
  130
+size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
  131
+                                 const StringPiece& needles)
  132
+  __attribute__ ((__target__("sse4.2"), noinline));
  133
+
  134
+size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
  135
+                                 const StringPiece& needles) {
  136
+  if (UNLIKELY(needles.empty() || haystack.empty())) {
  137
+    return StringPiece::npos;
  138
+  } else if (needles.size() <= 16) {
  139
+    // we can save some unnecessary load instructions by optimizing for
  140
+    // the common case of needles.size() <= 16
  141
+    return qfind_first_byte_of_needles16(haystack, needles);
  142
+  }
  143
+
  144
+  size_t index = haystack.size();
  145
+  for (size_t i = 0; i < haystack.size(); i += 16) {
  146
+    size_t b = 16;
  147
+    auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
  148
+    for (size_t j = 0; j < needles.size(); j += 16) {
  149
+      auto arr2 = __builtin_ia32_loaddqu(needles.data() + j);
  150
+      auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j,
  151
+                                               arr1, haystack.size() - i, 0);
  152
+      b = std::min<size_t>(index, b);
  153
+    }
  154
+    if (b < 16) {
  155
+      return i + b;
  156
+    }
  157
+  };
  158
+  return StringPiece::npos;
  159
+}
  160
+
166 161
 size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
167 162
                                  const StringPiece& needles) {
168 163
   if (UNLIKELY(needles.empty() || haystack.empty())) {
@@ -183,14 +178,5 @@ size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
183 178
   return qfind_first_byte_of_memchr(haystack, needles);
184 179
 }
185 180
 
186  
-auto const qfind_first_byte_of_fn =
187  
-  folly::CpuId().sse42() ? qfind_first_byte_of_sse42
188  
-                         : qfind_first_byte_of_nosse;
189  
-
190  
-size_t qfind_first_byte_of(const StringPiece& haystack,
191  
-                           const StringPiece& needles) {
192  
-  return qfind_first_byte_of_fn(haystack, needles);
193  
-}
194  
-
195 181
 }  // namespace detail
196 182
 }  // namespace folly
18  folly/Range.h
@@ -32,6 +32,7 @@
32 32
 #include <boost/utility/enable_if.hpp>
33 33
 #include <boost/type_traits.hpp>
34 34
 #include <bits/c++config.h>
  35
+#include "folly/CpuId.h"
35 36
 #include "folly/Traits.h"
36 37
 
37 38
 namespace folly {
@@ -593,8 +594,21 @@ size_t qfind(const Range<T>& haystack,
593 594
 }
594 595
 
595 596
 namespace detail {
596  
-size_t qfind_first_byte_of(const StringPiece& haystack,
597  
-                           const StringPiece& needles);
  597
+
  598
+size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
  599
+                                 const StringPiece& needles);
  600
+
  601
+size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
  602
+                                 const StringPiece& needles);
  603
+
  604
+inline size_t qfind_first_byte_of(const StringPiece& haystack,
  605
+                                  const StringPiece& needles) {
  606
+  static auto const qfind_first_byte_of_fn =
  607
+    folly::CpuId().sse42() ? qfind_first_byte_of_sse42
  608
+                           : qfind_first_byte_of_nosse;
  609
+  return qfind_first_byte_of_fn(haystack, needles);
  610
+}
  611
+
598 612
 } // namespace detail
599 613
 
600 614
 template <class T, class Comp>
6  folly/test/RangeTest.cpp
@@ -14,7 +14,6 @@
14 14
  * limitations under the License.
15 15
  */
16 16
 
17  
-//
18 17
 // @author Kristina Holst (kholst@fb.com)
19 18
 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
20 19
 
@@ -25,6 +24,7 @@
25 24
 #include "folly/Range.h"
26 25
 
27 26
 namespace folly { namespace detail {
  27
+
28 28
 // declaration of functions in Range.cpp
29 29
 size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
30 30
                                   const StringPiece& needles);
@@ -32,9 +32,7 @@ size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
32 32
 size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
33 33
                                    const StringPiece& needles);
34 34
 
35  
-size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
36  
-                                 const StringPiece& needles);
37  
-}}
  35
+}}  // namespaces
38 36
 
39 37
 using namespace folly;
40 38
 using namespace std;

0 notes on commit 4988b28

Please sign in to comment.
Something went wrong with that request. Please try again.