Skip to content
This repository
Browse code

Better unsigned to string conversion

Summary:
In https://phabricator.fb.com/D511928 Brian mentioned that the current API for string append is insufficient for appending to a buffer. That made me curious about the relative performance of the classic and table-based number-to-ASCII conversions.

The results were interesting: on average (over all digit lengths) the table-based conversion was faster, but its performance was lackluster for large numbers (in the worst case half as fast as the classic implementation), I presume due to the cache misses incurred by the tables.

This diff proposes an improved unsigned-to-ASCII primitive that is much faster than both the table-based (previous Folly) and the classic primitives. The key is a fast digits10() implementation that precomputes the space required by the conversion. After that, the digits are written directly in place, so no reversal is required. The new routine is up to 14x faster than the classic implementation, depending on the number of digits (benchmarks in comments).

Adding a few people who may be interested in the matter. Brian, thanks for bringing this matter up; if this gets in you may want to use the folly routine in proxygen.

Test Plan: unittest and benchmarks.

Reviewed By: simpkins@fb.com

FB internal diff: D515572
  • Loading branch information...
commit 43bdc5d7d0bee944c1f437f3e159f32d4c57f3cd 1 parent 3a5767e
Andrei Alexandrescu authored July 10, 2012 jdelong committed October 11, 2012
7  folly/Benchmark.h
@@ -190,11 +190,14 @@ addBenchmark(const char* file, const char* name, Lambda&& lambda) {
190 190
     timespec start, end;
191 191
 
192 192
     // CORE MEASUREMENT STARTS
193  
-    CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start));
  193
+    auto const r1 = clock_gettime(detail::DEFAULT_CLOCK_ID, &start);
194 194
     lambda(times);
195  
-    CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &end));
  195
+    auto const r2 = clock_gettime(detail::DEFAULT_CLOCK_ID, &end);
196 196
     // CORE MEASUREMENT ENDS
197 197
 
  198
+    CHECK_EQ(0, r1);
  199
+    CHECK_EQ(0, r2);
  200
+
198 201
     return detail::timespecDiff(end, start) - BenchmarkSuspender::nsSpent;
199 202
   };
200 203
 
92  folly/Conv.h
@@ -116,41 +116,63 @@ typename std::tuple_element<
116 116
   return getLastElement(vs...);
117 117
 }
118 118
 
  119
+} // namespace detail
  120
+
119 121
 /*******************************************************************************
120 122
  * Conversions from integral types to string types.
121 123
  ******************************************************************************/
122 124
 
123  
-// Returns the offset of the formatted string from the start of
124  
-// the supplied buffer. The new string will be at range
125  
-// [buf+begin,buf+bufLen). Uint will be either uint32_t or uint64_t.
126  
-template <class Uint>
127  
-size_t uintToBuffer(char*const buffer, size_t bufLen, Uint v) {
128  
-  extern const char digit1[101], digit2[101];
  125
+/**
  126
+ * Returns the number of digits in the base 10 representation of an
  127
+ * uint64_t. Useful for preallocating buffers and such. It's also used
  128
+ * internally, see below. Measurements suggest that defining a
  129
+ * separate overload for 32-bit integers is not worthwhile.
  130
+ */
  131
+
  132
+inline uint32_t digits10(uint64_t v) {
  133
+  uint32_t result = 1;
129 134
   for (;;) {
130  
-    if (v < 100) {
131  
-      if (v < 10) {
132  
-        buffer[--bufLen] = static_cast<char>(v + '0');
133  
-      } else {
134  
-        size_t r = static_cast<size_t>(v);
135  
-        bufLen -= 2;
136  
-        buffer[bufLen] = digit1[r];
137  
-        buffer[bufLen + 1] = digit2[r];
138  
-      }
139  
-      break;
140  
-    }
141  
-    Uint t = v;
142  
-    v /= 100;
143  
-    size_t r = static_cast<size_t> (t - v * 100);
144  
-    bufLen -= 2;
145  
-    buffer[bufLen] = digit1[r];
146  
-    buffer[bufLen + 1] = digit2[r];
  135
+    if (LIKELY(v < 10)) return result;
  136
+    if (LIKELY(v < 100)) return result + 1;
  137
+    if (LIKELY(v < 1000)) return result + 2;
  138
+    if (LIKELY(v < 10000)) return result + 3;
  139
+    // Skip ahead by 4 orders of magnitude
  140
+    v /= 10000U;
  141
+    result += 4;
147 142
   }
148  
-  return bufLen;
149 143
 }
150 144
 
151  
-const size_t kMaxInt64BufLen = 21;// 19 + 1 for possible '-' sign + 1 for \0
  145
+/**
  146
+ * Copies the ASCII base 10 representation of v into buffer and
  147
+ * returns the number of bytes written. Does NOT append a \0. Assumes
  148
+ * the buffer points to digits10(v) bytes of valid memory. Note that
  149
+ * uint64 needs at most 20 bytes, uint32_t needs at most 10 bytes,
  150
+ * uint16_t needs at most 5 bytes, and so on. Measurements suggest
  151
+ * that defining a separate overload for 32-bit integers is not
  152
+ * worthwhile.
  153
+ *
  154
+ * This primitive is unsafe because it makes the size assumption and
  155
+ * because it does not add a terminating \0.
  156
+ */
152 157
 
153  
-}                                 // namespace detail
  158
+inline uint32_t uint64ToBufferUnsafe(uint64_t v, char *const buffer) {
  159
+  auto const result = digits10(v);
  160
+  // WARNING: using size_t or pointer arithmetic for pos slows down
  161
+  // the loop below 20x. This is because several 32-bit ops can be
  162
+  // done in parallel, but only fewer 64-bit ones.
  163
+  uint32_t pos = result - 1;
  164
+  while (v >= 10) {
  165
+    // Keep these together so a peephole optimization "sees" them and
  166
+    // computes them in one shot.
  167
+    auto const q = v / 10;
  168
+    auto const r = static_cast<uint32_t>(v % 10);
  169
+    buffer[pos--] = '0' + r;
  170
+    v = q;
  171
+  }
  172
+  // Last digit is trivial to handle
  173
+  buffer[pos] = static_cast<uint32_t>(v) + '0';
  174
+  return result;
  175
+}
154 176
 
155 177
 /**
156 178
  * A single char gets appended.
@@ -222,18 +244,13 @@ typename std::enable_if<
222 244
   && detail::IsSomeString<Tgt>::value && sizeof(Src) >= 4>::type
223 245
 toAppend(Src value, Tgt * result) {
224 246
   typedef typename std::make_unsigned<Src>::type Usrc;
225  
-  char buffer[detail::kMaxInt64BufLen];
226  
-  size_t begin;
  247
+  char buffer[20];
227 248
   if (value < 0) {
228  
-    begin = detail::uintToBuffer(buffer, sizeof(buffer),
229  
-                                 static_cast<Usrc>(-value));
230  
-    DCHECK_GE(begin, 1);
231  
-    buffer[--begin] = '-';
  249
+    result->push_back('-');
  250
+    result->append(buffer, uint64ToBufferUnsafe(-uint64_t(value), buffer));
232 251
   } else {
233  
-    begin = detail::uintToBuffer(buffer, sizeof(buffer),
234  
-                                 static_cast<Usrc>(value));
  252
+    result->append(buffer, uint64ToBufferUnsafe(value, buffer));
235 253
   }
236  
-  result->append(buffer + begin, buffer + sizeof(buffer));
237 254
 }
238 255
 
239 256
 /**
@@ -244,9 +261,8 @@ typename std::enable_if<
244 261
   std::is_integral<Src>::value && !std::is_signed<Src>::value
245 262
   && detail::IsSomeString<Tgt>::value && sizeof(Src) >= 4>::type
246 263
 toAppend(Src value, Tgt * result) {
247  
-  char buffer[detail::kMaxInt64BufLen];
248  
-  const size_t begin = detail::uintToBuffer(buffer, sizeof(buffer), value);
249  
-  result->append(buffer + begin, buffer + sizeof(buffer));
  264
+  char buffer[20];
  265
+  result->append(buffer, buffer + uint64ToBufferUnsafe(value, buffer));
250 266
 }
251 267
 
252 268
 /**
5  folly/Format-inl.h
@@ -429,9 +429,8 @@ class FormatValue<
429 429
         useSprintf("%'ju");
430 430
       } else {
431 431
         // Use uintToBuffer, faster than sprintf
432  
-        valBufEnd = valBuf + valBufSize - 1;
433  
-        valBufBegin = valBuf + detail::uintToBuffer(valBuf, valBufSize - 1,
434  
-                                                    uval);
  432
+        valBufBegin = valBuf + 3;
  433
+        valBufEnd = valBufBegin + uint64ToBufferUnsafe(uval, valBufBegin);
435 434
       }
436 435
       break;
437 436
     case 'c':
173  folly/test/ConvTest.cpp
@@ -570,6 +570,44 @@ TEST(Conv, StringToBool) {
570 570
   EXPECT_EQ(buf5, sp5.begin());
571 571
 }
572 572
 
  573
+TEST(Conv, NewUint64ToString) {
  574
+  char buf[21];
  575
+
  576
+#define THE_GREAT_EXPECTATIONS(n, len)                  \
  577
+  do {                                                  \
  578
+    EXPECT_EQ((len), uint64ToBufferUnsafe((n), buf));   \
  579
+    buf[(len)] = 0;                                     \
  580
+    auto s = string(#n);                                \
  581
+    s = s.substr(0, s.size() - 2);                      \
  582
+    EXPECT_EQ(s, buf);                                  \
  583
+  } while (0)
  584
+
  585
+  THE_GREAT_EXPECTATIONS(0UL, 1);
  586
+  THE_GREAT_EXPECTATIONS(1UL, 1);
  587
+  THE_GREAT_EXPECTATIONS(12UL, 2);
  588
+  THE_GREAT_EXPECTATIONS(123UL, 3);
  589
+  THE_GREAT_EXPECTATIONS(1234UL, 4);
  590
+  THE_GREAT_EXPECTATIONS(12345UL, 5);
  591
+  THE_GREAT_EXPECTATIONS(123456UL, 6);
  592
+  THE_GREAT_EXPECTATIONS(1234567UL, 7);
  593
+  THE_GREAT_EXPECTATIONS(12345678UL, 8);
  594
+  THE_GREAT_EXPECTATIONS(123456789UL, 9);
  595
+  THE_GREAT_EXPECTATIONS(1234567890UL, 10);
  596
+  THE_GREAT_EXPECTATIONS(12345678901UL, 11);
  597
+  THE_GREAT_EXPECTATIONS(123456789012UL, 12);
  598
+  THE_GREAT_EXPECTATIONS(1234567890123UL, 13);
  599
+  THE_GREAT_EXPECTATIONS(12345678901234UL, 14);
  600
+  THE_GREAT_EXPECTATIONS(123456789012345UL, 15);
  601
+  THE_GREAT_EXPECTATIONS(1234567890123456UL, 16);
  602
+  THE_GREAT_EXPECTATIONS(12345678901234567UL, 17);
  603
+  THE_GREAT_EXPECTATIONS(123456789012345678UL, 18);
  604
+  THE_GREAT_EXPECTATIONS(1234567890123456789UL, 19);
  605
+  THE_GREAT_EXPECTATIONS(18446744073709551614UL, 20);
  606
+  THE_GREAT_EXPECTATIONS(18446744073709551615UL, 20);
  607
+
  608
+#undef THE_GREAT_EXPECTATIONS
  609
+}
  610
+
573 611
 ////////////////////////////////////////////////////////////////////////////////
574 612
 // Benchmarks for ASCII to int conversion
575 613
 ////////////////////////////////////////////////////////////////////////////////
@@ -653,11 +691,144 @@ void lexicalCastMeasure(uint n, uint digits) {
653 691
   }
654 692
 }
655 693
 
  694
+// Benchmarks for unsigned to string conversion, raw
  695
+
  696
+unsigned u64ToAsciiTable(uint64_t value, char* dst) {
  697
+  static const char digits[201] =
  698
+    "00010203040506070809"
  699
+    "10111213141516171819"
  700
+    "20212223242526272829"
  701
+    "30313233343536373839"
  702
+    "40414243444546474849"
  703
+    "50515253545556575859"
  704
+    "60616263646566676869"
  705
+    "70717273747576777879"
  706
+    "80818283848586878889"
  707
+    "90919293949596979899";
  708
+
  709
+  uint32_t const length = digits10(value);
  710
+  uint32_t next = length - 1;
  711
+  while (value >= 100) {
  712
+    auto const i = (value % 100) * 2;
  713
+    value /= 100;
  714
+    dst[next] = digits[i + 1];
  715
+    dst[next - 1] = digits[i];
  716
+    next -= 2;
  717
+  }
  718
+  // Handle last 1-2 digits
  719
+  if (value < 10) {
  720
+    dst[next] = '0' + uint32_t(value);
  721
+  } else {
  722
+    auto i = uint32_t(value) * 2;
  723
+    dst[next] = digits[i + 1];
  724
+    dst[next - 1] = digits[i];
  725
+  }
  726
+  return length;
  727
+}
  728
+
  729
+void u64ToAsciiTableBM(uint n, uint64_t value) {
  730
+  // This is too fast, need to do 10 times per iteration
  731
+  char buf[20];
  732
+  FOR_EACH_RANGE (i, 0, n) {
  733
+    doNotOptimizeAway(u64ToAsciiTable(value + n, buf));
  734
+  }
  735
+}
  736
+
  737
+unsigned u64ToAsciiClassic(uint64_t value, char* dst) {
  738
+  // Write backwards.
  739
+  char* next = (char*)dst;
  740
+  char* start = next;
  741
+  do {
  742
+    *next++ = '0' + (value % 10);
  743
+    value /= 10;
  744
+  } while (value != 0);
  745
+  unsigned length = next - start;
  746
+
  747
+  // Reverse in-place.
  748
+  next--;
  749
+  while (next > start) {
  750
+    char swap = *next;
  751
+    *next = *start;
  752
+    *start = swap;
  753
+    next--;
  754
+    start++;
  755
+  }
  756
+  return length;
  757
+}
  758
+
  759
+void u64ToAsciiClassicBM(uint n, uint64_t value) {
  760
+  // This is too fast, need to do 10 times per iteration
  761
+  char buf[20];
  762
+  FOR_EACH_RANGE (i, 0, n) {
  763
+    doNotOptimizeAway(u64ToAsciiClassic(value + n, buf));
  764
+  }
  765
+}
  766
+
  767
+void u64ToAsciiFollyBM(uint n, uint64_t value) {
  768
+  // This is too fast, need to do 10 times per iteration
  769
+  char buf[20];
  770
+  FOR_EACH_RANGE (i, 0, n) {
  771
+    doNotOptimizeAway(uint64ToBufferUnsafe(value + n, buf));
  772
+  }
  773
+}
  774
+
  775
+// Benchmark uitoa with string append
  776
+
  777
+void u2aAppendClassicBM(uint n, uint64_t value) {
  778
+  string s;
  779
+  FOR_EACH_RANGE (i, 0, n) {
  780
+    // auto buf = &s.back() + 1;
  781
+    char buffer[20];
  782
+    s.append(buffer, u64ToAsciiClassic(value, buffer));
  783
+    doNotOptimizeAway(s.size());
  784
+  }
  785
+}
  786
+
  787
+void u2aAppendFollyBM(uint n, uint64_t value) {
  788
+  string s;
  789
+  FOR_EACH_RANGE (i, 0, n) {
  790
+    // auto buf = &s.back() + 1;
  791
+    char buffer[20];
  792
+    s.append(buffer, uint64ToBufferUnsafe(value, buffer));
  793
+    doNotOptimizeAway(s.size());
  794
+  }
  795
+}
  796
+
  797
+#define DEFINE_BENCHMARK_GROUP(n)                       \
  798
+  BENCHMARK_PARAM(u64ToAsciiClassicBM, n);              \
  799
+  BENCHMARK_RELATIVE_PARAM(u64ToAsciiTableBM, n);       \
  800
+  BENCHMARK_RELATIVE_PARAM(u64ToAsciiFollyBM, n);       \
  801
+  BENCHMARK_DRAW_LINE();
  802
+
  803
+DEFINE_BENCHMARK_GROUP(1);
  804
+DEFINE_BENCHMARK_GROUP(12);
  805
+DEFINE_BENCHMARK_GROUP(123);
  806
+DEFINE_BENCHMARK_GROUP(1234);
  807
+DEFINE_BENCHMARK_GROUP(12345);
  808
+DEFINE_BENCHMARK_GROUP(123456);
  809
+DEFINE_BENCHMARK_GROUP(1234567);
  810
+DEFINE_BENCHMARK_GROUP(12345678);
  811
+DEFINE_BENCHMARK_GROUP(123456789);
  812
+DEFINE_BENCHMARK_GROUP(1234567890);
  813
+DEFINE_BENCHMARK_GROUP(12345678901);
  814
+DEFINE_BENCHMARK_GROUP(123456789012);
  815
+DEFINE_BENCHMARK_GROUP(1234567890123);
  816
+DEFINE_BENCHMARK_GROUP(12345678901234);
  817
+DEFINE_BENCHMARK_GROUP(123456789012345);
  818
+DEFINE_BENCHMARK_GROUP(1234567890123456);
  819
+DEFINE_BENCHMARK_GROUP(12345678901234567);
  820
+DEFINE_BENCHMARK_GROUP(123456789012345678);
  821
+DEFINE_BENCHMARK_GROUP(1234567890123456789);
  822
+DEFINE_BENCHMARK_GROUP(12345678901234567890U);
  823
+
  824
+#undef DEFINE_BENCHMARK_GROUP
  825
+
656 826
 #define DEFINE_BENCHMARK_GROUP(n)                       \
657 827
   BENCHMARK_PARAM(clibAtoiMeasure, n);                  \
658 828
   BENCHMARK_RELATIVE_PARAM(lexicalCastMeasure, n);      \
659 829
   BENCHMARK_RELATIVE_PARAM(handwrittenAtoiMeasure, n);  \
660  
-  BENCHMARK_RELATIVE_PARAM(follyAtoiMeasure, n);
  830
+  BENCHMARK_RELATIVE_PARAM(follyAtoiMeasure, n);        \
  831
+  BENCHMARK_DRAW_LINE();
661 832
 
662 833
 DEFINE_BENCHMARK_GROUP(1);
663 834
 DEFINE_BENCHMARK_GROUP(2);

0 notes on commit 43bdc5d

Please sign in to comment.
Something went wrong with that request. Please try again.