Skip to content

Commit

Permalink
Optimize JSON escaping of ASCII strings
Browse files Browse the repository at this point in the history
Summary:
`escapeString` is very slow even when there's very little to nothing to escape. This diff adds a fast path to copy sequences of bytes that don't need escaping.
It also optimizes appending escape sequences: `string::push_back` is slow because it has to do a capacity check for every character.

Before:
```
  ============================================================================
  folly/test/JsonOtherTest.cpp                    relative  time/iter  iters/s
  ============================================================================
  jsonSerialize                                              818.55ns    1.22M
  jsonSerializeWithNonAsciiEncoding                            1.35us  738.06K
  jsonSerializeWithUtf8Validation                              1.42us  705.60K
  jsonSerializeAsciiWithUtf8Validation                         3.27us  306.06K
  parseSmallStringWithUtf                                      1.91us  522.38K
  parseNormalString                                            1.51us  660.27K
  parseBigString                                             384.44ns    2.60M
  toJson                                                     480.54ns    2.08M
  ============================================================================
```

After:
```
  ============================================================================
  folly/test/JsonOtherTest.cpp                    relative  time/iter  iters/s
  ============================================================================
  jsonSerialize                                              781.69ns    1.28M
  jsonSerializeWithNonAsciiEncoding                          847.68ns    1.18M
  jsonSerializeWithUtf8Validation                            928.68ns    1.08M
  jsonSerializeAsciiWithUtf8Validation                       199.85ns    5.00M
  parseSmallStringWithUtf                                      1.93us  518.39K
  parseNormalString                                            1.45us  689.11K
  parseBigString                                             378.66ns    2.64M
  toJson                                                     446.38ns    2.24M
  ============================================================================
```

All string escaping benchmarks are slightly faster, and ASCII-only with no escapes is 8x faster.

Reviewed By: luciang, evilmucedin, yfeldblum

Differential Revision: D4793233

fbshipit-source-id: c40d07708bd787799c8c00f9f23a417b862ee9ae
  • Loading branch information
ot authored and facebook-github-bot committed Apr 1, 2017
1 parent 8e16a2e commit 2f0cabf
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 70 deletions.
115 changes: 94 additions & 21 deletions folly/json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
#include <folly/json.h>

#include <algorithm>
#include <cassert>
#include <functional>
#include <type_traits>

#include <boost/next_prior.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/next_prior.hpp>
#include <folly/Bits.h>
#include <folly/Portability.h>

#include <folly/Conv.h>
#include <folly/Range.h>
Expand Down Expand Up @@ -344,7 +346,7 @@ std::string parseString(Input& in);
dynamic parseNumber(Input& in);

dynamic parseObject(Input& in) {
assert(*in == '{');
DCHECK_EQ(*in, '{');
++in;

dynamic ret = dynamic::object;
Expand Down Expand Up @@ -388,7 +390,7 @@ dynamic parseObject(Input& in) {
}

dynamic parseArray(Input& in) {
assert(*in == '[');
DCHECK_EQ(*in, '[');
++in;

dynamic ret = dynamic::array;
Expand Down Expand Up @@ -523,7 +525,7 @@ std::string decodeUnicodeEscape(Input& in) {
}

std::string parseString(Input& in) {
assert(*in == '\"');
DCHECK_EQ(*in, '\"');
++in;

std::string ret;
Expand Down Expand Up @@ -607,6 +609,44 @@ std::string serialize(dynamic const& dyn, serialization_opts const& opts) {
return ret;
}

// Fast path to determine the longest prefix that can be left
// unescaped in a string of sizeof(T) bytes packed in an integer of
// type T.
template <class T>
size_t firstEscapableInWord(T s) {
static_assert(std::is_unsigned<T>::value, "Unsigned integer required");
constexpr T kOnes = ~T() / 255; // 0x...0101
constexpr T kMsbs = kOnes * 0x80; // 0x...8080

// Sets the MSB of bytes < b. Precondition: b < 128.
auto isLess = [](T w, uint8_t b) {
// A byte is < b iff subtracting b underflows, so we check that
// the MSB wasn't set before and it's set after the subtraction.
return (w - kOnes * b) & ~w & kMsbs;
};

auto isChar = [&](uint8_t c) {
// A byte is == c iff it is 0 if xored with c.
return isLess(s ^ (kOnes * c), 1);
};

// The following masks have the MSB set for each byte of the word
// that satisfies the corresponding condition.
auto isHigh = s & kMsbs; // >= 128
auto isLow = isLess(s, 0x20); // <= 0x1f
auto needsEscape = isHigh | isLow | isChar('\\') | isChar('"');

if (!needsEscape) {
return sizeof(T);
}

if (folly::kIsLittleEndian) {
return folly::findFirstSet(needsEscape) / 8 - 1;
} else {
return sizeof(T) - folly::findLastSet(needsEscape) / 8;
}
}

// Escape a string so that it is legal to print it in JSON text.
void escapeString(
StringPiece input,
Expand All @@ -623,18 +663,48 @@ void escapeString(
auto* e = reinterpret_cast<const unsigned char*>(input.end());

while (p < e) {
// Find the longest prefix that does not need escaping, and copy
// it literally into the output string.
auto firstEsc = p;
while (firstEsc < e) {
auto avail = e - firstEsc;
uint64_t word = 0;
if (avail >= 8) {
word = folly::loadUnaligned<uint64_t>(firstEsc);
} else {
memcpy(static_cast<void*>(&word), firstEsc, avail);
}
auto prefix = firstEscapableInWord(word);
DCHECK_LE(prefix, avail);
firstEsc += prefix;
if (prefix < 8) {
break;
}
}
if (firstEsc > p) {
out.append(reinterpret_cast<const char*>(p), firstEsc - p);
p = firstEsc;
// We can't be in the middle of a multibyte sequence, so we can reset q.
q = p;
if (p == e) {
break;
}
}

// Handle the next byte that may need escaping.

// Since non-ascii encoding inherently does utf8 validation
// we explicitly validate utf8 only if non-ascii encoding is disabled.
if ((opts.validate_utf8 || opts.skip_invalid_utf8)
&& !opts.encode_non_ascii) {
// to achieve better spatial and temporal coherence
// To achieve better spatial and temporal coherence
// we do utf8 validation progressively along with the
// string-escaping instead of two separate passes
// string-escaping instead of two separate passes.

// as the encoding progresses, q will stay at or ahead of p
CHECK(q >= p);
// As the encoding progresses, q will stay at or ahead of p.
CHECK_GE(q, p);

// as p catches up with q, move q forward
// As p catches up with q, move q forward.
if (q == p) {
// calling utf8_decode has the side effect of
// checking that utf8 encodings are valid
Expand All @@ -650,14 +720,16 @@ void escapeString(
// note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte
char32_t v = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
out.append("\\u");
out.push_back(hexDigit(uint8_t(v >> 12)));
out.push_back(hexDigit((v >> 8) & 0x0f));
out.push_back(hexDigit((v >> 4) & 0x0f));
out.push_back(hexDigit(v & 0x0f));
char buf[] = "\\u\0\0\0\0";
buf[2] = hexDigit(uint8_t(v >> 12));
buf[3] = hexDigit((v >> 8) & 0x0f);
buf[4] = hexDigit((v >> 4) & 0x0f);
buf[5] = hexDigit(v & 0x0f);
out.append(buf, 6);
} else if (*p == '\\' || *p == '\"') {
out.push_back('\\');
out.push_back(char(*p++));
char buf[] = "\\\0";
buf[1] = char(*p++);
out.append(buf, 2);
} else if (*p <= 0x1f) {
switch (*p) {
case '\b': out.append("\\b"); p++; break;
Expand All @@ -666,11 +738,12 @@ void escapeString(
case '\r': out.append("\\r"); p++; break;
case '\t': out.append("\\t"); p++; break;
default:
// note that this if condition captures non readable chars
// Note that this if condition captures non readable chars
// with value < 32, so size = 1 byte (e.g control chars).
out.append("\\u00");
out.push_back(hexDigit(uint8_t((*p & 0xf0) >> 4)));
out.push_back(hexDigit(uint8_t(*p & 0xf)));
char buf[] = "\\u00\0\0";
buf[4] = hexDigit(uint8_t((*p & 0xf0) >> 4));
buf[5] = hexDigit(uint8_t(*p & 0xf));
out.append(buf, 6);
p++;
}
} else {
Expand Down
99 changes: 50 additions & 49 deletions folly/test/JsonOtherTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,41 @@
#include <folly/json.h>

#include <folly/Benchmark.h>
#include <folly/Conv.h>
#include <folly/FileUtil.h>
#include <folly/Range.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/GTest.h>

using folly::dynamic;
using folly::parseJson;
using folly::toJson;

constexpr folly::StringPiece kLargeAsciiString =
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk";

constexpr folly::StringPiece kLargeNonAsciiString =
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy";

TEST(Json, StripComments) {
const std::string kTestDir = "folly/test/";
const std::string kTestFile = "json_test_data/commented.json";
Expand All @@ -44,60 +71,44 @@ TEST(Json, StripComments) {
}

BENCHMARK(jsonSerialize, iters) {
const dynamic obj = kLargeNonAsciiString;

folly::json::serialization_opts opts;
for (size_t i = 0; i < iters; ++i) {
folly::json::serialize(
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy",
opts);
folly::json::serialize(obj, opts);
}
}

BENCHMARK(jsonSerializeWithNonAsciiEncoding, iters) {
const dynamic obj = kLargeNonAsciiString;

folly::json::serialization_opts opts;
opts.encode_non_ascii = true;

for (size_t i = 0; i < iters; ++i) {
folly::json::serialize(
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy",
opts);
folly::json::serialize(obj, opts);
}
}

BENCHMARK(jsonSerializeWithUtf8Validation, iters) {
const dynamic obj = kLargeNonAsciiString;

folly::json::serialization_opts opts;
opts.validate_utf8 = true;

for (size_t i = 0; i < iters; ++i) {
folly::json::serialize(
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy"
"qwerty \xc2\x80 \xef\xbf\xbf poiuy",
opts);
folly::json::serialize(obj, opts);
}
}

BENCHMARK(jsonSerializeAsciiWithUtf8Validation, iters) {
const dynamic obj = kLargeAsciiString;

folly::json::serialization_opts opts;
opts.validate_utf8 = true;

for (size_t i = 0; i < iters; ++i) {
folly::json::serialize(obj, opts);
}
}

Expand All @@ -114,20 +125,10 @@ BENCHMARK(parseNormalString, iters) {
}

BENCHMARK(parseBigString, iters) {
const auto json = folly::to<std::string>('"', kLargeAsciiString, '"');

for (size_t i = 0; i < iters; ++i) {
parseJson("\""
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"akjhfk jhkjlakjhfk jhkjlakjhfk jhkjl akjhfk"
"\"");
parseJson(json);
}
}

Expand Down
47 changes: 47 additions & 0 deletions folly/test/JsonTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,53 @@ TEST(Json, JsonEscape) {
R"("\b\f\n\r\u0001\t\\\"/\u000b\u0007")");
}

TEST(Json, EscapeCornerCases) {
// The escaping logic uses some bitwise operations to determine
// which bytes need escaping 8 bytes at a time. Test that this logic
// is correct regardless of positions by planting 2 characters that
// may need escaping at each possible position and checking the
// result, for varying string lengths.

folly::json::serialization_opts opts;
opts.validate_utf8 = true;

std::string s;
std::string expected;
for (bool ascii : {true, false}) {
opts.encode_non_ascii = ascii;

for (size_t len = 2; len < 32; ++len) {
for (size_t i = 0; i < len; ++i) {
for (size_t j = 0; j < len; ++j) {
if (i == j) {
continue;
}

s.clear();
expected.clear();

expected.push_back('"');
for (size_t pos = 0; pos < len; ++pos) {
if (pos == i) {
s.push_back('\\');
expected.append("\\\\");
} else if (pos == j) {
s.append("\xe2\x82\xac");
expected.append(ascii ? "\\u20ac" : "\xe2\x82\xac");
} else {
s.push_back('x');
expected.push_back('x');
}
}
expected.push_back('"');

EXPECT_EQ(folly::json::serialize(s, opts), expected) << ascii;
}
}
}
}
}

TEST(Json, JsonNonAsciiEncoding) {
folly::json::serialization_opts opts;
opts.encode_non_ascii = true;
Expand Down

0 comments on commit 2f0cabf

Please sign in to comment.