Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 697 lines (596 sloc) 16.879 kb
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
1 /*
2 * Copyright 2012 Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "folly/json.h"
18 #include <cassert>
19 #include <boost/next_prior.hpp>
20 #include <boost/algorithm/string.hpp>
21
22 #include "folly/Range.h"
23 #include "folly/Unicode.h"
24 #include "folly/Conv.h"
25
26 namespace folly {
27
28 //////////////////////////////////////////////////////////////////////
29
30 namespace json {
31 namespace {
32
33 char32_t decodeUtf8(const char*& p, const char* const e) {
34 /* The following encodings are valid, except for the 5 and 6 byte
35 * combinations:
36 * 0xxxxxxx
37 * 110xxxxx 10xxxxxx
38 * 1110xxxx 10xxxxxx 10xxxxxx
39 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
40 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
41 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
42 */
43
44 if (p >= e) {
45 throw std::runtime_error("folly::decodeUtf8 empty/invalid string");
46 }
47
48 unsigned char fst = *p;
49 if (!(fst & 0x80)) {
50 // trivial case
51 return *p++;
52 }
53
54 static const uint32_t bitMask[] = {
55 (1 << 7) - 1,
56 (1 << 11) - 1,
57 (1 << 16) - 1,
58 (1 << 21) - 1
59 };
60
61 // upper control bits are masked out later
62 uint32_t d = fst;
63
64 if ((fst & 0xC0) != 0xC0) {
65 throw std::runtime_error(
66 to<std::string>("folly::decodeUtf8 i=0 d=", d));
67 }
68
69 fst <<= 1;
70
71 for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
72 unsigned char tmp = p[i];
73
74 if ((tmp & 0xC0) != 0x80) {
75 throw std::runtime_error(
76 to<std::string>("folly::decodeUtf8 i=", i, " tmp=", (uint32_t)tmp));
77 }
78
79 d = (d << 6) | (tmp & 0x3F);
80 fst <<= 1;
81
82 if (!(fst & 0x80)) {
83 d &= bitMask[i];
84
85 // overlong, could have been encoded with i bytes
86 if ((d & ~bitMask[i - 1]) == 0) {
87 throw std::runtime_error(
88 to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
89 }
90
91 // check for surrogates only needed for 3 bytes
92 if (i == 2) {
93 if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
94 throw std::runtime_error(
95 to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
96 }
97 }
98
99 p += i + 1;
100 return d;
101 }
102 }
103
104 throw std::runtime_error("folly::decodeUtf8 encoding length maxed out");
105 }
106
107 // Escape a string so that it is legal to print it in JSON text.
108 void escapeString(StringPiece input,
109 fbstring& out,
110 const serialization_opts& opts) {
111 auto hexDigit = [] (int c) -> char {
112 return c < 10 ? c + '0' : c - 10 + 'a';
113 };
114
115 out.reserve(out.size() + input.size() + 2);
116 out.push_back('\"');
117
118 const char* p = input.begin();
119 const char* q = input.begin();
120 const char* const e = input.end();
121
122 while (p < e) {
123 // Since non-ascii encoding inherently does utf8 validation
124 // we explicitly validate utf8 only if non-ascii encoding is disabled.
125 if (opts.validate_utf8 && !opts.encode_non_ascii) {
126 // to achieve better spatial and temporal coherence
127 // we do utf8 validation progressively along with the
128 // string-escaping instead of two separate passes
129
130 // as the encoding progresses, q will stay at or ahead of p
131 CHECK(q >= p);
132
133 // as p catches up with q, move q forward
134 if (q == p) {
135 // calling utf8_decode has the side effect of
136 // checking that utf8 encodings are valid
137 decodeUtf8(q, e);
138 }
139 }
140
141 if (opts.encode_non_ascii && (*p & 0x80)) {
142 char32_t v = decodeUtf8(p, e);
143 out.append("\\u");
144 out.push_back(hexDigit(v >> 12));
145 out.push_back(hexDigit((v >> 8) & 0x0f));
146 out.push_back(hexDigit((v >> 4) & 0x0f));
147 out.push_back(hexDigit(v & 0x0f));
148 continue;
149 }
150 if (*p == '\\' || *p == '\"') {
151 out.push_back('\\');
152 out.push_back(*p++);
153 continue;
154 }
155 if (*p <= 0x1f) {
156 // note that this if condition captures both control characters
157 // and extended ascii characters
158 out.append("\\u00");
159 out.push_back(hexDigit((*p & 0xf0) >> 4));
160 out.push_back(hexDigit(*p & 0xf));
161 p++;
162 continue;
163 }
164 out.push_back(*p++);
165 }
166
167 out.push_back('\"');
168 }
169
170 struct Printer {
171 explicit Printer(fbstring& out,
172 unsigned* indentLevel,
173 serialization_opts const* opts)
174 : out_(out)
175 , indentLevel_(indentLevel)
176 , opts_(*opts)
177 {}
178
179 void operator()(dynamic const& v) const {
180 switch (v.type()) {
181 case dynamic::DOUBLE:
182 toAppend(v.asDouble(), &out_);
183 break;
184 case dynamic::INT64: {
185 auto intval = v.asInt();
186 if (opts_.javascript_safe) {
187 // Use folly::to to check that this integer can be represented
188 // as a double without loss of precision.
189 intval = int64_t(to<double>(intval));
190 }
191 toAppend(intval, &out_);
192 break;
193 }
194 case dynamic::BOOL:
195 out_ += v.asBool() ? "true" : "false";
196 break;
197 case dynamic::NULLT:
198 out_ += "null";
199 break;
200 case dynamic::STRING:
201 escapeString(v.asString(), out_, opts_);
202 break;
203 case dynamic::OBJECT:
204 printObject(v);
205 break;
206 case dynamic::ARRAY:
207 printArray(v);
208 break;
209 default:
210 CHECK(0) << "Bad type " << v.type();
211 }
212 }
213
214 private:
215 void printKV(const std::pair<dynamic, dynamic>& p) const {
216 if (!opts_.allow_non_string_keys && !p.first.isString()) {
217 throw std::runtime_error("folly::toJson: JSON object key was not a "
218 "string");
219 }
220 (*this)(p.first);
221 mapColon();
222 (*this)(p.second);
223 }
224
225 void printObject(dynamic const& o) const {
226 if (o.empty()) {
227 out_ += "{}";
228 return;
229 }
230
231 out_ += '{';
232 indent();
233 newline();
234 auto it = o.items().begin();
235 printKV(*it);
236 for (++it; it != o.items().end(); ++it) {
237 out_ += ',';
238 newline();
239 printKV(*it);
240 }
241 outdent();
242 newline();
243 out_ += '}';
244 }
245
246 void printArray(dynamic const& a) const {
247 if (a.empty()) {
248 out_ += "[]";
249 return;
250 }
251
252 out_ += '[';
253 indent();
254 newline();
255 (*this)(a[0]);
256 for (auto& val : makeRange(boost::next(a.begin()), a.end())) {
257 out_ += ',';
258 newline();
259 (*this)(val);
260 }
261 outdent();
262 newline();
263 out_ += ']';
264 }
265
266 private:
267 void outdent() const {
268 if (indentLevel_) {
269 --*indentLevel_;
270 }
271 }
272
273 void indent() const {
274 if (indentLevel_) {
275 ++*indentLevel_;
276 }
277 }
278
279 void newline() const {
280 if (indentLevel_) {
281 out_ += to<fbstring>('\n', fbstring(*indentLevel_ * 2, ' '));
282 }
283 }
284
285 void mapColon() const {
286 out_ += indentLevel_ ? " : " : ":";
287 }
288
289 private:
290 fbstring& out_;
291 unsigned* const indentLevel_;
292 serialization_opts const& opts_;
293 };
294
295 //////////////////////////////////////////////////////////////////////
296
297 struct ParseError : std::runtime_error {
298 explicit ParseError(int line)
299 : std::runtime_error(to<std::string>("json parse error on line ", line))
300 {}
301
302 explicit ParseError(int line, std::string const& context,
303 std::string const& expected)
304 : std::runtime_error(to<std::string>("json parse error on line ", line,
305 !context.empty() ? to<std::string>(" near `", context, '\'')
306 : "",
307 ": ", expected))
308 {}
309
310 explicit ParseError(std::string const& what)
311 : std::runtime_error("json parse error: " + what)
312 {}
313 };
314
315 // Wraps our input buffer with some helper functions.
316 struct Input {
317 explicit Input(StringPiece range)
318 : range_(range)
319 , lineNum_(0)
320 {
321 storeCurrent();
322 }
323
324 Input(Input const&) = delete;
325 Input& operator=(Input const&) = delete;
326
327 char const* begin() const { return range_.begin(); }
328
329 // Parse ahead for as long as the supplied predicate is satisfied,
330 // returning a range of what was skipped.
331 template<class Predicate>
332 StringPiece skipWhile(const Predicate& p) {
333 std::size_t skipped = 0;
334 for (; skipped < range_.size(); ++skipped) {
335 if (!p(range_[skipped])) {
336 break;
337 }
338 if (range_[skipped] == '\n') {
339 ++lineNum_;
340 }
341 }
342 auto ret = range_.subpiece(0, skipped);
343 range_.advance(skipped);
344 storeCurrent();
345 return ret;
346 }
347
348 StringPiece skipDigits() {
349 return skipWhile([] (char c) { return c >= '0' && c <= '9'; });
350 }
351
352 void skipWhitespace() {
353 // Spaces other than ' ' characters are less common but should be
354 // checked. This configuration where we loop on the ' '
355 // separately from oddspaces was empirically fastest.
356 auto oddspace = [] (char c) {
357 return c == '\n' || c == '\t' || c == '\r';
358 };
359
360 loop:
361 for (; !range_.empty() && range_.front() == ' '; range_.pop_front()) {
362 }
363 if (!range_.empty() && oddspace(range_.front())) {
364 range_.pop_front();
365 goto loop;
366 }
367 storeCurrent();
368 }
369
370 void expect(char c) {
371 if (**this != c) {
372 throw ParseError(lineNum_, context(),
373 to<std::string>("expected '", c, '\''));
374 }
375 ++*this;
376 }
377
378 std::size_t size() const {
379 return range_.size();
380 }
381
382 int operator*() const {
383 return current_;
384 }
385
386 void operator++() {
387 range_.pop_front();
388 storeCurrent();
389 }
390
391 template<class T>
392 T extract() {
393 try {
394 return to<T>(&range_);
395 } catch (std::exception const& e) {
396 error(e.what());
397 }
398 }
399
400 bool consume(StringPiece str) {
401 if (boost::starts_with(range_, str)) {
402 range_.advance(str.size());
403 storeCurrent();
404 return true;
405 }
406 return false;
407 }
408
409 std::string context() const {
410 return range_.subpiece(0, 16 /* arbitrary */).toString();
411 }
412
413 dynamic error(char const* what) const {
414 throw ParseError(lineNum_, context(), what);
415 }
416
417 private:
418 void storeCurrent() {
419 current_ = range_.empty() ? EOF : range_.front();
420 }
421
422 private:
423 StringPiece range_;
424 unsigned lineNum_;
425 int current_;
426 };
427
428 dynamic parseValue(Input& in);
429 fbstring parseString(Input& in);
430
431 dynamic parseObject(Input& in) {
432 assert(*in == '{');
433 ++in;
434
435 dynamic ret = dynamic::object;
436
437 in.skipWhitespace();
438 if (*in == '}') {
439 ++in;
440 return ret;
441 }
442
443 for (;;) {
444 if (*in != '\"') {
445 in.error("expected string for object key name");
446 }
447 auto key = parseString(in);
448 in.skipWhitespace();
449 in.expect(':');
450 in.skipWhitespace();
451 ret.insert(std::move(key), parseValue(in));
452 in.skipWhitespace();
453 if (*in != ',') {
454 break;
455 }
456 ++in;
457 in.skipWhitespace();
458 }
459 in.expect('}');
460
461 return ret;
462 }
463
464 dynamic parseArray(Input& in) {
465 assert(*in == '[');
466 ++in;
467
468 dynamic ret = {};
469
470 in.skipWhitespace();
471 if (*in == ']') {
472 ++in;
473 return ret;
474 }
475
476 for (;;) {
477 ret.push_back(parseValue(in));
478 in.skipWhitespace();
479 if (*in != ',') {
480 break;
481 }
482 ++in;
483 in.skipWhitespace();
484 }
485 in.expect(']');
486
487 return ret;
488 }
489
490 dynamic parseNumber(Input& in) {
491 bool const negative = (*in == '-');
492 if (negative) {
493 ++in;
494 if (in.consume("Infinity")) {
495 return -std::numeric_limits<double>::infinity();
496 }
497 }
498
499 auto integral = in.skipDigits();
500 if (integral.empty()) {
501 in.error("expected digits after `-'");
502 }
503 auto const wasE = *in == 'e' || *in == 'E';
504 if (*in != '.' && !wasE) {
505 auto val = to<int64_t>(integral);
506 if (negative) {
507 val = -val;
508 }
509 in.skipWhitespace();
510 return val;
511 }
512
513 auto end = !wasE ? (++in, in.skipDigits().end()) : in.begin();
514 if (*in == 'e' || *in == 'E') {
515 ++in;
516 if (*in == '+' || *in == '-') {
517 ++in;
518 }
519 auto expPart = in.skipDigits();
520 end = expPart.end();
521 }
522 auto fullNum = makeRange(integral.begin(), end);
523
524 auto val = to<double>(fullNum);
525 if (negative) {
526 val *= -1;
527 }
528 return val;
529 }
530
531 fbstring decodeUnicodeEscape(Input& in) {
532 auto hexVal = [&] (char c) -> unsigned {
533 return c >= '0' && c <= '9' ? c - '0' :
534 c >= 'a' && c <= 'f' ? c - 'a' + 10 :
535 c >= 'A' && c <= 'F' ? c - 'A' + 10 :
536 (in.error("invalid hex digit"), 0);
537 };
538
539 auto readHex = [&] {
540 if (in.size() < 4) {
541 in.error("expected 4 hex digits");
542 }
543
544 uint16_t ret = hexVal(*in) * 4096;
545 ++in;
546 ret += hexVal(*in) * 256;
547 ++in;
548 ret += hexVal(*in) * 16;
549 ++in;
550 ret += hexVal(*in);
551 ++in;
552 return ret;
553 };
554
555 /*
556 * If the value encoded is in the surrogate pair range, we need to
557 * make sure there is another escape that we can use also.
558 */
559 uint32_t codePoint = readHex();
560 if (codePoint >= 0xd800 && codePoint <= 0xdbff) {
561 if (!in.consume("\\u")) {
562 in.error("expected another unicode escape for second half of "
563 "surrogate pair");
564 }
565 uint16_t second = readHex();
566 if (second >= 0xdc00 && second <= 0xdfff) {
567 codePoint = 0x10000 + ((codePoint & 0x3ff) << 10) +
568 (second & 0x3ff);
569 } else {
570 in.error("second character in surrogate pair is invalid");
571 }
572 } else if (codePoint >= 0xdc00 && codePoint <= 0xdfff) {
573 in.error("invalid unicode code point (in range [0xdc00,0xdfff])");
574 }
575
576 return codePointToUtf8(codePoint);
577 }
578
579 fbstring parseString(Input& in) {
580 assert(*in == '\"');
581 ++in;
582
583 fbstring ret;
584 for (;;) {
585 auto range = in.skipWhile(
586 [] (char c) { return c != '\"' && c != '\\'; }
587 );
588 ret.append(range.begin(), range.end());
589
590 if (*in == '\"') {
591 ++in;
592 break;
593 }
594 if (*in == '\\') {
595 ++in;
596 switch (*in) {
597 case '\"': ret.push_back('\"'); ++in; break;
598 case '\\': ret.push_back('\\'); ++in; break;
599 case '/': ret.push_back('/'); ++in; break;
600 case 'b': ret.push_back('\b'); ++in; break;
601 case 'f': ret.push_back('\f'); ++in; break;
602 case 'n': ret.push_back('\n'); ++in; break;
603 case 'r': ret.push_back('\r'); ++in; break;
604 case 't': ret.push_back('\t'); ++in; break;
605 case 'u': ++in; ret += decodeUnicodeEscape(in); break;
606 default: in.error(to<fbstring>("unknown escape ", *in,
607 " in string").c_str());
608 }
609 continue;
610 }
611 if (*in == EOF) {
612 in.error("unterminated string");
613 }
614 if (!*in) {
615 /*
616 * Apparently we're actually supposed to ban all control
617 * characters from strings. This seems unnecessarily
618 * restrictive, so we're only banning zero bytes. (Since the
619 * string is presumed to be UTF-8 encoded it's fine to just
620 * check this way.)
621 */
622 in.error("null byte in string");
623 }
624
625 ret.push_back(*in);
626 ++in;
627 }
628
629 return ret;
630 }
631
632 dynamic parseValue(Input& in) {
633 in.skipWhitespace();
634 return *in == '[' ? parseArray(in) :
635 *in == '{' ? parseObject(in) :
636 *in == '\"' ? parseString(in) :
637 (*in == '-' || (*in >= '0' && *in <= '9')) ? parseNumber(in) :
638 in.consume("true") ? true :
639 in.consume("false") ? false :
640 in.consume("null") ? nullptr :
641 in.consume("Infinity") ? std::numeric_limits<double>::infinity() :
642 in.consume("NaN") ? std::numeric_limits<double>::quiet_NaN() :
643 in.error("expected json value");
644 }
645
646 }
647
648 //////////////////////////////////////////////////////////////////////
649
650 fbstring serialize(dynamic const& dyn, serialization_opts const& opts) {
651 fbstring ret;
652 unsigned indentLevel = 0;
653 Printer p(ret, opts.pretty_formatting ? &indentLevel : nullptr, &opts);
654 p(dyn);
655 return ret;
656 }
657
658 }
659
660 //////////////////////////////////////////////////////////////////////
661
662 dynamic parseJson(StringPiece range) {
663 json::Input in(range);
664
665 auto ret = parseValue(in);
666 in.skipWhitespace();
667 if (*in != '\0' && in.size()) {
668 in.error("parsing didn't consume all input");
669 }
670 return ret;
671 }
672
673 fbstring toJson(dynamic const& dyn) {
674 return json::serialize(dyn, json::serialization_opts());
675 }
676
677 fbstring toPrettyJson(dynamic const& dyn) {
678 json::serialization_opts opts;
679 opts.pretty_formatting = true;
680 return json::serialize(dyn, opts);
681 }
682
683 //////////////////////////////////////////////////////////////////////
684 // dynamic::print_as_pseudo_json() is implemented here for header
685 // ordering reasons (most of the dynamic implementation is in
686 // dynamic-inl.h, which we don't want to include json.h).
687
688 void dynamic::print_as_pseudo_json(std::ostream& out) const {
689 json::serialization_opts opts;
690 opts.allow_non_string_keys = true;
691 out << json::serialize(*this, opts);
692 }
693
694 //////////////////////////////////////////////////////////////////////
695
696 }
Something went wrong with that request. Please try again.