Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 651 lines (597 sloc) 18.865 kB
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
1 /*
52ba96e @andralex readFile reads an entire file into a string, vector<char>, or similar
andralex authored
2 * Copyright 2014 Facebook, Inc.
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef FOLLY_STRING_INL_H_
18 #define FOLLY_STRING_INL_H_
19
20 #include <stdexcept>
f3f96c6 @philippv strings join
philippv authored
21 #include <iterator>
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
22
23 #ifndef FOLLY_BASE_STRING_H_
24 #error This file may only be included from String.h
25 #endif
26
27 namespace folly {
28
29 namespace detail {
30 // Map from character code to value of one-character escape sequence
31 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
32 // an octal escape sequence, or 'P' if the character is printable and
33 // should be printed as is.
34 extern const char cEscapeTable[];
35 } // namespace detail
36
37 template <class String>
38 void cEscape(StringPiece str, String& out) {
39 char esc[4];
40 esc[0] = '\\';
41 out.reserve(out.size() + str.size());
42 auto p = str.begin();
43 auto last = p; // last regular character
44 // We advance over runs of regular characters (printable, not double-quote or
45 // backslash) and copy them in one go; this is faster than calling push_back
46 // repeatedly.
47 while (p != str.end()) {
48 char c = *p;
49 unsigned char v = static_cast<unsigned char>(c);
50 char e = detail::cEscapeTable[v];
51 if (e == 'P') { // printable
52 ++p;
53 } else if (e == 'O') { // octal
54 out.append(&*last, p - last);
55 esc[1] = '0' + ((v >> 6) & 7);
56 esc[2] = '0' + ((v >> 3) & 7);
57 esc[3] = '0' + (v & 7);
58 out.append(esc, 4);
59 ++p;
60 last = p;
61 } else { // special 1-character escape
62 out.append(&*last, p - last);
63 esc[1] = e;
64 out.append(esc, 2);
65 ++p;
66 last = p;
67 }
68 }
69 out.append(&*last, p - last);
70 }
71
72 namespace detail {
73 // Map from the character code of the character following a backslash to
74 // the unescaped character if a valid one-character escape sequence
75 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
76 // octal escape sequence, 'X' if this is the first character of a
77 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
78 extern const char cUnescapeTable[];
79
80 // Map from the character code to the hex value, or 16 if invalid hex char.
81 extern const unsigned char hexTable[];
82 } // namespace detail
83
84 template <class String>
85 void cUnescape(StringPiece str, String& out, bool strict) {
86 out.reserve(out.size() + str.size());
87 auto p = str.begin();
88 auto last = p; // last regular character (not part of an escape sequence)
89 // We advance over runs of regular characters (not backslash) and copy them
90 // in one go; this is faster than calling push_back repeatedly.
91 while (p != str.end()) {
92 char c = *p;
93 if (c != '\\') { // normal case
94 ++p;
95 continue;
96 }
97 out.append(&*last, p - last);
98 if (p == str.end()) { // backslash at end of string
99 if (strict) {
100 throw std::invalid_argument("incomplete escape sequence");
101 }
102 out.push_back('\\');
103 last = p;
104 continue;
105 }
106 ++p;
107 char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
108 if (e == 'O') { // octal
109 unsigned char val = 0;
110 for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
111 ++i, ++p) {
112 val = (val << 3) | (*p - '0');
113 }
114 out.push_back(val);
115 last = p;
116 } else if (e == 'X') { // hex
117 ++p;
118 if (p == str.end()) { // \x at end of string
119 if (strict) {
120 throw std::invalid_argument("incomplete hex escape sequence");
121 }
122 out.append("\\x");
123 last = p;
124 continue;
125 }
126 unsigned char val = 0;
127 unsigned char h;
128 for (; (p != str.end() &&
129 (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
130 ++p) {
131 val = (val << 4) | h;
132 }
133 out.push_back(val);
134 last = p;
135 } else if (e == 'I') { // invalid
136 if (strict) {
137 throw std::invalid_argument("invalid escape sequence");
138 }
139 out.push_back('\\');
140 out.push_back(*p);
141 ++p;
142 last = p;
143 } else { // standard escape sequence, \' etc
144 out.push_back(e);
145 ++p;
146 last = p;
147 }
148 }
149 out.append(&*last, p - last);
150 }
151
152 namespace detail {
f17d138 @tudor Revert "Revert "URI parsing in folly""
tudor authored
153 // Map from character code to escape mode:
154 // 0 = pass through
155 // 1 = unused
156 // 2 = pass through in PATH mode
157 // 3 = space, replace with '+' in QUERY mode
158 // 4 = percent-encode
159 extern const unsigned char uriEscapeTable[];
160 } // namespace detail
161
162 template <class String>
163 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
164 static const char hexValues[] = "0123456789abcdef";
165 char esc[3];
166 esc[0] = '%';
167 // Preallocate assuming that 25% of the input string will be escaped
168 out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
169 auto p = str.begin();
170 auto last = p; // last regular character
171 // We advance over runs of passthrough characters and copy them in one go;
172 // this is faster than calling push_back repeatedly.
173 unsigned char minEncode = static_cast<unsigned char>(mode);
174 while (p != str.end()) {
175 char c = *p;
176 unsigned char v = static_cast<unsigned char>(c);
177 unsigned char discriminator = detail::uriEscapeTable[v];
178 if (LIKELY(discriminator <= minEncode)) {
179 ++p;
180 } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
181 out.append(&*last, p - last);
182 out.push_back('+');
183 ++p;
184 last = p;
185 } else {
186 out.append(&*last, p - last);
187 esc[1] = hexValues[v >> 4];
188 esc[2] = hexValues[v & 0x0f];
189 out.append(esc, 3);
190 ++p;
191 last = p;
192 }
193 }
194 out.append(&*last, p - last);
195 }
196
197 template <class String>
198 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
199 out.reserve(out.size() + str.size());
200 auto p = str.begin();
201 auto last = p;
202 // We advance over runs of passthrough characters and copy them in one go;
203 // this is faster than calling push_back repeatedly.
204 while (p != str.end()) {
205 char c = *p;
206 unsigned char v = static_cast<unsigned char>(v);
207 switch (c) {
208 case '%':
209 {
210 if (UNLIKELY(std::distance(p, str.end()) < 3)) {
211 throw std::invalid_argument("incomplete percent encode sequence");
212 }
213 auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
214 auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
215 if (UNLIKELY(h1 == 16 || h2 == 16)) {
216 throw std::invalid_argument("invalid percent encode sequence");
217 }
218 out.append(&*last, p - last);
219 out.push_back((h1 << 4) | h2);
220 p += 3;
221 last = p;
222 break;
223 }
224 case '+':
225 if (mode == UriEscapeMode::QUERY) {
226 out.append(&*last, p - last);
227 out.push_back(' ');
228 ++p;
229 last = p;
230 break;
231 }
232 // else fallthrough
233 default:
234 ++p;
235 break;
236 }
237 }
238 out.append(&*last, p - last);
239 }
240
241 namespace detail {
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
242
243 /*
244 * The following functions are type-overloaded helpers for
245 * internalSplit().
246 */
247 inline size_t delimSize(char) { return 1; }
248 inline size_t delimSize(StringPiece s) { return s.size(); }
249 inline bool atDelim(const char* s, char c) {
250 return *s == c;
251 }
252 inline bool atDelim(const char* s, StringPiece sp) {
253 return !std::memcmp(s, sp.start(), sp.size());
254 }
255
256 // These are used to short-circuit internalSplit() in the case of
257 // 1-character strings.
258 inline char delimFront(char c) {
259 // This one exists only for compile-time; it should never be called.
260 std::abort();
261 return c;
262 }
263 inline char delimFront(StringPiece s) {
264 assert(!s.empty() && s.start() != nullptr);
265 return *s.start();
266 }
267
268 /*
269 * These output conversion templates allow us to support multiple
270 * output string types, even when we are using an arbitrary
271 * OutputIterator.
272 */
273 template<class OutStringT> struct OutputConverter {};
274
275 template<> struct OutputConverter<std::string> {
276 std::string operator()(StringPiece sp) const {
277 return sp.toString();
278 }
279 };
280
281 template<> struct OutputConverter<fbstring> {
282 fbstring operator()(StringPiece sp) const {
283 return sp.toFbstring();
284 }
285 };
286
287 template<> struct OutputConverter<StringPiece> {
288 StringPiece operator()(StringPiece sp) const { return sp; }
289 };
290
291 /*
292 * Shared implementation for all the split() overloads.
293 *
294 * This uses some external helpers that are overloaded to let this
295 * algorithm be more performant if the deliminator is a single
296 * character instead of a whole string.
297 *
298 * @param ignoreEmpty iff true, don't copy empty segments to output
299 */
300 template<class OutStringT, class DelimT, class OutputIterator>
301 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
302 bool ignoreEmpty) {
07ed0a7 @tudor Fix overeager assertion
tudor authored
303 assert(sp.empty() || sp.start() != nullptr);
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
304
305 const char* s = sp.start();
306 const size_t strSize = sp.size();
307 const size_t dSize = delimSize(delim);
308
309 OutputConverter<OutStringT> conv;
310
311 if (dSize > strSize || dSize == 0) {
312 if (!ignoreEmpty || strSize > 0) {
313 *out++ = conv(sp);
314 }
315 return;
316 }
317 if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
318 // Call the char version because it is significantly faster.
319 return internalSplit<OutStringT>(delimFront(delim), sp, out,
320 ignoreEmpty);
321 }
322
323 int tokenStartPos = 0;
324 int tokenSize = 0;
325 for (int i = 0; i <= strSize - dSize; ++i) {
326 if (atDelim(&s[i], delim)) {
327 if (!ignoreEmpty || tokenSize > 0) {
328 *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
329 }
330
331 tokenStartPos = i + dSize;
332 tokenSize = 0;
333 i += dSize - 1;
334 } else {
335 ++tokenSize;
336 }
337 }
338
339 if (!ignoreEmpty || tokenSize > 0) {
340 tokenSize = strSize - tokenStartPos;
341 *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
342 }
343 }
344
345 template<class String> StringPiece prepareDelim(const String& s) {
346 return StringPiece(s);
347 }
348 inline char prepareDelim(char c) { return c; }
349
6b0452d Fixed-size split()
Tom Jackson authored
350 template<bool exact,
351 class Delim>
352 bool splitFixed(const Delim& delimiter,
353 StringPiece input,
354 StringPiece& out) {
355 if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
356 return false;
357 }
358 out = input;
359 return true;
360 }
361
362 template<bool exact,
363 class Delim,
364 class... StringPieces>
365 bool splitFixed(const Delim& delimiter,
366 StringPiece input,
367 StringPiece& outHead,
368 StringPieces&... outTail) {
369 size_t cut = input.find(delimiter);
370 if (UNLIKELY(cut == std::string::npos)) {
371 return false;
372 }
373 StringPiece head(input.begin(), input.begin() + cut);
374 StringPiece tail(input.begin() + cut + detail::delimSize(delimiter),
375 input.end());
376 if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
377 outHead = head;
378 return true;
379 }
380 return false;
381 }
382
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
383 }
384
385 //////////////////////////////////////////////////////////////////////
386
387 template<class Delim, class String, class OutputType>
388 void split(const Delim& delimiter,
389 const String& input,
390 std::vector<OutputType>& out,
391 bool ignoreEmpty) {
392 detail::internalSplit<OutputType>(
393 detail::prepareDelim(delimiter),
394 StringPiece(input),
395 std::back_inserter(out),
396 ignoreEmpty);
397 }
398
399 template<class Delim, class String, class OutputType>
400 void split(const Delim& delimiter,
401 const String& input,
402 fbvector<OutputType>& out,
a8370f0 @juchem Clang complains about errors in folly String-inl.h and FBString.h (P1…
juchem authored
403 bool ignoreEmpty) {
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
404 detail::internalSplit<OutputType>(
405 detail::prepareDelim(delimiter),
406 StringPiece(input),
407 std::back_inserter(out),
408 ignoreEmpty);
409 }
410
411 template<class OutputValueType, class Delim, class String,
412 class OutputIterator>
413 void splitTo(const Delim& delimiter,
414 const String& input,
415 OutputIterator out,
416 bool ignoreEmpty) {
417 detail::internalSplit<OutputValueType>(
418 detail::prepareDelim(delimiter),
419 StringPiece(input),
420 out,
421 ignoreEmpty);
422 }
423
6b0452d Fixed-size split()
Tom Jackson authored
424 template<bool exact,
425 class Delim,
426 class... StringPieces>
427 bool split(const Delim& delimiter,
428 StringPiece input,
429 StringPiece& outHead,
430 StringPieces&... outTail) {
431 return detail::splitFixed<exact>(
432 detail::prepareDelim(delimiter),
433 input,
434 outHead,
435 outTail...);
436 }
437
f3f96c6 @philippv strings join
philippv authored
438 namespace detail {
439
440 template <class Iterator>
441 struct IsStringContainerIterator :
442 IsSomeString<typename std::iterator_traits<Iterator>::value_type> {
443 };
444
445 template <class Delim, class Iterator, class String>
446 void internalJoinAppend(Delim delimiter,
447 Iterator begin,
448 Iterator end,
449 String& output) {
450 assert(begin != end);
5df0c97 @philippv one more simple folly::join optimization
philippv authored
451 if (std::is_same<Delim, StringPiece>::value &&
452 delimSize(delimiter) == 1) {
453 internalJoinAppend(delimFront(delimiter), begin, end, output);
454 return;
455 }
f3f96c6 @philippv strings join
philippv authored
456 toAppend(*begin, &output);
457 while (++begin != end) {
458 toAppend(delimiter, *begin, &output);
459 }
460 }
461
462 template <class Delim, class Iterator, class String>
463 typename std::enable_if<IsStringContainerIterator<Iterator>::value>::type
464 internalJoin(Delim delimiter,
465 Iterator begin,
466 Iterator end,
467 String& output) {
468 output.clear();
469 if (begin == end) {
470 return;
471 }
472 const size_t dsize = delimSize(delimiter);
473 Iterator it = begin;
474 size_t size = it->size();
475 while (++it != end) {
476 size += dsize + it->size();
477 }
478 output.reserve(size);
479 internalJoinAppend(delimiter, begin, end, output);
480 }
481
482 template <class Delim, class Iterator, class String>
483 typename std::enable_if<!IsStringContainerIterator<Iterator>::value>::type
484 internalJoin(Delim delimiter,
485 Iterator begin,
486 Iterator end,
487 String& output) {
488 output.clear();
489 if (begin == end) {
490 return;
491 }
492 internalJoinAppend(delimiter, begin, end, output);
493 }
494
495 } // namespace detail
496
497 template <class Delim, class Iterator, class String>
498 void join(const Delim& delimiter,
499 Iterator begin,
500 Iterator end,
501 String& output) {
502 detail::internalJoin(
503 detail::prepareDelim(delimiter),
504 begin,
505 end,
506 output);
507 }
508
6834f1d @chipturner Move and refactor code
chipturner authored
509 template <class String1, class String2>
510 void backslashify(const String1& input, String2& output, bool hex_style) {
511 static const char hexValues[] = "0123456789abcdef";
512 output.clear();
513 output.reserve(3 * input.size());
514 for (unsigned char c : input) {
515 // less than space or greater than '~' are considered unprintable
516 if (c < 0x20 || c > 0x7e || c == '\\') {
517 bool hex_append = false;
518 output.push_back('\\');
519 if (hex_style) {
520 hex_append = true;
521 } else {
522 if (c == '\r') output += 'r';
523 else if (c == '\n') output += 'n';
524 else if (c == '\t') output += 't';
525 else if (c == '\a') output += 'a';
526 else if (c == '\b') output += 'b';
527 else if (c == '\0') output += '0';
528 else if (c == '\\') output += '\\';
529 else {
530 hex_append = true;
531 }
532 }
533 if (hex_append) {
534 output.push_back('x');
535 output.push_back(hexValues[(c >> 4) & 0xf]);
536 output.push_back(hexValues[c & 0xf]);
537 }
538 } else {
539 output += c;
540 }
541 }
542 }
543
544 template <class String1, class String2>
545 void humanify(const String1& input, String2& output) {
546 int numUnprintable = 0;
547 int numPrintablePrefix = 0;
548 for (unsigned char c : input) {
549 if (c < 0x20 || c > 0x7e || c == '\\') {
550 ++numUnprintable;
551 }
552 if (numUnprintable == 0) {
553 ++numPrintablePrefix;
554 }
555 }
556
557 // hexlify doubles a string's size; backslashify can potentially
558 // explode it by 4x. Now, the printable range of the ascii
559 // "spectrum" is around 95 out of 256 values, so a "random" binary
560 // string should be around 60% unprintable. We use a 50% hueristic
561 // here, so if a string is 60% unprintable, then we just use hex
562 // output. Otherwise we backslash.
563 //
564 // UTF8 is completely ignored; as a result, utf8 characters will
565 // likely be \x escaped (since most common glyphs fit in two bytes).
566 // This is a tradeoff of complexity/speed instead of a convenience
567 // that likely would rarely matter. Moreover, this function is more
568 // about displaying underlying bytes, not about displaying glyphs
569 // from languages.
570 if (numUnprintable == 0) {
571 output = input;
572 } else if (5 * numUnprintable >= 3 * input.size()) {
573 // However! If we have a "meaningful" prefix of printable
574 // characters, say 20% of the string, we backslashify under the
575 // assumption viewing the prefix as ascii is worth blowing the
576 // output size up a bit.
577 if (5 * numPrintablePrefix >= input.size()) {
578 backslashify(input, output);
579 } else {
580 output = "0x";
581 hexlify(input, output, true /* append output */);
582 }
583 } else {
584 backslashify(input, output);
585 }
586 }
587
588 template<class InputString, class OutputString>
589 bool hexlify(const InputString& input, OutputString& output,
a8370f0 @juchem Clang complains about errors in folly String-inl.h and FBString.h (P1…
juchem authored
590 bool append_output) {
6834f1d @chipturner Move and refactor code
chipturner authored
591 if (!append_output) output.clear();
592
593 static char hexValues[] = "0123456789abcdef";
594 int j = output.size();
595 output.resize(2 * input.size() + output.size());
596 for (int i = 0; i < input.size(); ++i) {
597 int ch = input[i];
598 output[j++] = hexValues[(ch >> 4) & 0xf];
599 output[j++] = hexValues[ch & 0xf];
600 }
601 return true;
602 }
603
604 template<class InputString, class OutputString>
605 bool unhexlify(const InputString& input, OutputString& output) {
606 if (input.size() % 2 != 0) {
607 return false;
608 }
609 output.resize(input.size() / 2);
610 int j = 0;
611 auto unhex = [](char c) -> int {
612 return c >= '0' && c <= '9' ? c - '0' :
613 c >= 'A' && c <= 'F' ? c - 'A' + 10 :
614 c >= 'a' && c <= 'f' ? c - 'a' + 10 :
615 -1;
616 };
617
618 for (int i = 0; i < input.size(); i += 2) {
619 int highBits = unhex(input[i]);
620 int lowBits = unhex(input[i + 1]);
621 if (highBits < 0 || lowBits < 0) {
622 return false;
623 }
624 output[j++] = (highBits << 4) + lowBits;
625 }
626 return true;
627 }
628
27494a2 @jdelong Pull from FB rev 63ce89e2f2301e6bba44a111cc7d4218022156f6
jdelong authored
629 namespace detail {
630 /**
631 * Hex-dump at most 16 bytes starting at offset from a memory area of size
632 * bytes. Return the number of bytes actually dumped.
633 */
634 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
635 std::string& line);
636 } // namespace detail
637
638 template <class OutIt>
639 void hexDump(const void* ptr, size_t size, OutIt out) {
640 size_t offset = 0;
641 std::string line;
642 while (offset < size) {
643 offset += detail::hexDumpLine(ptr, offset, size, line);
644 *out++ = line;
645 }
646 }
647
648 } // namespace folly
649
650 #endif /* FOLLY_STRING_INL_H_ */
Something went wrong with that request. Please try again.