Permalink
Browse files

Revert "Revert "URI parsing in folly""

Summary:
Now that the third_party link was updated in
https://phabricator.fb.com/D778617, we're good.

Test Plan: fbconfig -r thrift && fbmake runtests_opt

Reviewed By: chip@fb.com

FB internal diff: D778707
  • Loading branch information...
1 parent cd884e6 commit f17d138fe185ccbe2a8bab815ee8d7614076ea11 @tudor tudor committed with jdelong Apr 17, 2013
Showing with 742 additions and 23 deletions.
  1. +89 −0 folly/String-inl.h
  2. +50 −0 folly/String.h
  3. +49 −0 folly/Uri-inl.h
  4. +95 −0 folly/Uri.cpp
  5. +77 −0 folly/Uri.h
  6. +29 −7 folly/build/generate_escape_tables.py
  7. +140 −16 folly/test/StringTest.cpp
  8. +213 −0 folly/test/UriTest.cpp
View
@@ -150,6 +150,95 @@ void cUnescape(StringPiece str, String& out, bool strict) {
}
namespace detail {
+// Map from character code (indexed as unsigned char) to escape class; uriEscape compares the class against the numeric UriEscapeMode value:
+// 0 = pass through in every mode
+// 1 = unused (would pass through in QUERY and PATH modes)
+// 2 = pass through in PATH mode, percent-encode in ALL and QUERY modes
+// 3 = space, replace with '+' in QUERY mode, percent-encode in ALL and PATH modes
+// 4 = percent-encode in every mode
+extern const unsigned char uriEscapeTable[];  // declaration only; the table is defined elsewhere
+} // namespace detail
+
+// URI-escapes str and appends the result to out (existing contents of out
+// are kept).
+//
+// Each input byte is classified through detail::uriEscapeTable. Bytes whose
+// class does not exceed the numeric value of mode are copied verbatim; in
+// QUERY mode a class-3 byte (space) becomes '+'; every other byte is written
+// as '%' followed by two lower-case hex digits.
+template <class String>
+void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
+  static const char kHexDigits[] = "0123456789abcdef";
+  char encoded[3] = {'%', 0, 0};
+  // Preallocate assuming that 25% of the input string will be escaped
+  out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
+  const unsigned char passThroughLimit = static_cast<unsigned char>(mode);
+  const bool spaceAsPlus = (mode == UriEscapeMode::QUERY);
+  // Pass-through bytes are not copied one at a time: runStart marks the
+  // beginning of the current verbatim run, which is flushed with a single
+  // append whenever a byte needs rewriting (and once more at the end).
+  auto runStart = str.begin();
+  for (auto cur = str.begin(); cur != str.end(); ++cur) {
+    const unsigned char byte = static_cast<unsigned char>(*cur);
+    const unsigned char escapeClass = detail::uriEscapeTable[byte];
+    if (LIKELY(escapeClass <= passThroughLimit)) {
+      continue;  // stays part of the verbatim run
+    }
+    out.append(&*runStart, cur - runStart);
+    if (spaceAsPlus && escapeClass == 3) {
+      out.push_back('+');
+    } else {
+      encoded[1] = kHexDigits[byte >> 4];
+      encoded[2] = kHexDigits[byte & 0x0f];
+      out.append(encoded, 3);
+    }
+    runStart = cur + 1;
+  }
+  out.append(&*runStart, str.end() - runStart);
+}
+
+// URI-unescapes str and appends the result to out (existing contents of out
+// are kept).
+//
+// "%XX" is replaced by the byte with hex value XX. In QUERY mode '+' is
+// replaced by a space; in the other modes '+' is copied through unchanged.
+//
+// Throws std::invalid_argument if a '%' is followed by fewer than two
+// characters, or by characters that detail::hexTable flags as non-hex (16).
+template <class String>
+void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
+  // Decoding never lengthens the text, so the input size is an upper bound.
+  out.reserve(out.size() + str.size());
+  auto p = str.begin();
+  auto last = p;  // start of the pending run of literal characters
+  // We advance over runs of passthrough characters and copy them in one go;
+  // this is faster than calling push_back repeatedly.
+  while (p != str.end()) {
+    char c = *p;
+    switch (c) {
+    case '%':
+      {
+        if (UNLIKELY(std::distance(p, str.end()) < 3)) {
+          throw std::invalid_argument("incomplete percent encode sequence");
+        }
+        auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
+        auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
+        if (UNLIKELY(h1 == 16 || h2 == 16)) {
+          throw std::invalid_argument("invalid percent encode sequence");
+        }
+        // Flush the literal run, then emit the decoded byte.
+        out.append(&*last, p - last);
+        out.push_back((h1 << 4) | h2);
+        p += 3;
+        last = p;
+        break;
+      }
+    case '+':
+      if (mode == UriEscapeMode::QUERY) {
+        out.append(&*last, p - last);
+        out.push_back(' ');
+        ++p;
+        last = p;
+        break;
+      }
+      // else fallthrough: outside QUERY mode '+' is an ordinary character
+    default:
+      ++p;
+      break;
+    }
+  }
+  out.append(&*last, p - last);
+}
+
+namespace detail {
/*
* The following functions are type-overloaded helpers for
View
@@ -113,6 +113,56 @@ String cUnescape(StringPiece str, bool strict = true) {
}
/**
+ * URI-escape a string. Appends the result to the output string.
+ *
+ * Alphanumeric characters and other characters marked as "unreserved" in RFC
+ * 3986 ( -_.~ ) are left unchanged. In PATH mode, the forward slash (/) is
+ * also left unchanged. In QUERY mode, spaces are replaced by '+'. All other
+ * characters are percent-encoded.
+ */
+enum class UriEscapeMode : unsigned char {
+ // The values are meaningful (uriEscape compares them with escape-table entries), see generate_escape_tables.py
+ ALL = 0,  // percent-encode everything except alphanumerics and -_.~
+ QUERY = 1,  // like ALL, but a space is written as '+' (and '+' unescapes to a space)
+ PATH = 2  // like ALL, but '/' is also left unchanged
+};
+template <class String>
+void uriEscape(StringPiece str,  // text to escape
+ String& out,  // escaped text is appended; existing contents are kept
+ UriEscapeMode mode = UriEscapeMode::ALL);  // selects which characters stay unescaped
+
+/**
+ * Convenience form of uriEscape: escapes str according to mode and hands
+ * the result back as a freshly built String instead of appending to an
+ * existing one.
+ */
+template <class String>
+String uriEscape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
+  String escaped;
+  uriEscape(str, escaped, mode);
+  return escaped;
+}
+
+/**
+ * URI-unescape a string. Appends the result to the output string.
+ *
+ * In QUERY mode, '+' are replaced by space; in the other modes '+' is copied
+ * through unchanged. %XX sequences are decoded if XX is a valid hex sequence,
+ * otherwise we throw invalid_argument. A '%' with fewer than two characters
+ * after it also throws invalid_argument.
+ */
+template <class String>
+void uriUnescape(StringPiece str,
+ String& out,
+ UriEscapeMode mode = UriEscapeMode::ALL);
+
+/**
+ * Convenience form of uriUnescape: decodes str according to mode and hands
+ * the result back as a freshly built String instead of appending to an
+ * existing one. Throws whatever the appending overload throws.
+ */
+template <class String>
+String uriUnescape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
+  String decoded;
+  uriUnescape(str, decoded, mode);
+  return decoded;
+}
+
+/**
* stringPrintf is much like printf but deposits its result into a
* string. Two signatures are supported: the first simply returns the
* resulting string, and the second appends the produced characters to
View
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_URI_H_
+#error This file may only be included from folly/Uri.h
+#endif
+
+#include "folly/Conv.h"
+
+namespace folly {
+
+// Reassembles the textual form of the URI from its stored components:
+//   scheme ":" ["//" [user-info "@"] host [":" port]] path ["?" query]
+//   ["#" fragment]
+//
+// User-info is written as "username:password@" when a password is present,
+// as "username@" when only a username is present, and omitted otherwise.
+// The port is written only when it is non-zero; query and fragment are
+// written only when non-empty. Components are emitted exactly as stored
+// (no escaping is applied here).
+template <class String>
+String Uri::toString() const {
+  String str;
+  // The constructor only ever stores an empty path or one starting with '/'
+  // next to an authority. A non-empty path without a leading '/' therefore
+  // means the URI had no authority (e.g. "mailto:user@example.com"), and
+  // writing "//" in front of it would make the path re-parse as an
+  // authority. Emit just "scheme:" in that case.
+  const bool rootlessPath = !path_.empty() && path_[0] != '/';
+  if (rootlessPath) {
+    toAppend(scheme_, ":", &str);
+  } else {
+    toAppend(scheme_, "://", &str);
+  }
+  if (!password_.empty()) {
+    toAppend(username_, ":", password_, "@", &str);
+  } else if (!username_.empty()) {
+    toAppend(username_, "@", &str);
+  }
+  toAppend(host_, &str);
+  if (port_ != 0) {
+    toAppend(":", port_, &str);
+  }
+  toAppend(path_, &str);
+  if (!query_.empty()) {
+    toAppend("?", query_, &str);
+  }
+  if (!fragment_.empty()) {
+    toAppend("#", fragment_, &str);
+  }
+  return str;
+}
+
+} // namespace folly
+
View
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "folly/Uri.h"
+
+#include <ctype.h>
+#include <boost/regex.hpp>
+
+namespace folly {
+
+namespace {
+
+// Returns a copy of capture group idx of m as an fbstring, built from the
+// group's [first, second) character range.
+fbstring submatch(const boost::cmatch& m, size_t idx) {
+  const auto& group = m[idx];
+  return fbstring(group.first, group.second);
+}
+
+// Lower-cases every character of s in place using tolower().
+//
+// tolower() is only defined for EOF and values representable as unsigned
+// char, so each character is converted to unsigned char before the call;
+// passing a negative plain char directly would be undefined behaviour.
+template <class String>
+void toLower(String& s) {
+  for (auto& c : s) {
+    c = tolower(static_cast<unsigned char>(c));
+  }
+}
+
+} // namespace
+
+// Parses str into scheme, optional authority (username, password, host,
+// port), path, query and fragment.
+//
+// The scheme is lower-cased; the other components are stored as they appear
+// in the input. The port stays 0 when the authority carries no port digits.
+// When the part after "scheme:" does not start with "//", the URI has no
+// authority and that whole part (up to '?' or '#') becomes the path.
+//
+// Throws std::invalid_argument("invalid URI") if str does not match the
+// overall URI shape, and std::invalid_argument("invalid URI authority") if
+// the authority cannot be split into its parts.
+Uri::Uri(StringPiece str) : port_(0) {
+  static const boost::regex uriRegex(
+    "([a-zA-Z][a-zA-Z0-9+.-]*):" // scheme:
+    "([^?#]*)" // authority and path
+    "(?:\\?([^#]*))?" // ?query
+    "(?:#(.*))?"); // #fragment
+  static const boost::regex authorityAndPathRegex("//([^/]*)(/.*)?");
+
+  boost::cmatch uriParts;
+  const bool matchesUri =
+    boost::regex_match(str.begin(), str.end(), uriParts, uriRegex);
+  if (UNLIKELY(!matchesUri)) {
+    throw std::invalid_argument("invalid URI");
+  }
+
+  scheme_ = submatch(uriParts, 1);
+  toLower(scheme_);
+
+  StringPiece hierPart(uriParts[2].first, uriParts[2].second);
+  boost::cmatch hierParts;
+  const bool hasAuthority = boost::regex_match(hierPart.begin(),
+                                               hierPart.end(),
+                                               hierParts,
+                                               authorityAndPathRegex);
+  if (hasAuthority) {
+    static const boost::regex authorityRegex(
+      "(?:([^@:]*)(?::([^@]*))?@)?" // username, password
+      "(\\[[^\\]]*\\]|[^\\[:]*)" // host (IP-literal, dotted-IPv4, or
+      // named host)
+      "(?::(\\d*))?"); // port
+
+    auto authority = hierParts[1];
+    boost::cmatch authorityParts;
+    const bool authorityOk = boost::regex_match(authority.first,
+                                                authority.second,
+                                                authorityParts,
+                                                authorityRegex);
+    if (!authorityOk) {
+      throw std::invalid_argument("invalid URI authority");
+    }
+
+    StringPiece portDigits(authorityParts[4].first, authorityParts[4].second);
+    if (!portDigits.empty()) {
+      port_ = to<uint32_t>(portDigits);
+    }
+
+    username_ = submatch(authorityParts, 1);
+    password_ = submatch(authorityParts, 2);
+    host_ = submatch(authorityParts, 3);
+    path_ = submatch(hierParts, 2);
+  } else {
+    // Does not start with //, doesn't have authority
+    path_ = hierPart.fbstr();
+  }
+
+  query_ = submatch(uriParts, 3);
+  fragment_ = submatch(uriParts, 4);
+}
+
+} // namespace folly
View
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_URI_H_
+#define FOLLY_URI_H_
+
+#include "folly/String.h"
+
+namespace folly {
+
+/**
+ * Class representing a URI.
+ *
+ * Consider http://www.facebook.com/foo/bar?key=foo#anchor
+ *
+ * The URI is broken down into its parts: scheme ("http"), authority
+ * (i.e. host and port, in most cases: "www.facebook.com"), path
+ * ("/foo/bar"), query ("key=foo") and fragment ("anchor"). The scheme is
+ * lower-cased.
+ *
+ * When the authority carries user information ("user:pass@host"), it is
+ * exposed separately through username() and password().
+ *
+ * If this Uri represents a URL, note that, to prevent ambiguity, the component
+ * parts are NOT percent-decoded; you should do this yourself with
+ * uriUnescape() (for the authority and path) and uriUnescape(...,
+ * UriEscapeMode::QUERY) (for the query, but probably only after splitting at
+ * '&' to identify the individual parameters).
+ */
+class Uri {
+ public:
+ /**
+ * Parse a Uri from a string. Throws std::invalid_argument on parse error.
+ */
+ explicit Uri(StringPiece str);
+
+ const fbstring& scheme() const { return scheme_; }  // lower-cased by the constructor
+ const fbstring& username() const { return username_; }
+ const fbstring& password() const { return password_; }
+ const fbstring& host() const { return host_; }
+ uint32_t port() const { return port_; }  // 0 when the URI carries no port
+ const fbstring& path() const { return path_; }
+ const fbstring& query() const { return query_; }  // text after '?', without the '?'
+ const fbstring& fragment() const { return fragment_; }  // text after '#', without the '#'
+
+ template <class String>
+ String toString() const;  // reassembles the URI text from the stored components
+
+ std::string str() const { return toString<std::string>(); }  // toString() as std::string
+ fbstring fbstr() const { return toString<fbstring>(); }  // toString() as fbstring
+
+ private:
+ fbstring scheme_;
+ fbstring username_;
+ fbstring password_;
+ fbstring host_;
+ uint32_t port_;  // initialised to 0; set only when the authority has port digits
+ fbstring path_;
+ fbstring query_;
+ fbstring fragment_;
+};
+
+} // namespace folly
+
+#include "folly/Uri-inl.h"
+
+#endif /* FOLLY_URI_H_ */
Oops, something went wrong.

0 comments on commit f17d138

Please sign in to comment.