Skip to content

Commit

Permalink
Revert "Revert "URI parsing in folly""
Browse files Browse the repository at this point in the history
Summary:
Now that the third_party link was updated in
https://phabricator.fb.com/D778617, we're good.

Test Plan: fbconfig -r thrift && fbmake runtests_opt

Reviewed By: chip@fb.com

FB internal diff: D778707
  • Loading branch information
tudor authored and jdelong committed Apr 21, 2013
1 parent cd884e6 commit f17d138
Show file tree
Hide file tree
Showing 8 changed files with 742 additions and 23 deletions.
89 changes: 89 additions & 0 deletions folly/String-inl.h
Expand Up @@ -149,6 +149,95 @@ void cUnescape(StringPiece str, String& out, bool strict) {
out.append(&*last, p - last);
}

namespace detail {
// Lookup table used by uriEscape(), indexed by the character's value
// converted to unsigned char. Only declared here; the definition lives in
// another translation unit (presumably 256 entries, one per byte value --
// TODO confirm against the generated table).
//
// Map from character code to escape mode:
// 0 = pass through
// 1 = unused
// 2 = pass through in PATH mode
// 3 = space, replace with '+' in QUERY mode
// 4 = percent-encode
//
// uriEscape() copies a character unchanged when its table entry is less than
// or equal to the numeric value of the requested UriEscapeMode.
extern const unsigned char uriEscapeTable[];
} // namespace detail

/**
 * Append the URI-escaped form of str to out.
 *
 * Each input byte is classified through detail::uriEscapeTable. Bytes whose
 * class is <= the numeric value of mode are copied unchanged; in QUERY mode
 * a class-3 byte (space) becomes '+'; everything else is emitted as "%xx"
 * with lower-case hex digits.
 */
template <class String>
void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
  static const char hexValues[] = "0123456789abcdef";

  // Preallocate assuming that 25% of the input string will be escaped
  out.reserve(out.size() + str.size() + 3 * (str.size() / 4));

  // Scratch buffer for one "%xx" triple; only the two digits change.
  char encoded[3] = {'%', 0, 0};
  const unsigned char passThreshold = static_cast<unsigned char>(mode);

  // Pass-through characters are accumulated as a run [runStart, cur) and
  // copied with a single append, which is cheaper than repeated push_back.
  auto runStart = str.begin();
  auto cur = runStart;
  const auto end = str.end();
  for (; cur != end; ++cur) {
    const unsigned char code = static_cast<unsigned char>(*cur);
    const unsigned char charClass = detail::uriEscapeTable[code];
    if (LIKELY(charClass <= passThreshold)) {
      continue;  // extends the current pass-through run
    }

    // Flush the pending run before emitting the replacement.
    out.append(&*runStart, cur - runStart);
    if (mode == UriEscapeMode::QUERY && charClass == 3) {
      out.push_back('+');
    } else {
      encoded[1] = hexValues[code >> 4];
      encoded[2] = hexValues[code & 0x0f];
      out.append(encoded, 3);
    }
    runStart = cur + 1;
  }
  // Flush whatever pass-through run is left at the end of the input.
  out.append(&*runStart, cur - runStart);
}

/**
 * Append the URI-unescaped form of str to out.
 *
 * "%XX" sequences are decoded into the byte with hex value XX. In QUERY
 * mode every '+' is decoded to a space; in the other modes '+' is copied
 * unchanged. All other characters are copied unchanged.
 *
 * Throws std::invalid_argument if a '%' is followed by fewer than two
 * characters, or if either of the two characters is not a hex digit
 * (detail::hexTable yields 16 for it). Text decoded before the offending
 * sequence may already have been appended to out when the exception is
 * thrown.
 */
template <class String>
void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
  // The decoded text is never longer than the input.
  out.reserve(out.size() + str.size());
  auto p = str.begin();
  auto last = p;  // start of the pending run of pass-through characters
  // We advance over runs of passthrough characters and copy them in one go;
  // this is faster than calling push_back repeatedly.
  while (p != str.end()) {
    char c = *p;
    switch (c) {
    case '%':
      {
        // Need the '%' plus two hex digits.
        if (UNLIKELY(std::distance(p, str.end()) < 3)) {
          throw std::invalid_argument("incomplete percent encode sequence");
        }
        auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
        auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
        if (UNLIKELY(h1 == 16 || h2 == 16)) {
          throw std::invalid_argument("invalid percent encode sequence");
        }
        // Flush the pending run, then emit the decoded byte.
        out.append(&*last, p - last);
        out.push_back((h1 << 4) | h2);
        p += 3;
        last = p;
        break;
      }
    case '+':
      if (mode == UriEscapeMode::QUERY) {
        out.append(&*last, p - last);
        out.push_back(' ');
        ++p;
        last = p;
        break;
      }
      // else fallthrough: outside QUERY mode '+' is an ordinary character
    default:
      ++p;
      break;
    }
  }
  // Flush whatever pass-through run is left at the end of the input.
  out.append(&*last, p - last);
}

namespace detail {

/*
Expand Down
50 changes: 50 additions & 0 deletions folly/String.h
Expand Up @@ -112,6 +112,56 @@ String cUnescape(StringPiece str, bool strict = true) {
return out;
}

/**
* Escaping mode shared by uriEscape() and uriUnescape().
*
* uriEscape() converts the mode to unsigned char and compares it against the
* per-character classes in detail::uriEscapeTable, so the numeric values
* below must stay in sync with that table.
*/
enum class UriEscapeMode : unsigned char {
// The values are meaningful, see generate_escape_tables.py
ALL = 0,
QUERY = 1,
PATH = 2
};

/**
* URI-escape a string. Appends the result to the output string.
*
* Alphanumeric characters and other characters marked as "unreserved" in RFC
* 3986 ( -_.~ ) are left unchanged. In PATH mode, the forward slash (/) is
* also left unchanged. In QUERY mode, spaces are replaced by '+'. All other
* characters are percent-encoded.
*/
template <class String>
void uriEscape(StringPiece str,
String& out,
UriEscapeMode mode = UriEscapeMode::ALL);

/**
 * Convenience form of uriEscape: escapes str into a freshly constructed
 * String and returns it instead of appending to a caller-supplied one.
 */
template <class String>
String uriEscape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
  String escaped;
  uriEscape(str, escaped, mode);
  return escaped;
}

/**
* URI-unescape a string. Appends the result to the output string.
*
* In QUERY mode, each '+' is replaced by a space; in the other modes '+' is
* copied unchanged. %XX sequences are decoded if XX is a valid hex sequence,
* otherwise we throw std::invalid_argument. A '%' followed by fewer than two
* characters also throws std::invalid_argument.
*/
template <class String>
void uriUnescape(StringPiece str,
String& out,
UriEscapeMode mode = UriEscapeMode::ALL);

/**
 * Convenience form of uriUnescape: decodes str into a freshly constructed
 * String and returns it instead of appending to a caller-supplied one.
 */
template <class String>
String uriUnescape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
  String decoded;
  uriUnescape(str, decoded, mode);
  return decoded;
}

/**
* stringPrintf is much like printf but deposits its result into a
* string. Two signatures are supported: the first simply returns the
Expand Down
49 changes: 49 additions & 0 deletions folly/Uri-inl.h
@@ -0,0 +1,49 @@
/*
* Copyright 2013 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef FOLLY_URI_H_
#error This file may only be included from folly/Uri.h
#endif

#include "folly/Conv.h"

namespace folly {

/**
 * Reassemble the URI from its stored components, in the order
 * scheme "://" [username [":" password] "@"] host [":" port] path
 * ["?" query] ["#" fragment].
 *
 * The user-info part is emitted when either the username or the password is
 * non-empty; the port is emitted only when it is non-zero.
 *
 * NOTE(review): "://" is written unconditionally, including for URIs that
 * were parsed without an authority component -- confirm that is intended.
 */
template <class String>
String Uri::toString() const {
  String result;
  toAppend(scheme_, "://", &result);

  const bool hasPassword = !password_.empty();
  if (hasPassword || !username_.empty()) {
    toAppend(username_, &result);
    if (hasPassword) {
      toAppend(":", password_, &result);
    }
    toAppend("@", &result);
  }

  toAppend(host_, &result);
  if (port_ != 0) {
    toAppend(":", port_, &result);
  }

  toAppend(path_, &result);
  if (!query_.empty()) {
    toAppend("?", query_, &result);
  }
  if (!fragment_.empty()) {
    toAppend("#", fragment_, &result);
  }
  return result;
}

} // namespace folly

95 changes: 95 additions & 0 deletions folly/Uri.cpp
@@ -0,0 +1,95 @@
/*
* Copyright 2013 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "folly/Uri.h"

#include <ctype.h>
#include <boost/regex.hpp>

namespace folly {

namespace {

// Copy the text of capture group idx of m into a new fbstring.
// NOTE(review): callers rely on an unmatched optional group producing an
// empty string (first == second) -- confirm against Boost.Regex sub_match.
fbstring submatch(const boost::cmatch& m, size_t idx) {
  const auto& group = m[idx];
  return fbstring(group.first, group.second);
}

// Lower-case every character of the narrow-character string s in place,
// using the C library's tolower().
//
// tolower() requires its argument to be representable as unsigned char (or
// to be EOF); passing a negative plain char is undefined behaviour. Each
// character is therefore converted to unsigned char before the call.
template <class String>
void toLower(String& s) {
  for (auto& c : s) {
    c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
  }
}

} // namespace

// Parse str into scheme, user info, host, port, path, query and fragment.
//
// The scheme is lower-cased; the other components are stored as they appear
// in the input. When the part after "scheme:" does not start with "//", it
// is stored entirely as the path and the authority fields keep their
// defaults (empty strings, port 0).
//
// Throws std::invalid_argument("invalid URI") when the overall shape does
// not match, and std::invalid_argument("invalid URI authority") when the
// authority cannot be split into user info, host and port.
// NOTE(review): a port that does not fit in uint32_t is reported by
// to<uint32_t>, possibly with a different exception type -- confirm.
Uri::Uri(StringPiece str) : port_(0) {
  static const boost::regex uriRegex(
    "([a-zA-Z][a-zA-Z0-9+.-]*):"  // scheme:
    "([^?#]*)"                    // authority and path
    "(?:\\?([^#]*))?"             // ?query
    "(?:#(.*))?");                // #fragment
  static const boost::regex authorityAndPathRegex("//([^/]*)(/.*)?");

  boost::cmatch uriParts;
  const bool wellFormed =
    boost::regex_match(str.begin(), str.end(), uriParts, uriRegex);
  if (UNLIKELY(!wellFormed)) {
    throw std::invalid_argument("invalid URI");
  }

  scheme_ = submatch(uriParts, 1);
  toLower(scheme_);
  query_ = submatch(uriParts, 3);
  fragment_ = submatch(uriParts, 4);

  StringPiece hierPart(uriParts[2].first, uriParts[2].second);
  boost::cmatch hierMatch;
  const bool hasAuthority = boost::regex_match(hierPart.begin(),
                                               hierPart.end(),
                                               hierMatch,
                                               authorityAndPathRegex);
  if (!hasAuthority) {
    // Does not start with //, doesn't have authority
    path_ = hierPart.fbstr();
    return;
  }

  static const boost::regex authorityRegex(
    "(?:([^@:]*)(?::([^@]*))?@)?"  // username, password
    "(\\[[^\\]]*\\]|[^\\[:]*)"     // host (IP-literal, dotted-IPv4, or
                                   // named host)
    "(?::(\\d*))?");               // port

  const auto& authority = hierMatch[1];
  boost::cmatch authorityParts;
  const bool authorityOk = boost::regex_match(authority.first,
                                              authority.second,
                                              authorityParts,
                                              authorityRegex);
  if (!authorityOk) {
    throw std::invalid_argument("invalid URI authority");
  }

  username_ = submatch(authorityParts, 1);
  password_ = submatch(authorityParts, 2);
  host_ = submatch(authorityParts, 3);
  path_ = submatch(hierMatch, 2);

  // The port pattern also accepts an empty digit string ("host:"); in that
  // case port_ keeps its initial value of 0.
  StringPiece portDigits(authorityParts[4].first, authorityParts[4].second);
  if (!portDigits.empty()) {
    port_ = to<uint32_t>(portDigits);
  }
}

} // namespace folly
77 changes: 77 additions & 0 deletions folly/Uri.h
@@ -0,0 +1,77 @@
/*
* Copyright 2013 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef FOLLY_URI_H_
#define FOLLY_URI_H_

#include "folly/String.h"

namespace folly {

/**
* Class representing a URI.
*
* Consider http://www.facebook.com/foo/bar?key=foo#anchor
*
* The URI is broken down into its parts: scheme ("http"), authority
* (ie. host and port, in most cases: "www.facebook.com"), path
* ("/foo/bar"), query ("key=foo") and fragment ("anchor"). The scheme is
* lower-cased.
*
* If this Uri represents a URL, note that, to prevent ambiguity, the component
* parts are NOT percent-decoded; you should do this yourself with
* uriUnescape() (for the authority and path) and uriUnescape(...,
* UriEscapeMode::QUERY) (for the query, but probably only after splitting at
* '&' to identify the individual parameters).
*/
class Uri {
public:
/**
* Parse a Uri from a string. Throws std::invalid_argument on parse error.
*/
explicit Uri(StringPiece str);

// Component accessors. Each returns the value stored by the constructor;
// string components that were never assigned are empty.

// Scheme, lower-cased by the constructor.
const fbstring& scheme() const { return scheme_; }
// User name from the authority's user-info part.
const fbstring& username() const { return username_; }
// Password from the authority's user-info part.
const fbstring& password() const { return password_; }
// Host as captured from the authority (a bracketed IP-literal keeps its
// brackets).
const fbstring& host() const { return host_; }
// Port number; 0 when the URI did not specify one.
uint32_t port() const { return port_; }
// Path; for a URI without "//" this is everything between "scheme:" and
// the query/fragment.
const fbstring& path() const { return path_; }
// Query string, without the leading '?'.
const fbstring& query() const { return query_; }
// Fragment, without the leading '#'.
const fbstring& fragment() const { return fragment_; }

// Rebuild the textual form of the URI from the stored components into a
// string of the requested type.
template <class String>
String toString() const;

// Shorthands for toString() with the two common string types.
std::string str() const { return toString<std::string>(); }
fbstring fbstr() const { return toString<fbstring>(); }

private:
fbstring scheme_;
fbstring username_;
fbstring password_;
fbstring host_;
uint32_t port_; // 0 means "no port given"
fbstring path_;
fbstring query_;
fbstring fragment_;
};

} // namespace folly

#include "folly/Uri-inl.h"

#endif /* FOLLY_URI_H_ */

0 comments on commit f17d138

Please sign in to comment.