Skip to content

Commit

Permalink
URLPattern: Collapse look-ahead loops into main parse loop.
Browse files Browse the repository at this point in the history
This CL refactors the parser to use separate states instead of the
previously implemented look-ahead loops.  This fixes a bug where the
look-ahead loops did not properly ignore characters within `{ }`
pattern groupings.

This CL also slightly improves the handling of nested `{ }` groupings,
even though they are not legal pattern syntax.  It seems better to
avoid getting confused about the nesting depth and let the later pattern
compiler return a more predictable error.

This CL also adds a number of additional comments and other cleanup.

Bug: 1141510
Change-Id: Id6bc1b4a16390b9e878c6757582519997332bbc8
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2951204
Commit-Queue: Ben Kelly <wanderview@chromium.org>
Reviewed-by: Jeremy Roman <jbroman@chromium.org>
Cr-Commit-Position: refs/heads/master@{#892699}
  • Loading branch information
wanderview authored and Chromium LUCI CQ committed Jun 15, 2021
1 parent 0bc96be commit f66c35e
Show file tree
Hide file tree
Showing 3 changed files with 231 additions and 123 deletions.
243 changes: 134 additions & 109 deletions third_party/blink/renderer/modules/url_pattern/url_pattern_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ namespace url_pattern {
// Constructs a parser over `input`. Both `input_` and `utf8_` are initialized
// from the same string.
Parser::Parser(const String& input)
    : input_(input),
      utf8_(input) {}

void Parser::Parse(ExceptionState& exception_state) {
DCHECK_EQ(state_, StringParseState::kInit);
DCHECK_EQ(token_index_, 0u);

auto tokenize_result =
liburlpattern::Tokenize(absl::string_view(utf8_.data(), utf8_.size()),
liburlpattern::TokenizePolicy::kLenient);
Expand All @@ -27,7 +30,6 @@ void Parser::Parse(ExceptionState& exception_state) {
return;
}

DCHECK_EQ(token_index_, 0u);
token_list_ = std::move(tokenize_result.value());
result_ = MakeGarbageCollected<URLPatternInit>();

Expand All @@ -41,63 +43,55 @@ void Parser::Parse(ExceptionState& exception_state) {
//
// To implement this we initialize components to the empty string in advance.
//
// We can't, however, do this immediately for all components. We want to
// allow the baseURL to provide information for relative URLs, so we only
// want to set the default empty string values for components following the
// first component in the relative URL.

// We start in relative mode by default. If we find a protocol `:` later,
// we will update the starting state to expect an absolute URL pattern.
DCHECK_EQ(state_, StringParseState::kPathname);

// Scan for protocol `:` terminator. This should be an invalid pattern
// character. This automatically works for "https://" because a name
// cannot start with a `/`. For URLs that do not include "//", however,
// the input string will need to escape the colon, e.g. "data\\:foo".
for (size_t i = 0; i < token_list_.size(); ++i) {
if (IsProtocolSuffix(i)) {
// Update the state to expect the start of an absolute URL.
state_ = StringParseState::kProtocol;

// Now that we are in absolute mode we know values will not be inherited
// from a base URL. Therefore initialize the rest of the components to
// the empty string.
result_->setUsername(g_empty_string);
result_->setPassword(g_empty_string);
result_->setHostname(g_empty_string);
result_->setPort(g_empty_string);
result_->setPathname(g_empty_string);
result_->setSearch(g_empty_string);
result_->setHash(g_empty_string);
break;
}
}

// If we failed to find a protocol terminator then we are still in relative
// mode. We now need to determine the first component of the relative URL.
if (state_ == StringParseState::kPathname) {
// If the string begins with `?` then it's a relative search component. If
// it starts with `#` then it's a relative hash component. Otherwise it's
// a relative pathname.
//
// In each case we initialize any components following the initial
// component to be empty string.
if (IsHashPrefix()) {
ChangeStateWithoutSettingComponent(StringParseState::kHash, Skip(1));
} else if (IsSearchPrefix()) {
ChangeStateWithoutSettingComponent(StringParseState::kSearch, Skip(1));
result_->setHash(g_empty_string);
} else {
result_->setSearch(g_empty_string);
result_->setHash(g_empty_string);
}
}
// We can't, however, do this immediately. We want to allow the baseURL to
// provide information for relative URLs, so we only want to set the default
// empty string values for components following the first component in the
// relative URL.
//
// We therefore wait to set the default component values until after we exit
// the kInit state and have determined if we are in relative or absolute mode.

// Iterate through the list of tokens and update our state machine as we go.
for (; token_index_ < token_list_.size(); ++token_index_) {
for (; token_index_ < token_list_.size(); token_index_ += token_increment_) {
// Reset back to our default `token_increment_` value.
token_increment_ = 1;

// All states must respect the end of the token list. The liburlpattern
// tokenizer guarantees that the last token will have the type `kEnd`.
if (token_list_[token_index_].type == liburlpattern::TokenType::kEnd) {
// If we failed to find a protocol terminator then we are still in
// relative mode. We now need to determine the first component of the
// relative URL.
if (state_ == StringParseState::kInit) {
// Reset back to the start of the input string.
Rewind();

// If the string begins with `?` then it's a relative search component.
// If it starts with `#` then it's a relative hash component. Otherwise
// it's a relative pathname.
//
// In each case we initialize any components following the initial
// component to be empty string.
if (IsHashPrefix()) {
ChangeState(StringParseState::kHash, Skip(1));
} else if (IsSearchPrefix()) {
ChangeState(StringParseState::kSearch, Skip(1));
result_->setHash(g_empty_string);
} else {
ChangeState(StringParseState::kPathname, Skip(0));
result_->setSearch(g_empty_string);
result_->setHash(g_empty_string);
}
continue;
}

// If we failed to find an `@`, then there is no username and password.
// We should rewind and process the data as a hostname.
else if (state_ == StringParseState::kAuthority) {
RewindAndSetState(StringParseState::kHostname);
continue;
}

ChangeState(StringParseState::kDone, Skip(0));
break;
}
Expand All @@ -107,20 +101,40 @@ void Parser::Parse(ExceptionState& exception_state) {
// past any tokens that are within `{` and `}`. Note, the tokenizer
// handles grouping `(` and `)` and `:foo` groups for us automatically, so
// we don't need special code for them here.
if (in_group_) {
if (group_depth_ > 0) {
if (IsGroupClose())
in_group_ = false;
group_depth_ -= 1;
else
continue;
}

if (IsGroupOpen()) {
in_group_ = true;
group_depth_ += 1;
continue;
}

switch (state_) {
case StringParseState::kProtocol: {
case StringParseState::kInit:
if (IsProtocolSuffix()) {
// We are in absolute mode and we know values will not be inherited
// from a base URL. Therefore initialize the rest of the components
// to the empty string.
result_->setUsername(g_empty_string);
result_->setPassword(g_empty_string);
result_->setHostname(g_empty_string);
result_->setPort(g_empty_string);
result_->setPathname(g_empty_string);
result_->setSearch(g_empty_string);
result_->setHash(g_empty_string);

// Update the state to expect the start of an absolute URL.
RewindAndSetState(StringParseState::kProtocol);

break;
}
break;

case StringParseState::kProtocol:
// If we find the end of the protocol component...
if (IsProtocolSuffix()) {
// First we eagerly compile the protocol pattern and use it to
Expand All @@ -145,49 +159,41 @@ void Parser::Parse(ExceptionState& exception_state) {
// If there are authority slashes, like `https://`, then
// we must transition to the authority section of the URLPattern.
if (NextIsAuthoritySlashes()) {
next_state = StringParseState::kHostname;
next_state = StringParseState::kAuthority;
skip = Skip(3);
}

// If there are no authority slashes, but the protocol is special
// then we still go to the hostname as this is a "standard" URL.
// This differs from the above case since we don't need to skip the
// extra slashes.
// then we still go to the authority section as this is a "standard"
// URL. This differs from the above case since we don't need to skip
// the extra slashes.
else if (should_treat_as_standard_url_) {
next_state = StringParseState::kHostname;
}

// Before actually going to the hostname state, though, we must see
// if there is an identity of the form:
//
// <username>:<password>@<hostname>
//
// We check for this by looking for the `@` character. The username
// and password are themselves each optional, so the `:` may not be
// present. If we see the `@` we just go to the username state
// and let it proceed until it hits either the password separator
// or the `@` terminator.
if (next_state == StringParseState::kHostname) {
for (size_t tmp_index = token_index_ + skip.value();
tmp_index < token_list_.size(); ++tmp_index) {
if (IsIdentityTerminator(tmp_index)) {
next_state = StringParseState::kUsername;
break;
}

// Stop searching for the `@` character if we see the beginning
// of the pathname, search, or hash components.
if (IsPathnameStart(tmp_index) || IsSearchPrefix(tmp_index) ||
IsHashPrefix(tmp_index)) {
break;
}
}
next_state = StringParseState::kAuthority;
}

ChangeState(next_state, skip);
}
break;
}

case StringParseState::kAuthority:
// Before going to the hostname state we must see if there is an
// identity of the form:
//
// <username>:<password>@<hostname>
//
// We check for this by looking for the `@` character. The username
// and password are themselves each optional, so the `:` may not be
// present. If we see the `@` we just go to the username state
// and let it proceed until it hits either the password separator
// or the `@` terminator.
if (IsIdentityTerminator())
RewindAndSetState(StringParseState::kUsername);

// Stop searching for the `@` character if we see the beginning
// of the pathname, search, or hash components.
else if (IsPathnameStart() || IsSearchPrefix() || IsHashPrefix())
RewindAndSetState(StringParseState::kHostname);
break;

case StringParseState::kUsername:
// If we find a `:` then transition to the password component state.
Expand Down Expand Up @@ -262,9 +268,15 @@ void Parser::ChangeState(StringParseState new_state, Skip skip) {
// a component pattern string. This is stored in the appropriate result
// property based on the current `state_`.
switch (state_) {
case StringParseState::kInit:
// No component to set when transitioning from this state.
break;
case StringParseState::kProtocol:
result_->setProtocol(MakeComponentString());
break;
case StringParseState::kAuthority:
// No component to set when transitioning from this state.
break;
case StringParseState::kUsername:
result_->setUsername(MakeComponentString());
break;
Expand Down Expand Up @@ -300,16 +312,26 @@ void Parser::ChangeStateWithoutSettingComponent(StringParseState new_state,

// Now update `component_start_` to point to the new component. The `skip`
// argument tells us how many tokens to ignore to get to the next start.
component_start_ = SafeToken(token_index_ + skip.value()).index;
component_start_ = token_index_ + skip.value();

// Next, move the `token_index_` so that the top of the loop will begin
// parsing the new component. The index will be automatically incremented by
// the parse loop, so we move one less than the indicated `skip` amount. This
// means `kNone` and `kOne` are equivalent for setting `token_index_`. Note,
// however, these enums do have a different effect on setting
// `component_start_` above.
if (skip.value() > 1)
token_index_ += (skip.value() - 1);
token_index_ += skip.value();
token_increment_ = 0;
}

// Moves the parse position back to `component_start_` so the tokens of the
// current component can be examined again.
void Parser::Rewind() {
  // Clear the pending increment so that the position assigned below is not
  // advanced by the caller's `token_index_ += token_increment_` step.
  token_increment_ = 0;
  token_index_ = component_start_;
}

// Switches the state machine to `new_state` and rewinds to the start of the
// current component. No component string is produced or stored here.
void Parser::RewindAndSetState(StringParseState new_state) {
  state_ = new_state;
  Rewind();
}

const liburlpattern::Token& Parser::SafeToken(size_t index) const {
Expand All @@ -328,17 +350,17 @@ bool Parser::IsNonSpecialPatternChar(size_t index, const char* value) const {
token.type == liburlpattern::TokenType::kInvalidChar);
}

bool Parser::IsProtocolSuffix(size_t index) const {
return IsNonSpecialPatternChar(index, ":");
bool Parser::IsProtocolSuffix() const {
return IsNonSpecialPatternChar(token_index_, ":");
}

// Returns true when the two tokens immediately after the current token are
// both non-special `/` characters, e.g. the `//` in `https://`.
bool Parser::NextIsAuthoritySlashes() const {
  // Check the nearer slash first; the second lookup only happens when the
  // first one matched.
  if (!IsNonSpecialPatternChar(token_index_ + 1, "/"))
    return false;
  return IsNonSpecialPatternChar(token_index_ + 2, "/");
}

bool Parser::IsIdentityTerminator(size_t index) const {
return IsNonSpecialPatternChar(index, "@");
bool Parser::IsIdentityTerminator() const {
return IsNonSpecialPatternChar(token_index_, "@");
}

bool Parser::IsPasswordPrefix() const {
Expand All @@ -349,20 +371,21 @@ bool Parser::IsPortPrefix() const {
return IsNonSpecialPatternChar(token_index_, ":");
}

bool Parser::IsPathnameStart(size_t index) const {
return IsNonSpecialPatternChar(index, "/");
bool Parser::IsPathnameStart() const {
return IsNonSpecialPatternChar(token_index_, "/");
}

bool Parser::IsSearchPrefix(size_t index) const {
if (IsNonSpecialPatternChar(index, "?"))
bool Parser::IsSearchPrefix() const {
if (IsNonSpecialPatternChar(token_index_, "?"))
return true;

if (token_list_[index].value != "?")
if (token_list_[token_index_].value != "?")
return false;

// If we have a "?" that is not a normal character, then it must be an
// optional group modifier.
DCHECK_EQ(SafeToken(index).type, liburlpattern::TokenType::kOtherModifier);
DCHECK_EQ(SafeToken(token_index_).type,
liburlpattern::TokenType::kOtherModifier);

// We have a `?` tokenized as a modifier. We only want to treat this as
// the search prefix if it would not normally be valid in a liburlpattern
Expand All @@ -386,14 +409,14 @@ bool Parser::IsSearchPrefix(size_t index) const {
// Note, if `token_index_` is zero the index will wrap around and
// `SafeToken()` will return the kEnd token. This will correctly return true
// from this method as a pattern cannot normally begin with an unescaped `?`.
const auto& previous_token = SafeToken(index - 1);
const auto& previous_token = SafeToken(token_index_ - 1);
return previous_token.type != liburlpattern::TokenType::kName &&
previous_token.type != liburlpattern::TokenType::kRegex &&
previous_token.type != liburlpattern::TokenType::kClose &&
previous_token.type != liburlpattern::TokenType::kAsterisk;
}

bool Parser::IsHashPrefix(size_t index) const {
bool Parser::IsHashPrefix() const {
return IsNonSpecialPatternChar(token_index_, "#");
}

Expand All @@ -409,14 +432,16 @@ String Parser::MakeComponentString() const {
DCHECK_LT(token_index_, token_list_.size());
const auto& token = token_list_[token_index_];

DCHECK_LE(component_start_, utf8_.size());
DCHECK_GE(token.index, component_start_);
size_t component_char_start = SafeToken(component_start_).index;

DCHECK_LE(component_char_start, utf8_.size());
DCHECK_GE(token.index, component_char_start);
DCHECK(token.index < utf8_.size() ||
(token.index == utf8_.size() &&
token.type == liburlpattern::TokenType::kEnd));

return String::FromUTF8(utf8_.data() + component_start_,
token.index - component_start_);
return String::FromUTF8(utf8_.data() + component_char_start,
token.index - component_char_start);
}

void Parser::ComputeShouldTreatAsStandardURL(ExceptionState& exception_state) {
Expand Down

0 comments on commit f66c35e

Please sign in to comment.