Skip to content

Commit

Permalink
Add support for (?<name>expr).
Browse files: browse the repository at this point in the history
This follows golang/go@ee61186 to some
extent. I took the opportunity to simplify the parsing logic and
also fixed a bug in `Regexp::Equal()` that had gone unnoticed...

Change-Id: I90abec942d39b02a1c6d1ac95cd3b1cc66ec7b2a
Reviewed-on: https://code-review.googlesource.com/c/re2/+/61690
Reviewed-by: Alex Chernyakhovsky <achernya@google.com>
Reviewed-by: Paul Wankadia <junyer@google.com>
  • Loading branch information
junyer committed Aug 11, 2023
1 parent cb000a8 commit 6148386
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 16 deletions.
2 changes: 1 addition & 1 deletion doc/syntax.html
Expand Up @@ -62,7 +62,7 @@ <h1>RE2 regular expression syntax reference</h1>
<tr><td colspan=2><b>Grouping:</b></td></tr>
<tr><td><code>(re)</code></td><td>numbered capturing group (submatch)</td></tr>
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
<tr><td><code>(?&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
Expand Down
2 changes: 1 addition & 1 deletion doc/syntax.txt
Expand Up @@ -51,7 +51,7 @@ x{n}+ exactly «n» «x», possessive NOT SUPPORTED
Grouping:
(re) numbered capturing group (submatch)
(?P<name>re) named & numbered capturing group (submatch)
(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED
(?<name>re) named & numbered capturing group (submatch)
(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
Expand Down
26 changes: 13 additions & 13 deletions re2/parse.cc
Expand Up @@ -2059,8 +2059,6 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
return false;
}

t.remove_prefix(2); // "(?"

// Check for named captures, first introduced in Python's regexp library.
// As usual, there are three slightly different syntaxes:
//
Expand All @@ -2074,22 +2072,23 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
// support all three as well. EcmaScript 4 uses only the Python form.
//
// In both the open source world (via Code Search) and the
// Google source tree, (?P<expr>name) is the dominant form,
// so that's the one we implement. One is enough.
if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
// Google source tree, (?P<name>expr) and (?<name>expr) are the
// dominant forms of named captures and both are supported.
if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') ||
(t.size() > 3 && t[2] == '<')) {
// Pull out name.
size_t end = t.find('>', 2);
size_t begin = t[2] == 'P' ? 4 : 3;
size_t end = t.find('>', begin);
if (end == absl::string_view::npos) {
if (!IsValidUTF8(*s, status_))
if (!IsValidUTF8(t, status_))
return false;
status_->set_code(kRegexpBadNamedCapture);
status_->set_error_arg(*s);
status_->set_error_arg(t);
return false;
}

// t is "P<name>...", t[end] == '>'
absl::string_view capture(t.data()-2, end+3); // "(?P<name>"
absl::string_view name(t.data()+2, end-2); // "name"
absl::string_view capture(t.data(), end+1);
absl::string_view name(t.data()+begin, end-begin);
if (!IsValidUTF8(name, status_))
return false;
if (!IsValidCaptureName(name)) {
Expand All @@ -2103,11 +2102,12 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
return false;
}

s->remove_prefix(
static_cast<size_t>(capture.data() + capture.size() - s->data()));
s->remove_prefix(capture.size());
return true;
}

t.remove_prefix(2); // "(?"

bool negated = false;
bool sawflags = false;
int nflags = flags_;
Expand Down
8 changes: 7 additions & 1 deletion re2/regexp.cc
Expand Up @@ -400,7 +400,13 @@ static bool TopEqual(Regexp* a, Regexp* b) {
a->max() == b->max();

case kRegexpCapture:
return a->cap() == b->cap() && a->name() == b->name();
if (a->name() == NULL || b->name() == NULL) {
// One pointer is null, so the other pointer should also be null.
return a->cap() == b->cap() && a->name() == b->name();
} else {
// Neither pointer is null, so compare the pointees for equality.
return a->cap() == b->cap() && *a->name() == *b->name();
}

case kRegexpHaveMatch:
return a->match_id() == b->match_id();
Expand Down
18 changes: 18 additions & 0 deletions re2/testing/parse_test.cc
Expand Up @@ -166,6 +166,8 @@ static Test tests[] = {
// Test named captures
{ "(?P<name>a)", "cap{name:lit{a}}" },
{ "(?P<中文>a)", "cap{中文:lit{a}}" },
{ "(?<name>a)", "cap{name:lit{a}}" },
{ "(?<中文>a)", "cap{中文:lit{a}}" },

// Case-folded literals
{ "[Aa]", "litfold{a}" },
Expand Down Expand Up @@ -396,6 +398,11 @@ const char* badtests[] = {
"(?P<name",
"(?P<x y>a)",
"(?P<>a)",
"(?<name>a",
"(?<name>",
"(?<name",
"(?<x y>a)",
"(?<>a)",
"[a-Z]",
"(?i)[a-Z]",
"a{100000}",
Expand All @@ -416,6 +423,7 @@ const char* only_perl[] = {
"\\Q\\\\\\\\\\E",
"(?:a)",
"(?P<name>a)",
"(?<name>a)",
};

// Valid in POSIX, bad in Perl.
Expand Down Expand Up @@ -505,6 +513,16 @@ TEST(NamedCaptures, ErrorArgs) {
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?P<space bar>");

re = Regexp::Parse("test(?<name", Regexp::LikePerl, &status);
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?<name");

re = Regexp::Parse("test(?<space bar>z)", Regexp::LikePerl, &status);
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?<space bar>");
}

} // namespace re2

0 comments on commit 6148386

Please sign in to comment.