Skip to content

Commit

Permalink
regexp: fix repeat of preferred empty match
Browse files Browse the repository at this point in the history
In Perl mode, (|a)* should match an empty string at the start of the
input. Instead it matches as many a's as possible.
Because (|a)+ is handled correctly, matching only an empty string,
this leads to the paradox that e* can match more text than e+
(for e = (|a)) and that e+ is sometimes different from ee*.

This is a very old bug that ultimately derives from the picture I drew
for e* in https://swtch.com/~rsc/regexp/regexp1.html. The picture is
correct for longest-match (POSIX) regexps but subtly wrong for
preferred-match (Perl) regexps in the case where e has a preferred
empty match. Pointed out by Andrew Gallant in private mail.

The current code treats e* and e+ as the same structure, with
different entry points. In the case of e* the preference list ends up
not quite in the right order, in part because the “before e” and
“after e” states are the same state. Splitting them apart fixes the
preference list, and that can be done by compiling e* as if it were
(e+)?.

Like with any bug fix, there is a very low chance of breaking a
program that accidentally depends on the buggy behavior.

RE2, Go, and Rust all have this bug, and we've all agreed to fix it,
to keep the implementations in sync.

Fixes #46123.

Change-Id: I70e742e71e0a23b626593b16ddef3c1e73b413b0
Reviewed-on: https://go-review.googlesource.com/c/go/+/318750
Trust: Russ Cox <rsc@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Rob Pike <r@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
  • Loading branch information
rsc committed May 13, 2021
1 parent fd4631e commit 2a61b3c
Show file tree
Hide file tree
Showing 8 changed files with 176 additions and 46 deletions.
1 change: 1 addition & 0 deletions src/regexp/find_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ var findTests = []FindTest{
{`\B`, "xx", build(1, 1, 1)},
{`\B`, "x y", nil},
{`\B`, "xx yy", build(2, 1, 1, 4, 4)},
{`(|a)*`, "aa", build(3, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2)},

// RE2 tests
{`[^\S\s]`, "abcd", nil},
Expand Down
2 changes: 1 addition & 1 deletion src/regexp/onepass_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ var onePassTests = []struct {
{`^(?:(a)|(?:a*))$`, false},
{`^(?:(?:(?:.(?:$))?))$`, true},
{`^abcd$`, true},
{`^(?:(?:a{0,})*?)$`, true},
{`^(?:(?:a{0,})*?)$`, false},
{`^(?:(?:a+)*)$`, true},
{`^(?:(?:a|(?:aa)))$`, true},
{`^(?:[^\s\S])$`, true},
Expand Down
29 changes: 23 additions & 6 deletions src/regexp/syntax/compile.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ func (l1 patchList) append(p *Prog, l2 patchList) patchList {

// A frag represents a compiled program fragment.
type frag struct {
i uint32 // index of first instruction
out patchList // where to record end instruction
i uint32 // index of first instruction
out patchList // where to record end instruction
nullable bool // whether fragment can match empty string
}

type compiler struct {
Expand Down Expand Up @@ -159,7 +160,7 @@ func (c *compiler) compile(re *Regexp) frag {

func (c *compiler) inst(op InstOp) frag {
// TODO: impose length limit
f := frag{i: uint32(len(c.p.Inst))}
f := frag{i: uint32(len(c.p.Inst)), nullable: true}
c.p.Inst = append(c.p.Inst, Inst{Op: op})
return f
}
Expand Down Expand Up @@ -194,7 +195,7 @@ func (c *compiler) cat(f1, f2 frag) frag {
// TODO: elide nop

f1.out.patch(c.p, f2.i)
return frag{f1.i, f2.out}
return frag{f1.i, f2.out, f1.nullable && f2.nullable}
}

func (c *compiler) alt(f1, f2 frag) frag {
Expand All @@ -211,6 +212,7 @@ func (c *compiler) alt(f1, f2 frag) frag {
i.Out = f1.i
i.Arg = f2.i
f.out = f1.out.append(c.p, f2.out)
f.nullable = f1.nullable || f2.nullable
return f
}

Expand All @@ -228,7 +230,12 @@ func (c *compiler) quest(f1 frag, nongreedy bool) frag {
return f
}

func (c *compiler) star(f1 frag, nongreedy bool) frag {
// loop returns the fragment for the main loop of a plus or star.
// For plus, it can be used after changing the entry to f1.i.
// For star, it can be used directly when f1 can't match an empty string.
// (When f1 can match an empty string, f1* must be implemented as (f1+)?
// to get the priority match order correct.)
func (c *compiler) loop(f1 frag, nongreedy bool) frag {
f := c.inst(InstAlt)
i := &c.p.Inst[f.i]
if nongreedy {
Expand All @@ -242,8 +249,17 @@ func (c *compiler) star(f1 frag, nongreedy bool) frag {
return f
}

func (c *compiler) star(f1 frag, nongreedy bool) frag {
if f1.nullable {
// Use (f1+)? to get priority match order correct.
// See golang.org/issue/46123.
return c.quest(c.plus(f1, nongreedy), nongreedy)
}
return c.loop(f1, nongreedy)
}

func (c *compiler) plus(f1 frag, nongreedy bool) frag {
return frag{f1.i, c.star(f1, nongreedy).out}
return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
}

func (c *compiler) empty(op EmptyOp) frag {
Expand All @@ -255,6 +271,7 @@ func (c *compiler) empty(op EmptyOp) frag {

func (c *compiler) rune(r []rune, flags Flags) frag {
f := c.inst(InstRune)
f.nullable = false
i := &c.p.Inst[f.i]
i.Rune = r
flags &= FoldCase // only relevant flag is FoldCase
Expand Down
15 changes: 15 additions & 0 deletions src/regexp/syntax/prog_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,21 @@ var compileTests = []struct {
1* empty 4 -> 2
2 anynotnl -> 3
3 match
`},
{"(?:|a)+", ` 0 fail
1 nop -> 4
2 rune1 "a" -> 4
3* alt -> 1, 2
4 alt -> 3, 5
5 match
`},
{"(?:|a)*", ` 0 fail
1 nop -> 4
2 rune1 "a" -> 4
3 alt -> 1, 2
4 alt -> 3, 6
5* alt -> 3, 6
6 match
`},
}

Expand Down
12 changes: 4 additions & 8 deletions src/regexp/testdata/basic.dat
Original file line number Diff line number Diff line change
Expand Up @@ -124,24 +124,20 @@ E ((a)) abc (0,1)(0,1)(0,1)
E (a)b(c) abc (0,3)(0,1)(2,3)
E a+b+c aabbabc (4,7)
E a* aaa (0,3)
#E (a*)* - (0,0)(0,0)
E (a*)* - (0,0)(?,?) RE2/Go
E (a*)* - (0,0)(0,0)
E (a*)+ - (0,0)(0,0)
#E (a*|b)* - (0,0)(0,0)
E (a*|b)* - (0,0)(?,?) RE2/Go
E (a*|b)* - (0,0)(0,0)
E (a+|b)* ab (0,2)(1,2)
E (a+|b)+ ab (0,2)(1,2)
E (a+|b)? ab (0,1)(0,1)
BE [^ab]* cde (0,3)
#E (^)* - (0,0)(0,0)
E (^)* - (0,0)(?,?) RE2/Go
E (^)* - (0,0)(0,0)
BE a* NULL (0,0)
E ([abc])*d abbbcd (0,6)(4,5)
E ([abc])*bcd abcd (0,4)(0,1)
E a|b|c|d|e e (0,1)
E (a|b|c|d|e)f ef (0,2)(0,1)
#E ((a*|b))* - (0,0)(0,0)(0,0)
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
E ((a*|b))* - (0,0)(0,0)(0,0)
BE abcd*efg abcdefg (0,7)
BE ab* xabyabbbz (1,3)
BE ab* xayabbbz (1,2)
Expand Down
18 changes: 6 additions & 12 deletions src/regexp/testdata/nullsubexpr.dat
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
NOTE null subexpression matches : 2002-06-06

E (a*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a*)+ a (0,1)(0,1)
Expand All @@ -19,17 +18,15 @@ E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)

E ([a]*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([^b]*)* a (0,1)(0,1)
#E SAME b (0,0)(0,0)
E SAME b (0,0)(?,?) RE2/Go
E SAME b (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaab (0,6)(0,6)
E ([ab]*)* a (0,1)(0,1)
Expand All @@ -41,11 +38,9 @@ E SAME bbbbbb (0,6)(0,6)
E SAME aaaabcde (0,5)(0,5)
E ([^a]*)* b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
#E SAME aaaaaa (0,0)(0,0)
E SAME aaaaaa (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,0)(0,0)
E ([^ab]*)* ccccxx (0,6)(0,6)
#E SAME ababab (0,0)(0,0)
E SAME ababab (0,0)(?,?) RE2/Go
E SAME ababab (0,0)(0,0)

E ((z)+|a)* zabcde (0,2)(1,2)

Expand All @@ -65,8 +60,7 @@ B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)

#E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) ax (0,2)(0,1)(1,2)
E (a*)*(x) axa (0,2)(0,1)(1,2)

Expand Down
Binary file modified src/regexp/testdata/re2-exhaustive.txt.bz2
Binary file not shown.
145 changes: 126 additions & 19 deletions src/regexp/testdata/re2-search.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# RE2 basic search tests built by make log
# Thu Sep 8 13:43:43 EDT 2011
# Wed May 12 12:13:22 EDT 2021
Regexp.SearchTests
strings
""
Expand Down Expand Up @@ -227,22 +227,6 @@ regexps
0-0;0-0;0-0;0-0
strings
""
""
regexps
"a*"
0-0;0-0;0-0;0-0
0-0;0-0;0-0;0-0
"^(?:a*)$"
0-0;0-0;0-0;0-0
0-0;0-0;0-0;0-0
"^(?:a*)"
0-0;0-0;0-0;0-0
0-0;0-0;0-0;0-0
"(?:a*)$"
0-0;0-0;0-0;0-0
0-0;0-0;0-0;0-0
strings
""
"xabcdx"
regexps
"ab|cd"
Expand Down Expand Up @@ -3651,6 +3635,86 @@ regexps
0-1;0-1;0-1;0-1
strings
""
"a"
regexps
"a\\C+"
-;-;-;-
-;-;-;-
"^(?:a\\C+)$"
-;-;-;-
-;-;-;-
"^(?:a\\C+)"
-;-;-;-
-;-;-;-
"(?:a\\C+)$"
-;-;-;-
-;-;-;-
strings
""
"a"
regexps
"a\\C?"
-;-;-;-
0-1;0-1;0-1;0-1
"^(?:a\\C?)$"
-;-;-;-
0-1;0-1;0-1;0-1
"^(?:a\\C?)"
-;-;-;-
0-1;0-1;0-1;0-1
"(?:a\\C?)$"
-;-;-;-
0-1;0-1;0-1;0-1
strings
""
"a"
regexps
"a\\C*?"
-;-;-;-
0-1;0-1;0-1;0-1
"^(?:a\\C*?)$"
-;-;-;-
0-1;0-1;0-1;0-1
"^(?:a\\C*?)"
-;-;-;-
0-1;0-1;0-1;0-1
"(?:a\\C*?)$"
-;-;-;-
0-1;0-1;0-1;0-1
strings
""
"a"
regexps
"a\\C+?"
-;-;-;-
-;-;-;-
"^(?:a\\C+?)$"
-;-;-;-
-;-;-;-
"^(?:a\\C+?)"
-;-;-;-
-;-;-;-
"(?:a\\C+?)$"
-;-;-;-
-;-;-;-
strings
""
"a"
regexps
"a\\C??"
-;-;-;-
0-1;0-1;0-1;0-1
"^(?:a\\C??)$"
-;-;-;-
0-1;0-1;0-1;0-1
"^(?:a\\C??)"
-;-;-;-
0-1;0-1;0-1;0-1
"(?:a\\C??)$"
-;-;-;-
0-1;0-1;0-1;0-1
strings
""
"baba"
regexps
"a\\C*|ba\\C"
Expand All @@ -3666,7 +3730,50 @@ regexps
-;-;-;-
-;1-4;-;1-4
strings
"abc"
""
"Inc."
regexps
"a.*?c|a.*?b"
"\\w*I\\w*"
-;-;-;-
-;0-3;-;0-3
"^(?:\\w*I\\w*)$"
-;-;-;-
-;-;-;-
"^(?:\\w*I\\w*)"
-;-;-;-
-;0-3;-;0-3
"(?:\\w*I\\w*)$"
-;-;-;-
-;-;-;-
strings
""
"aaa"
regexps
"(?:|a)*"
0-0;0-0;0-0;0-0
0-3;0-0;0-3;0-3
"^(?:(?:|a)*)$"
0-0;0-0;0-0;0-0
0-3;0-3;0-3;0-3
"^(?:(?:|a)*)"
0-0;0-0;0-0;0-0
0-3;0-0;0-3;0-3
"(?:(?:|a)*)$"
0-0;0-0;0-0;0-0
0-3;0-3;0-3;0-3
strings
""
"aaa"
regexps
"(?:|a)+"
0-0;0-0;0-0;0-0
0-3;0-0;0-3;0-3
"^(?:(?:|a)+)$"
0-0;0-0;0-0;0-0
0-3;0-3;0-3;0-3
"^(?:(?:|a)+)"
0-0;0-0;0-0;0-0
0-3;0-0;0-3;0-3
"(?:(?:|a)+)$"
0-0;0-0;0-0;0-0
0-3;0-3;0-3;0-3

0 comments on commit 2a61b3c

Please sign in to comment.