Fix boundary handling in Regex auto-atomicity optimization (#79088)

This fixes a regression that occurred between .NET Core 3.1 and .NET 5, when we added the auto-atomicity optimization. This optimization is based on the premise that if a loop is followed by something that can't possibly match anything the loop could give up when backtracking, then backtracking for that loop is wasted effort, and thus the loop can be made atomic. For example, given the expression `a*b`, there's nothing the `a*` loop could give up when backtracking that would also match `b`, thus this can be transformed into `(?>a*)b`. As part of this, we also factor in word boundaries, but we're too aggressive in our handling of them. If you have `a+\b`, there's nothing that `a+` could give up that would enable the `\b` to match. However, if you have `a*\b`, since `\b` is a zero-width assertion, it is actually possible for the `a*` to give up something (the empty match) that could match `\b`, yet we're erroneously still converting that to be `(?>a*)\b`. The fix is to constrain that part of the optimization to require a non-zero minimum bound on the loop in order to make it atomic. While it's simple to repro incorrect handling here, it's also rare to find in real-world use, as word boundary anchors are used to demarcate words, this issue really only affects cases of empty words, and it's unusual that someone would write an expression to try to identify empty words.
dotnet · Dec 1, 2022 · 41f57b7 · 41f57b7
1 parent 8841dd2
commit 41f57b7
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 10 deletions.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -2117,10 +2117,10 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
                             case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch:
                             case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch:
                             case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
-                            case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch):
-                            case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch):
-                            case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
-                            case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
+                            case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsBoundaryWordChar(node.Ch):
+                            case RegexNodeKind.NonBoundary when node.M > 0 && !RegexCharClass.IsBoundaryWordChar(node.Ch):
+                            case RegexNodeKind.ECMABoundary when node.M > 0 && RegexCharClass.IsECMAWordChar(node.Ch):
+                            case RegexNodeKind.NonECMABoundary when node.M > 0 && !RegexCharClass.IsECMAWordChar(node.Ch):
                                 // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
                                 break;
 
@@ -2163,10 +2163,10 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
 
                             case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
                             case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
-                            case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
-                            case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
-                            case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
-                            case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
+                            case RegexNodeKind.Boundary when node.M > 0 && node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
+                            case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
+                            case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
+                            case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
                                 // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
                                 break;
 

diff --git a/...aries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs b/...aries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs
@@ -292,6 +292,25 @@ public static IEnumerable<object[]> Matches_TestData()
                     }
                 };
 
+                foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.ECMAScript })
+                {
+                    if (RegexHelpers.IsNonBacktracking(engine))
+                    {
+                        continue;
+                    }
+
+                    yield return new object[]
+                    {
+                        engine,
+                        @"a?\b", "ac", options,
+                        new[]
+                        {
+                            new CaptureData("", 0, 0),
+                            new CaptureData("", 2, 0),
+                        }
+                    };
+                }
+
                 if (!PlatformDetection.IsNetFramework)
                 {
                     // .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075

diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
@@ -334,8 +334,10 @@ public class RegexReductionTests
         [InlineData("[^\n]*\n+", "(?>[^\n]*)(?>\n+)")]
         [InlineData("(a+)b", "((?>a+))b")]
         [InlineData("a*(?:bcd|efg)", "(?>a*)(?:bcd|efg)")]
-        [InlineData("\\w*\\b", "(?>\\w*)\\b")]
-        [InlineData("\\d*\\b", "(?>\\d*)\\b")]
+        [InlineData("\\w+\\b", "(?>\\w+)\\b")]
+        [InlineData("\\d+\\b", "(?>\\d+)\\b")]
+        [InlineData("\\W+\\B", "(?>\\W+)\\B")]
+        [InlineData("\\D+\\B", "(?>\\D+)\\B")]
         [InlineData("(?:abc*|def*)g", "(?:ab(?>c*)|de(?>f*))g")]
         [InlineData("(?:a[ce]*|b*)g", "(?:a(?>[ce]*)|(?>b*))g")]
         [InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")]
@@ -476,6 +478,11 @@ public void PatternsReduceIdentically(string actual, string expected)
         [InlineData(@"\w*\b\w+", @"(?>\w*)\b\w+")]
         [InlineData(@"\W+\B\W+", @"(?>\W+)\B\W")]
         [InlineData(@"\W*\B\W+", @"(?>\W*)\B\W")]
+        [InlineData(@"a?\b", @"(?>a?)\b")]
+        [InlineData(@"\w*\b", @"(?>\w*)\b")]
+        [InlineData(@"\d*\b", @"(?>\d*)\b")]
+        [InlineData(@"\W*\B", @"(?>\W*)\B")]
+        [InlineData(@"\D*\B", @"(?>\D*)\B")]
         // Loops inside alternation constructs
         [InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
         [InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]