From 66e9711224479903574ac7b78ddcfb2acf65feaf Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 19 Jul 2023 14:51:23 -0400 Subject: [PATCH] Broaden use of SearchValues in TryFindNextPossibleStartingPosition in Regex SearchValues has been updated to have an ASCII fast-path for inputs that are not only ASCII. This means we can simplify TryFindNextPossibleStartingPosition in Regex to not track AsciiSet specially and instead just increase the number of characters we query the set for (from 5 to 128). That way, we'll use SearchValues rather than emitting our own helper up until a (semi-arbitrary) point where we deem it impossible or infeasible to enumerate all the chars that make up the set. --- .../gen/RegexGenerator.Emitter.cs | 120 ++++++++++-------- .../Text/RegularExpressions/RegexCompiler.cs | 19 ++- .../RegexFindOptimizations.cs | 6 +- .../RegularExpressions/RegexPrefixAnalyzer.cs | 26 ++-- 4 files changed, 91 insertions(+), 80 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index e54ffd3ecc72..19488703f07f 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -399,54 +399,65 @@ private static string EmitSearchValuesOrLiteral(ReadOnlySpan chars, Dictio } /// Adds a SearchValues instance declaration to the required helpers collection. - private static string EmitSearchValues(char[] asciiChars, Dictionary requiredHelpers) + private static string EmitSearchValues(char[] chars, Dictionary requiredHelpers) { - Debug.Assert(RegexCharClass.IsAscii(asciiChars)); + Array.Sort(chars); - // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. 
- byte[] bitmap = new byte[16]; - foreach (char c in asciiChars) + string fieldName; + if (RegexCharClass.IsAscii(chars)) { - bitmap[c >> 3] |= (byte)(1 << (c & 7)); + // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. + var bitmap = new byte[16]; + foreach (char c in chars) + { + bitmap[c >> 3] |= (byte)(1 << (c & 7)); + } + + string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty); + + fieldName = hexBitmap switch + { + "FFFFFFFF000000000000000000000080" => "s_asciiControl", + "000000000000FF030000000000000000" => "s_asciiDigits", + "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters", + "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits", + "000000000000FF037E0000007E000000" => "s_asciiHexDigits", + "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower", + "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper", + "00000000EEF7008C010000B800000028" => "s_asciiPunctuation", + "00000000010000000000000000000000" => "s_asciiSeparators", + "00000000100800700000004001000050" => "s_asciiSymbols", + "003E0000010000000000000000000000" => "s_asciiWhiteSpace", + "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars", + + "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl", + "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits", + "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters", + "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits", + "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower", + "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation", + "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators", + "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols", + "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper", + "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace", + "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars", + + _ => 
$"s_ascii_{hexBitmap.TrimStart('0')}" + }; } - - string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty); - - string fieldName = hexBitmap switch + else { - "FFFFFFFF000000000000000000000080" => "s_asciiControl", - "000000000000FF030000000000000000" => "s_asciiDigits", - "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters", - "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits", - "000000000000FF037E0000007E000000" => "s_asciiHexDigits", - "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower", - "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper", - "00000000EEF7008C010000B800000028" => "s_asciiPunctuation", - "00000000010000000000000000000000" => "s_asciiSeparators", - "00000000100800700000004001000050" => "s_asciiSymbols", - "003E0000010000000000000000000000" => "s_asciiWhiteSpace", - "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars", - - "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl", - "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits", - "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters", - "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits", - "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower", - "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation", - "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators", - "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols", - "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper", - "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace", - "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars", - - _ => $"s_ascii_{hexBitmap.TrimStart('0')}" - }; + using (SHA256 sha = SHA256.Create()) + { +#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0 + fieldName = $"s_nonAscii_{BitConverter.ToString(sha.ComputeHash(Encoding.UTF8.GetBytes(chars))).Replace("-", "")}"; +#pragma warning restore CA1850 + } + } if 
(!requiredHelpers.ContainsKey(fieldName)) { - Array.Sort(asciiChars); - - string setLiteral = Literal(new string(asciiChars)); + string setLiteral = Literal(new string(chars)); requiredHelpers.Add(fieldName, new string[] { @@ -465,12 +476,12 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary(); + var excludedAsciiChars = new List(); for (int i = 0; i < 128; i++) { if (!RegexCharClass.CharInClass((char)i, set)) { - asciiChars.Add((char)i); + excludedAsciiChars.Add((char)i); } } @@ -538,9 +549,9 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary span)"); lines.Add($"{{"); int uncheckedStart = lines.Count; - lines.Add(asciiChars.Count == 128 ? + lines.Add(excludedAsciiChars.Count == 128 ? $" int i = span.IndexOfAnyExceptInRange('\0', '\u007f');" : - $" int i = span.IndexOfAnyExcept({EmitSearchValues(asciiChars.ToArray(), requiredHelpers)});"); + $" int i = span.IndexOfAnyExcept({EmitSearchValues(excludedAsciiChars.ToArray(), requiredHelpers)});"); lines.Add($" if ((uint)i < (uint)span.Length)"); lines.Add($" {{"); lines.Add($" if (char.IsAscii(span[i]))"); @@ -1067,6 +1078,8 @@ void EmitFixedSet_LeftToRight() string indexOf; if (primarySet.Chars is not null) { + Debug.Assert(primarySet.Chars.Length > 0); + // We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload. 
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny"; if (primarySet.Negated) @@ -1076,18 +1089,19 @@ void EmitFixedSet_LeftToRight() indexOf = primarySet.Chars.Length switch { + // 1, 2, 3 have dedicated optimized IndexOfAny overloads 1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})", 2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", 3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", - _ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})", + + // 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan overload, + // but can also be handled via SearchValues + 4 or 5 => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})", + + // > 5 can only be handled efficiently via SearchValues + _ => $"{span}.{indexOfAnyName}({EmitSearchValues(primarySet.Chars, requiredHelpers)})", }; } - else if (primarySet.AsciiSet is not null) - { - // We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it. - Debug.Assert(!primarySet.Negated); - indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})"; - } else if (primarySet.Range is not null) { // We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case, @@ -1102,8 +1116,8 @@ void EmitFixedSet_LeftToRight() } else { - // We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that - // will perform the search as efficiently as possible. + // We have an arbitrary set of characters that's really large or otherwise not enumerable. + // We use a custom IndexOfAny helper that will perform the search as efficiently as possible. 
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()"; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 97e728a08ca6..20ccc3afefcc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -903,6 +903,7 @@ void EmitFixedSet_LeftToRight() if (primarySet.Chars is not null) { + Debug.Assert(primarySet.Chars.Length > 0); switch (primarySet.Chars.Length) { case 1: @@ -926,19 +927,23 @@ void EmitFixedSet_LeftToRight() Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar); break; - default: + case 4 or 5: + // tmp = ...IndexOfAny("abcd"); + // Note that this case differs slightly from the source generator, where it might choose to use + // SearchValues instead of a literal, but there's extra cost to doing so for RegexCompiler so + // it just always uses IndexOfAny(span). Ldstr(new string(primarySet.Chars)); Call(s_stringAsSpanMethod); Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan); break; + + default: + // tmp = ...IndexOfAny(s_searchValues); + LoadSearchValues(primarySet.Chars); + Call(primarySet.Negated ? 
s_spanIndexOfAnyExceptSearchValues : s_spanIndexOfAnySearchValues); + break; } } - else if (primarySet.AsciiSet is not null) - { - Debug.Assert(!primarySet.Negated); - LoadSearchValues(primarySet.AsciiSet); - Call(s_spanIndexOfAnySearchValues); - } else if (primarySet.Range is not null) { if (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 7597b37edcc9..517c9da6b427 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -271,8 +271,6 @@ public FixedDistanceSet(char[]? chars, string set, int distance) public int Distance; /// As an alternative to , a description of the single range the set represents, if it does. public (char LowInclusive, char HighInclusive)? Range; - /// As an alternative to , a description of the set of ASCII characters it represents, if it does. - public char[]? AsciiSet; } /// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop. @@ -593,7 +591,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, char[]? chars = primarySet.Chars; ReadOnlySpan span = textSpan.Slice(pos); - if (chars is not null) + if (chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} without SearchValues { int i = primarySet.Negated ? 
span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars); if (i >= 0) @@ -660,7 +658,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength); - if (primarySet.Chars is not null) + if (primarySet.Chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} { for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 56ff64cdb182..96c50215c7b7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -194,7 +194,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) TryFindRawFixedSets(root, results, ref distance, thorough); #if DEBUG results.ForEach(r => Debug.Assert( - !r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null, + !r.Negated && r.Chars is null && r.Range is null, $"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}")); #endif @@ -225,31 +225,25 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) // For every entry, try to get the chars that make up the set, if there are few enough. // For any for which we couldn't get the small chars list, see if we can get other useful info. 
- Span scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today + Span scratch = stackalloc char[128]; // limit based on what's currently efficiently handled by SearchValues for (int i = 0; i < results.Count; i++) { RegexFindOptimizations.FixedDistanceSet result = results[i]; result.Negated = RegexCharClass.IsNegated(result.Set); int count = RegexCharClass.GetSetChars(result.Set, scratch); - if (count > 0) { result.Chars = scratch.Slice(0, count).ToArray(); } - if (thorough) + // Prefer IndexOfAnyInRange over IndexOfAny for sets of more than 2 values (or sets with no enumerated chars) that fit in a single range. + if (thorough && + (result.Chars is null || result.Chars.Length > 2) && + RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) { - // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. - if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) - { - result.Chars = null; - result.Range = (lowInclusive, highInclusive); - } - else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars)) - { - result.AsciiSet = asciiChars; - } + result.Chars = null; + result.Range = (lowInclusive, highInclusive); } results[i] = result; @@ -472,8 +466,8 @@ static bool TryFindRawFixedSets(RegexNode node, List { - char[]? s1Chars = s1.Chars ?? s1.AsciiSet; - char[]? s2Chars = s2.Chars ?? s2.AsciiSet; + char[]? s1Chars = s1.Chars; + char[]? s2Chars = s2.Chars; int s1CharsLength = s1Chars?.Length ?? 0; int s2CharsLength = s2Chars?.Length ?? 0; bool s1Negated = s1.Negated;