Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Broaden use of SearchValues in TryFindNextPossibleStartingPosition in Regex #89205

Merged
merged 1 commit into from Jul 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -399,54 +399,65 @@ private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictio
}

/// <summary>Adds a SearchValues instance declaration to the required helpers collection.</summary>
private static string EmitSearchValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
private static string EmitSearchValues(char[] chars, Dictionary<string, string[]> requiredHelpers)
{
Debug.Assert(RegexCharClass.IsAscii(asciiChars));
Array.Sort(chars);

// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
byte[] bitmap = new byte[16];
foreach (char c in asciiChars)
string fieldName;
if (RegexCharClass.IsAscii(chars))
{
bitmap[c >> 3] |= (byte)(1 << (c & 7));
// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
var bitmap = new byte[16];
foreach (char c in chars)
{
bitmap[c >> 3] |= (byte)(1 << (c & 7));
}

string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);

fieldName = hexBitmap switch
{
"FFFFFFFF000000000000000000000080" => "s_asciiControl",
"000000000000FF030000000000000000" => "s_asciiDigits",
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
"000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
"000000000000FF037E0000007E000000" => "s_asciiHexDigits",
"000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
"000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
"00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
"00000000010000000000000000000000" => "s_asciiSeparators",
"00000000100800700000004001000050" => "s_asciiSymbols",
"003E0000010000000000000000000000" => "s_asciiWhiteSpace",
"000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",

"00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
"FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
"FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
"FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
"FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
"FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
"FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
"FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
"FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
"FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
"FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",

_ => $"s_ascii_{hexBitmap.TrimStart('0')}"
};
}

string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);

string fieldName = hexBitmap switch
else
{
"FFFFFFFF000000000000000000000080" => "s_asciiControl",
"000000000000FF030000000000000000" => "s_asciiDigits",
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
"000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
"000000000000FF037E0000007E000000" => "s_asciiHexDigits",
"000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
"000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
"00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
"00000000010000000000000000000000" => "s_asciiSeparators",
"00000000100800700000004001000050" => "s_asciiSymbols",
"003E0000010000000000000000000000" => "s_asciiWhiteSpace",
"000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",

"00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
"FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
"FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
"FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
"FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
"FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
"FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
"FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
"FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
"FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
"FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",

_ => $"s_ascii_{hexBitmap.TrimStart('0')}"
};
using (SHA256 sha = SHA256.Create())
{
#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
fieldName = $"s_nonAscii_{BitConverter.ToString(sha.ComputeHash(Encoding.UTF8.GetBytes(chars))).Replace("-", "")}";
#pragma warning restore CA1850
}
}

if (!requiredHelpers.ContainsKey(fieldName))
{
Array.Sort(asciiChars);

string setLiteral = Literal(new string(asciiChars));
string setLiteral = Literal(new string(chars));

requiredHelpers.Add(fieldName, new string[]
{
Expand All @@ -465,12 +476,12 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
// a sequential walk). In order to do that search, we actually build up a set for all of the ASCII
// characters _not_ contained in the set, and then do a search for the inverse of that, which will be
// all of the target ASCII characters and all of non-ASCII.
var asciiChars = new List<char>();
var excludedAsciiChars = new List<char>();
for (int i = 0; i < 128; i++)
{
if (!RegexCharClass.CharInClass((char)i, set))
{
asciiChars.Add((char)i);
excludedAsciiChars.Add((char)i);
}
}

Expand Down Expand Up @@ -538,9 +549,9 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
lines.Add($"internal static int {helperName}(this ReadOnlySpan<char> span)");
lines.Add($"{{");
int uncheckedStart = lines.Count;
lines.Add(asciiChars.Count == 128 ?
lines.Add(excludedAsciiChars.Count == 128 ?
$" int i = span.IndexOfAnyExceptInRange('\0', '\u007f');" :
$" int i = span.IndexOfAnyExcept({EmitSearchValues(asciiChars.ToArray(), requiredHelpers)});");
$" int i = span.IndexOfAnyExcept({EmitSearchValues(excludedAsciiChars.ToArray(), requiredHelpers)});");
lines.Add($" if ((uint)i < (uint)span.Length)");
lines.Add($" {{");
lines.Add($" if (char.IsAscii(span[i]))");
Expand Down Expand Up @@ -1067,6 +1078,8 @@ void EmitFixedSet_LeftToRight()
string indexOf;
if (primarySet.Chars is not null)
{
Debug.Assert(primarySet.Chars.Length > 0);

// We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
if (primarySet.Negated)
Expand All @@ -1076,18 +1089,19 @@ void EmitFixedSet_LeftToRight()

indexOf = primarySet.Chars.Length switch
{
// 1, 2, 3 have dedicated optimized IndexOfAny overloads
1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
_ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",

// 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan<char> overload,
// but can also be handled via SearchValues
4 or 5 => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",

// > 5 can only be handled efficiently via SearchValues
_ => $"{span}.{indexOfAnyName}({EmitSearchValues(primarySet.Chars, requiredHelpers)})",
};
}
else if (primarySet.AsciiSet is not null)
{
// We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
Debug.Assert(!primarySet.Negated);
indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
}
else if (primarySet.Range is not null)
{
// We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
Expand All @@ -1102,8 +1116,8 @@ void EmitFixedSet_LeftToRight()
}
else
{
// We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
// will perform the search as efficiently as possible.
// We have an arbitrary set of characters that's really large or otherwise not enumerable.
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
// We use a custom IndexOfAny helper that will perform the search as efficiently as possible.
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
}

Expand Down
Expand Up @@ -903,6 +903,7 @@ void EmitFixedSet_LeftToRight()

if (primarySet.Chars is not null)
{
Debug.Assert(primarySet.Chars.Length > 0);
switch (primarySet.Chars.Length)
{
case 1:
Expand All @@ -926,19 +927,23 @@ void EmitFixedSet_LeftToRight()
Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
break;

default:
case 4 or 5:
// tmp = ...IndexOfAny("abcd");
// Note that this case differs slightly from the source generator, where it might choose to use
// SearchValues instead of a literal, but there's extra cost to doing so for RegexCompiler so
// it just always uses IndexOfAny(span).
Ldstr(new string(primarySet.Chars));
Call(s_stringAsSpanMethod);
Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
break;

default:
// tmp = ...IndexOfAny(s_searchValues);
LoadSearchValues(primarySet.Chars);
Call(primarySet.Negated ? s_spanIndexOfAnyExceptSearchValues : s_spanIndexOfAnySearchValues);
break;
}
}
else if (primarySet.AsciiSet is not null)
{
Debug.Assert(!primarySet.Negated);
LoadSearchValues(primarySet.AsciiSet);
Call(s_spanIndexOfAnySearchValues);
}
else if (primarySet.Range is not null)
{
if (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive)
Expand Down
Expand Up @@ -271,8 +271,6 @@ public FixedDistanceSet(char[]? chars, string set, int distance)
public int Distance;
/// <summary>As an alternative to <see cref="Chars"/>, a description of the single range the set represents, if it does.</summary>
public (char LowInclusive, char HighInclusive)? Range;
/// <summary>As an alternative to <see cref="Chars"/>, a description of the set of ASCII characters it represents, if it does.</summary>
public char[]? AsciiSet;
}

/// <summary>When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop.</summary>
Expand Down Expand Up @@ -593,7 +591,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
char[]? chars = primarySet.Chars;

ReadOnlySpan<char> span = textSpan.Slice(pos);
if (chars is not null)
if (chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} without SearchValues
{
int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
if (i >= 0)
Expand Down Expand Up @@ -660,7 +658,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,

int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength);

if (primarySet.Chars is not null)
if (primarySet.Chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except}
{
for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
{
Expand Down
Expand Up @@ -194,7 +194,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
TryFindRawFixedSets(root, results, ref distance, thorough);
#if DEBUG
results.ForEach(r => Debug.Assert(
!r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null,
!r.Negated && r.Chars is null && r.Range is null,
$"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}"));
#endif

Expand Down Expand Up @@ -225,31 +225,25 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)

// For every entry, try to get the chars that make up the set, if there are few enough.
// For any for which we couldn't get the small chars list, see if we can get other useful info.
Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
Span<char> scratch = stackalloc char[128]; // limit based on what's currently efficiently handled by SearchValues
for (int i = 0; i < results.Count; i++)
{
RegexFindOptimizations.FixedDistanceSet result = results[i];
result.Negated = RegexCharClass.IsNegated(result.Set);

int count = RegexCharClass.GetSetChars(result.Set, scratch);

if (count > 0)
{
result.Chars = scratch.Slice(0, count).ToArray();
}

if (thorough)
// Prefer IndexOfAnyInRange over IndexOfAny for any set that fits in a single range, unless it contains only 1 or 2 values.
if (thorough &&
(result.Chars is null || result.Chars.Length > 2) &&
RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
{
// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
{
result.Chars = null;
result.Range = (lowInclusive, highInclusive);
}
else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars))
{
result.AsciiSet = asciiChars;
}
result.Chars = null;
result.Range = (lowInclusive, highInclusive);
}

results[i] = result;
Expand Down Expand Up @@ -472,8 +466,8 @@ static bool TryFindRawFixedSets(RegexNode node, List<RegexFindOptimizations.Fixe
// for the fastest and that have the best chance of matching as few false positives as possible.
results.Sort(static (s1, s2) =>
{
char[]? s1Chars = s1.Chars ?? s1.AsciiSet;
char[]? s2Chars = s2.Chars ?? s2.AsciiSet;
char[]? s1Chars = s1.Chars;
char[]? s2Chars = s2.Chars;
int s1CharsLength = s1Chars?.Length ?? 0;
int s2CharsLength = s2Chars?.Length ?? 0;
bool s1Negated = s1.Negated;
Expand Down