Skip to content

Commit

Permalink
Improve choice of IndexOfXx routine for some TryFindNextStartingPosit…
Browse files Browse the repository at this point in the history
…ion implementations (#89099)

Earlier in .NET 8, we updated the Regex compiler and source generator to be able to vectorize a search for any set, not just simple ones.  When one of the main routines couldn't be used, we emit a specialized IndexOfAny helper that uses SearchValues to search for any matching ASCII character or a Unicode character, and if it encounters a Unicode character, it falls back to a linear scan.  This meant that a bunch of sets that wouldn't previously have taken these paths now do, but some of those sets have more efficient means of searching; for example, for the set `[^aA]` that searches case-insensitively for anything other than an 'A', with this scheme we'll emit a whole routine that uses SearchValues with a fallback, but we could just use IndexOfAnyExcept('A', 'a').  This fixes the compiler / source generator to prefer such helpers instead when available.
  • Loading branch information
stephentoub committed Jul 18, 2023
1 parent bced584 commit 74d69fd
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 86 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1063,25 +1063,49 @@ void EmitFixedSet_LeftToRight()
(true, _) => $"{span}.Slice(i + {primarySet.Distance})",
};

Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
// Get the IndexOf* expression to use to perform the search.
string indexOf;
if (primarySet.Chars is not null)
{
// We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
if (primarySet.Negated)
{
indexOfName = indexOfAnyName = "IndexOfAnyExcept";
}

string indexOf =
primarySet.Chars is not null ? primarySet.Chars.Length switch
indexOf = primarySet.Chars.Length switch
{
1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})",
2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
_ => $"{span}.IndexOfAny({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
} :
primarySet.AsciiSet is not null ? $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})" :
primarySet.Range is not null ? (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
_ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
};
}
else if (primarySet.AsciiSet is not null)
{
// We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
Debug.Assert(!primarySet.Negated);
indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
}
else if (primarySet.Range is not null)
{
// We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
// where we end up with a set of a single char, we can use IndexOf instead.
indexOf = (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
{
(false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
(true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})",
(false, true) => $"{span}.IndexOfAnyExceptInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
(true, true) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Range.Value.LowInclusive)})",
} :
$"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
};
}
else
{
// We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
// will perform the search as efficiently as possible.
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
}

if (needLoop)
{
Expand Down Expand Up @@ -1184,6 +1208,7 @@ void EmitFixedSet_RightToLeft()

if (set.Chars is { Length: 1 })
{
Debug.Assert(!set.Negated);
writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(set.Chars[0])});");
using (EmitBlock(writer, "if (pos >= 0)"))
{
Expand Down Expand Up @@ -3307,7 +3332,7 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
{
if (iterationCount is null &&
node.Kind is RegexNodeKind.Notonelazy &&
subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch
subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max efficiently optimized by IndexOfAny, and we need to reserve 1 for node.Ch
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
(literal.String is not null ||
literal.SetChars is not null ||
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -901,37 +901,35 @@ void EmitFixedSet_LeftToRight()
Ldloc(textSpanLocal);
}

Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));

if (primarySet.Chars is not null)
{
switch (primarySet.Chars.Length)
{
case 1:
// tmp = ...IndexOf(setChars[0]);
Ldc(primarySet.Chars[0]);
Call(s_spanIndexOfChar);
Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar);
break;

case 2:
// tmp = ...IndexOfAny(setChars[0], setChars[1]);
Ldc(primarySet.Chars[0]);
Ldc(primarySet.Chars[1]);
Call(s_spanIndexOfAnyCharChar);
Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar);
break;

case 3:
// tmp = ...IndexOfAny(setChars[0], setChars[1], setChars[2]);
Ldc(primarySet.Chars[0]);
Ldc(primarySet.Chars[1]);
Ldc(primarySet.Chars[2]);
Call(s_spanIndexOfAnyCharCharChar);
Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
break;

default:
Ldstr(new string(primarySet.Chars));
Call(s_stringAsSpanMethod);
Call(s_spanIndexOfAnySpan);
Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
break;
}
}
Expand Down Expand Up @@ -1166,6 +1164,8 @@ void EmitFixedSet_RightToLeft()

if (set.Chars is { Length: 1 })
{
Debug.Assert(!set.Negated);

// pos = inputSpan.Slice(0, pos).LastIndexOf(set.Chars[0]);
Ldloca(inputSpan);
Ldc(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;

namespace System.Text.RegularExpressions
{
Expand Down Expand Up @@ -95,7 +94,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
{
// See if the set is limited to holding only a few characters.
Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
int scratchCount;
char[]? chars = null;
if (!RegexCharClass.IsNegated(charClass) &&
Expand All @@ -109,12 +108,14 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
{
// The set contains one and only one character, meaning every match starts
// with the same literal value (potentially case-insensitive). Search for that.
Debug.Assert(!RegexCharClass.IsNegated(charClass));
FixedDistanceLiteral = (chars[0], null, 0);
FindMode = FindNextStartingPositionMode.LeadingChar_RightToLeft;
}
else
{
// The set may match multiple characters. Search for that.
Debug.Assert(!RegexCharClass.IsNegated(charClass) || chars is null);
FixedDistanceSets = new List<FixedDistanceSet>()
{
new FixedDistanceSet(chars, charClass, 0)
Expand Down Expand Up @@ -154,22 +155,32 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)

// As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so
// we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading
// set if it's something for which we can vectorize a search).
// set if it's something for which we can search efficiently).
(RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);

// If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support a vectorized
// search and we did successfully find a literal after an atomic loop we could search instead, we prefer the vectorizable search.
// If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support an efficient
// search and we did successfully find a literal after an atomic loop we could search instead, we prefer the efficient search.
// For example, if we have a negated set, we will still prefer the literal-after-an-atomic-loop because negated sets typically
// contain _many_ characters (e.g. [^a] is everything but 'a') and are thus more likely to very quickly match, which means any
// vectorization employed is less likely to kick in and be worth the startup overhead.
if (fixedDistanceSets is not null)
{
// Sort the sets by "quality", such that whatever set is first is the one deemed most efficient to use.
// In some searches, we may use multiple sets, so we want the subsequent ones to also be the efficiency runners-up.
RegexPrefixAnalyzer.SortFixedDistanceSetsByQuality(fixedDistanceSets);
if (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null)

// If there is no literal after the loop, use whatever set we got.
// If there is a literal after the loop, consider it to be better than a negated set and better than a set with many characters.
if (literalAfterLoop is null ||
(fixedDistanceSets[0].Chars is not null && !fixedDistanceSets[0].Negated))
{
// Determine whether to do searching based on one or more sets or on a single literal. Compiled engines
// don't need to special-case literals as they already do codegen to create the optimal lookup based on
// the set's characteristics.
if (!compiled &&
fixedDistanceSets.Count == 1 &&
fixedDistanceSets[0].Chars is { Length: 1 })
fixedDistanceSets[0].Chars is { Length: 1 } &&
!fixedDistanceSets[0].Negated)
{
FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance);
FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight;
Expand All @@ -186,8 +197,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)

// Store the sets, and compute which mode to use.
FixedDistanceSets = fixedDistanceSets;
FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight
: FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ?
FindNextStartingPositionMode.LeadingSet_LeftToRight :
FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
_asciiLookups = new uint[fixedDistanceSets.Count][];
}
return;
Expand Down Expand Up @@ -322,6 +334,7 @@ private static (string String, int Distance)? FindFixedDistanceString(List<Fixed
return best;
}

#if SYSTEM_TEXT_REGULAREXPRESSIONS
/// <summary>Try to advance to the next starting position that might be a location for a match.</summary>
/// <param name="textSpan">The text to search.</param>
/// <param name="pos">The position in <paramref name="textSpan"/>. This is updated with the found position.</param>
Expand Down Expand Up @@ -578,12 +591,11 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
{
FixedDistanceSet primarySet = FixedDistanceSets![0];
char[]? chars = primarySet.Chars;
string set = primarySet.Set;

ReadOnlySpan<char> span = textSpan.Slice(pos);
if (chars is not null)
{
int i = span.IndexOfAny(chars);
int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
if (i >= 0)
{
pos += i;
Expand All @@ -595,7 +607,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
for (int i = 0; i < span.Length; i++)
{
if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
if (RegexCharClass.CharInClass(span[i], primarySet.Set, ref startingAsciiLookup))
{
pos += i;
return true;
Expand Down Expand Up @@ -653,7 +665,8 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
{
int offset = inputPosition + primarySet.Distance;
int index = textSpan.Slice(offset).IndexOfAny(primarySet.Chars);
ReadOnlySpan<char> textSpanAtOffset = textSpan.Slice(offset);
int index = primarySet.Negated ? textSpanAtOffset.IndexOfAnyExcept(primarySet.Chars) : textSpanAtOffset.IndexOfAny(primarySet.Chars);
if (index < 0)
{
break;
Expand Down Expand Up @@ -769,6 +782,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
return true;
}
}
#endif
}

/// <summary>Mode to use for searching for the next location of a possible match.</summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1419,7 +1419,7 @@ public char FirstCharOfOneOrMulti()
/// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
/// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
/// </returns>
public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max efficiently optimized by IndexOfAny today
{
Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");

Expand Down

0 comments on commit 74d69fd

Please sign in to comment.