Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Process "multi" strings in compiled regexes 4 or 2 chars at a time when possible #1654

Merged
merged 5 commits into from
Jan 14, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
using System.Globalization;
using System.Reflection;
using System.Reflection.Emit;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

namespace System.Text.RegularExpressions
{
Expand Down Expand Up @@ -45,24 +47,27 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_dumpStateM = RegexRunnerMethod("DumpState");
#endif

private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!;
private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!;
private static readonly MethodInfo s_charIsDigitMethod = typeof(char).GetMethod("IsDigit", new Type[] { typeof(char) })!;
private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!;
private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!;
private static readonly MethodInfo s_stringIndexOf = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!;
private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!;
private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!;
private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!;
#if DEBUG
private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!;
#endif
private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Item", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Length")!;
private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOf = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Item", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Length")!;
private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!;
private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!;
#if DEBUG
private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!;
#endif
private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!;
private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!;
private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_stringIndexOf = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!;

protected ILGenerator? _ilg;

Expand Down Expand Up @@ -264,6 +269,9 @@ private void Ldc(int i)
}
}

/// <summary>A macro for _ilg.Emit(OpCodes.Ldc_I8).</summary>
/// <param name="i">The 64-bit constant passed as the operand of the emitted Ldc_I8 instruction.</param>
private void LdcI8(long i) => _ilg!.Emit(OpCodes.Ldc_I8, i);
stephentoub marked this conversation as resolved.
Show resolved Hide resolved

/// <summary>A macro for _ilg.Emit(OpCodes.Dup).</summary>
private void Dup() => _ilg!.Emit(OpCodes.Dup);

Expand Down Expand Up @@ -318,6 +326,15 @@ private void Ldc(int i)
/// <summary>A macro for _ilg.Emit(OpCodes.Ldind_U2).</summary>
private void LdindU2() => _ilg!.Emit(OpCodes.Ldind_U2);

/// <summary>A macro for _ilg.Emit(OpCodes.Ldind_I4).</summary>
/// <remarks>Takes no operand; the emitted instruction loads a 32-bit value indirectly through the address on top of the evaluation stack.</remarks>
private void LdindI4() => _ilg!.Emit(OpCodes.Ldind_I4);

/// <summary>A macro for _ilg.Emit(OpCodes.Ldind_I8).</summary>
/// <remarks>Takes no operand; the emitted instruction loads a 64-bit value indirectly through the address on top of the evaluation stack.</remarks>
private void LdindI8() => _ilg!.Emit(OpCodes.Ldind_I8);

/// <summary>A macro for _ilg.Emit(OpCodes.Unaligned).</summary>
/// <param name="alignment">The alignment value passed as the operand of the emitted Unaligned prefix.</param>
/// <remarks>This emits only the prefix; the caller is responsible for emitting the indirect load that it applies to immediately afterwards.</remarks>
private void Unaligned(byte alignment) => _ilg!.Emit(OpCodes.Unaligned, alignment);

/// <summary>A macro for _ilg.Emit(OpCodes.Stloc_S).</summary>
private void Stloc(LocalBuilder lt) => _ilg!.Emit(OpCodes.Stloc_S, lt);

Expand Down Expand Up @@ -1307,7 +1324,7 @@ protected void GenerateFindFirstChar()
Ldloc(_runtextendLocal);
Ldloc(_runtextposLocal);
Sub();
Call(s_stringAsSpanMethod);
Call(s_stringAsSpanIntIntMethod);
Ldc(setChars[0]);
Ldc(setChars[1]);
if (setCharsCount == 3)
Expand Down Expand Up @@ -1361,7 +1378,7 @@ protected void GenerateFindFirstChar()
Ldloc(_runtextendLocal);
Ldloc(_runtextposLocal);
Sub();
Call(s_stringAsSpanMethod);
Call(s_stringAsSpanIntIntMethod);
Stloc(textSpanLocal);

// for (int i = 0;
Expand Down Expand Up @@ -1645,7 +1662,7 @@ void LoadTextSpanLocal()
Ldthisfld(s_runtextendField);
Ldloc(runtextposLocal);
Sub();
Call(s_stringAsSpanMethod);
Call(s_stringAsSpanIntIntMethod);
Stloc(textSpanLocal);
}

Expand Down Expand Up @@ -1708,6 +1725,18 @@ void EmitSpanLengthCheck(int requiredLength, LocalBuilder? dynamicRequiredLength
BgeUnFar(doneLabel);
}

// Emits code to get ref textSpan[textSpanPos]
// The emitted IL loads textSpanLocal, calls MemoryMarshal.GetReference on it to obtain a
// reference to the span's first char, and then, only when textSpanPos is non-zero, adds the
// offset to that reference. The result is left on the evaluation stack for the caller to
// dereference (e.g. with one of the Ldind macros).
// No bounds check is emitted here; the caller must already have verified the span length.
void EmitTextSpanOffset()
{
Ldloc(textSpanLocal);
Call(s_memoryMarshalGetReference);
if (textSpanPos > 0)
{
// The offset is added to a raw reference, so it is expressed in bytes rather than chars.
Ldc(textSpanPos * sizeof(char));
Add();
}
}

void TransferTextSpanPosToRunTextPos()
{
if (textSpanPos > 0)
Expand Down Expand Up @@ -2134,23 +2163,94 @@ void EmitAnchors(RegexNode node)
// Emits the code to handle a multiple-character match.
void EmitMultiChar(RegexNode node)
{
// if (textSpanPos + node.Str.Length >= textSpan.Length) goto doneLabel;
// if (node.Str[0] != textSpan[textSpanPos]) goto doneLabel;
// if (node.Str[1] != textSpan[textSpanPos+1]) goto doneLabel;
// ...
EmitSpanLengthCheck(node.Str!.Length);
for (int i = 0; i < node.Str!.Length; i++)
bool caseInsensitive = IsCaseInsensitive(node);

// If the multi string's length exceeds the maximum length we want to unroll, instead generate a call to StartsWith.
// Each character that we unroll results in code generation that increases the size of both the IL and the resulting asm,
// and with a large enough string, that can cause significant overhead as well as even risk stack overflow due to
// having an obscenely long method. Such long string lengths in a pattern are generally quite rare. However, we also
// want to unroll for shorter strings, because the overhead of invoking StartsWith instead of doing a few simple
// inline comparisons is very measurable, especially if we're doing a culture-sensitive comparison and StartsWith
// accesses CultureInfo.CurrentCulture on each call. We need to be cognizant not only of the cost if the whole
// string matches, but also the cost when the comparison fails early on, and thus we pay for the call overhead
// but don't reap the benefits of all the vectorization StartsWith can do.
const int MaxUnrollLength = 64;
if (!caseInsensitive && // StartsWith(..., XxIgnoreCase) won't necessarily be the same as char-by-char comparison
node.Str!.Length > MaxUnrollLength)
{
// if (!textSpan.Slice(textSpanPos).StartsWith("...")) goto doneLabel;
Ldloca(textSpanLocal);
Ldc(textSpanPos + i);
Call(s_spanGetItemMethod);
Ldc(textSpanPos);
Call(s_spanSliceIntMethod);
Ldstr(node.Str);
Call(s_stringAsSpanMethod);
Call(s_spanStartsWith);
BrfalseFar(doneLabel);
textSpanPos += node.Str.Length;
return;
}

// Emit the length check for the whole string. If the generated code gets past this point,
// we know the span is at least textSpanPos + s.Length long.
ReadOnlySpan<char> s = node.Str;
EmitSpanLengthCheck(s.Length);

// If we're doing a case-insensitive comparison, we need to lower case each character,
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
// so we just go character-by-character. But if we're not, we try to process multiple
// characters at a time; this is helpful not only for throughput but also in reducing
// the amount of IL and asm that results from this unrolling.
if (!caseInsensitive)
{
// TODO https://github.com/dotnet/corefx/issues/39227:
// If/when we implement CompileToAssembly, this code will either need to be special-cased
// to not be used when saving out the assembly, or it'll need to be augmented to emit an
// endianness check into the IL. The code below is creating int/long constants based on
// reading the comparison string at compile time, and the machine doing the compilation
// could be of a different endianness than the machine running the compiled assembly.

// On 64-bit, process 4 characters at a time until the string isn't at least 4 characters long.
if (IntPtr.Size == 8)
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
{
const int CharsPerInt64 = 4;
while (s.Length >= CharsPerInt64)
{
// if (Unsafe.ReadUnaligned<long>(ref Unsafe.Add(ref MemoryMarshal.GetReference(textSpan), textSpanPos)) != value) goto doneLabel;
EmitTextSpanOffset();
Unaligned(1);
LdindI8();
LdcI8(MemoryMarshal.Read<long>(MemoryMarshal.AsBytes(s)));
BneFar(doneLabel);
textSpanPos += CharsPerInt64;
s = s.Slice(CharsPerInt64);
}
}

// Of what remains, process 2 characters at a time until the string isn't at least 2 characters long.
const int CharsPerInt32 = 2;
while (s.Length >= CharsPerInt32)
{
// if (Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref MemoryMarshal.GetReference(textSpan), textSpanPos)) != value) goto doneLabel;
EmitTextSpanOffset();
Unaligned(1);
LdindI4();
Ldc(MemoryMarshal.Read<int>(MemoryMarshal.AsBytes(s)));
BneFar(doneLabel);
textSpanPos += CharsPerInt32;
s = s.Slice(CharsPerInt32);
}
}

// Finally, process all of the remaining characters one by one.
for (int i = 0; i < s.Length; i++)
{
// if (s[i] != textSpan[textSpanPos++]) goto doneLabel;
EmitTextSpanOffset();
textSpanPos++;
LdindU2();
if (IsCaseInsensitive(node)) CallToLower();
Ldc(node.Str[i]);
if (caseInsensitive) CallToLower();
Ldc(s[i]);
BneFar(doneLabel);
}

textSpanPos += node.Str.Length;
}

// Emits the code to handle a loop (repeater) with a fixed number of iterations.
Expand Down Expand Up @@ -4244,9 +4344,11 @@ private void EmitCallCharInClass(string charClass, bool caseInsensitive, LocalBu
// We use a const string instead of a byte[] / static data property because
// it lets IL emit handle all the gory details for us. It also is ok from an
// endianness perspective because the compilation happens on the same machine
// that runs the compiled code. If that were to ever change, this would need
// to be revisited. String length is 8 chars == 16 bytes == 128 bits.
string bitVectorString = string.Create(8, (charClass, invariant), (dest, state) =>
// that runs the compiled code.
// TODO https://github.com/dotnet/corefx/issues/39227: If that were to ever change,
// this would need to be revisited, such as by doubling the string length, and using
// just the lower byte of the char, e.g. (byte)lookup[x].
string bitVectorString = string.Create(8, (charClass, invariant), (dest, state) => // String length is 8 chars == 16 bytes == 128 bits.
{
for (int i = 0; i < 128; i++)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,28 @@ public void Match(string pattern, string input, RegexOptions options, int beginn
VerifyMatch(new Regex(pattern, options).Match(input, beginning, length), expectedSuccess, expectedValue);
}

[Theory]
[InlineData(RegexOptions.None)]
[InlineData(RegexOptions.Compiled)]
[InlineData(RegexOptions.Compiled | RegexOptions.IgnoreCase)]
[InlineData(RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
public void Match_VaryingLengthStrings(RegexOptions options)
{
var lengths = new List<int>() { 2, 3, 4, 5, 6, 7, 8, 9, 31, 32, 33, 63, 64, 65 };
if ((options & RegexOptions.IgnoreCase) == 0)
{
lengths.Add(100_000); // currently produces too large a compiled method for case-insensitive
}

foreach (int length in lengths)
{
string text = string.Concat(Enumerable.Range(0, length).Select(i => (char)('A' + (i % 26))));
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
string pattern = "[123]" + text;
string input = "2" + text;
Match(pattern, input, options, 0, input.Length, expectedSuccess: true, expectedValue: input);
}
}

private static void VerifyMatch(Match match, bool expectedSuccess, string expectedValue)
{
Assert.Equal(expectedSuccess, match.Success);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
// See the LICENSE file in the project root for more information.

using System.Globalization;
using System.Linq;
using System.Tests;
using Microsoft.DotNet.RemoteExecutor;
using Xunit;

namespace System.Text.RegularExpressions.Tests
Expand All @@ -14,11 +14,13 @@ public class RegexCultureTests
/// <summary>
/// See https://en.wikipedia.org/wiki/Dotted_and_dotless_I
/// </summary>
[Fact]
public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture()
[Theory]
[InlineData(2)]
[InlineData(256)]
public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture(int length)
{
var turkish = new CultureInfo("tr-TR");
string input = "I\u0131\u0130i";
string input = string.Concat(Enumerable.Repeat("I\u0131\u0130i", length / 2));

Regex[] cultInvariantRegex = Create(input, CultureInfo.InvariantCulture, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
Regex[] turkishRegex = Create(input, turkish, RegexOptions.IgnoreCase);
Expand Down