From 2ab0dbce856a90ad4a23b2a993443abccece80f6 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sun, 12 Jan 2020 15:02:27 -0500 Subject: [PATCH 1/5] Process "multi" strings in compiled regexes 4 or 2 chars at a time when possible When we encounter a series of characters in a regex pattern, we emit a comparison check per character. This updates that codegen to compare 2 or 4 characters at a time, when possible. In general these sequences are not long, but they can easily be a handful of characters, and comparing with ints and longs instead of chars slightly improves both throughput and the size of the IL and JIT'd asm. --- .../src/System.Text.RegularExpressions.csproj | 1 + .../Text/RegularExpressions/RegexCompiler.cs | 95 ++++++++++++++++--- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 3d7bcdc5f1cfc..506aa8f6fef99 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -72,5 +72,6 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 222c68f6c57d6..4e4ff13730b20 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -8,6 +8,8 @@ using System.Globalization; using System.Reflection; using System.Reflection.Emit; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace System.Text.RegularExpressions { @@ -60,6 +62,11 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; + private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_unsafeCharAsByte = typeof(Unsafe).GetMethod("As", new Type[] { Type.MakeGenericMethodParameter(0).MakeByRefType() })!.MakeGenericMethod(typeof(char), typeof(byte)); + private static readonly MethodInfo s_unsafeAddChar = typeof(Unsafe).GetMethod("Add", new Type[] { Type.MakeGenericMethodParameter(0).MakeByRefType(), typeof(int) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_unsafeReadUnalignedInt32 = typeof(Unsafe).GetMethod("ReadUnaligned", new Type[] { typeof(byte).MakeByRefType() })!.MakeGenericMethod(typeof(int)); + private static readonly MethodInfo s_unsafeReadUnalignedInt64 = typeof(Unsafe).GetMethod("ReadUnaligned", new Type[] { typeof(byte).MakeByRefType() })!.MakeGenericMethod(typeof(long)); #if DEBUG private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!; #endif @@ -264,6 +271,9 @@ private void Ldc(int i) } } + /// A macro for _ilg.Emit(OpCodes.Ldc_I8). + private void LdcI8(long i) => _ilg!.Emit(OpCodes.Ldc_I8, i); + /// A macro for _ilg.Emit(OpCodes.Dup). private void Dup() => _ilg!.Emit(OpCodes.Dup); @@ -2134,23 +2144,80 @@ void EmitAnchors(RegexNode node) // Emits the code to handle a multiple-character match. void EmitMultiChar(RegexNode node) { - // if (textSpanPos + node.Str.Length >= textSpan.Length) goto doneLabel; - // if (node.Str[0] != textSpan[textSpanPos]) goto doneLabel; - // if (node.Str[1] != textSpan[textSpanPos+1]) goto doneLabel; - // ... - EmitSpanLengthCheck(node.Str!.Length); - for (int i = 0; i < node.Str!.Length; i++) + ReadOnlySpan s = node.Str; + + // Emit the length check for the whole string. If the generated code gets past this point, + // we know the span is at least textSpanPos + s.Length long. + EmitSpanLengthCheck(s.Length); + + // If we're doing a case-insensitive comparison, we need to lower case each character, + // so we just go character-by-character. But if we're not, we try to process multiple + // characters at a time; this is helpful not only for throughput but also in reducing + // the amount of IL and asm that results from this unrolling. + if (!IsCaseInsensitive(node)) + { + void EmitTextSpanOffsetAsByte() + { + // Unsafe.As(ref Unsafe.Add(ref MemoryMarshal.GetReference(textSpan), textSpanPos)) + Ldloc(textSpanLocal); + Call(s_memoryMarshalGetReference); + if (textSpanPos > 0) + { + Ldc(textSpanPos); + Call(s_unsafeAddChar); + } + Call(s_unsafeCharAsByte); + } + + // TODO https://github.com/dotnet/corefx/issues/39227: + // If/when we implement CompileToAssembly, this code will either need to be special-cased + // to not be used when saving out the assembly, or it'll need to be augmented to emit an + // endianness check into the IL. The code below is creating int/long constants based on + // reading the comparison string at compile time, and the machine doing the compilation + // could be of a different endianness than the machine running the compiled assembly. + + // On 64-bit, process 4 characters at a time until the string isn't at least 4 characters long. + if (IntPtr.Size == 8) + { + const int CharsPerInt64 = 4; + while (s.Length >= CharsPerInt64) + { + // if (Unsafe.ReadUnaligned(ref text) != value) goto doneLabel; + EmitTextSpanOffsetAsByte(); + Call(s_unsafeReadUnalignedInt64); + LdcI8(MemoryMarshal.Read(MemoryMarshal.AsBytes(s))); + BneFar(doneLabel); + textSpanPos += CharsPerInt64; + s = s.Slice(CharsPerInt64); + } + } + + // Of what remains, process 2 characters at a time until the string isn't at least 2 characters long. + const int CharsPerInt32 = 2; + while (s.Length >= CharsPerInt32) + { + // if (Unsafe.ReadUnaligned(ref text) != value) goto doneLabel; + EmitTextSpanOffsetAsByte(); + Call(s_unsafeReadUnalignedInt32); + Ldc(MemoryMarshal.Read(MemoryMarshal.AsBytes(s))); + BneFar(doneLabel); + textSpanPos += CharsPerInt32; + s = s.Slice(CharsPerInt32); + } + } + + // Finally, process all of the remaining characters one by one. + for (int i = 0; i < s.Length; i++) { + // if (s[i] != textSpan[textSpanPos+i]) goto doneLabel; Ldloca(textSpanLocal); - Ldc(textSpanPos + i); + Ldc(textSpanPos++); Call(s_spanGetItemMethod); LdindU2(); if (IsCaseInsensitive(node)) CallToLower(); - Ldc(node.Str[i]); + Ldc(s[i]); BneFar(doneLabel); } - - textSpanPos += node.Str.Length; } // Emits the code to handle a loop (repeater) with a fixed number of iterations. @@ -4244,9 +4311,11 @@ private void EmitCallCharInClass(string charClass, bool caseInsensitive, LocalBu // We use a const string instead of a byte[] / static data property because // it lets IL emit handle all the gory details for us. It also is ok from an // endianness perspective because the compilation happens on the same machine - // that runs the compiled code. If that were to ever change, this would need - // to be revisited. String length is 8 chars == 16 bytes == 128 bits. - string bitVectorString = string.Create(8, (charClass, invariant), (dest, state) => + // that runs the compiled code. + // TODO https://github.com/dotnet/corefx/issues/39227: If that were to ever change, + // this would need to be revisited, such as by doubling the string length, and using + // just the lower byte of the char, e.g. (byte)lookup[x]. + string bitVectorString = string.Create(8, (charClass, invariant), (dest, state) => // String length is 8 chars == 16 bytes == 128 bits. { for (int i = 0; i < 128; i++) { From 74c00da9b36433abd1aa9c7bbf498f30e20ab007 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sun, 12 Jan 2020 21:48:31 -0500 Subject: [PATCH 2/5] Address PR feedback And add a test. --- .../src/System.Text.RegularExpressions.csproj | 1 - .../Text/RegularExpressions/RegexCompiler.cs | 59 ++++++++++--------- .../tests/Regex.Match.Tests.cs | 18 ++++++ 3 files changed, 50 insertions(+), 28 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 506aa8f6fef99..3d7bcdc5f1cfc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -72,6 +72,5 @@ - diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 4e4ff13730b20..58e4ed2d013fe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -63,10 +63,6 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); - private static readonly MethodInfo s_unsafeCharAsByte = typeof(Unsafe).GetMethod("As", new Type[] { Type.MakeGenericMethodParameter(0).MakeByRefType() })!.MakeGenericMethod(typeof(char), typeof(byte)); - private static readonly MethodInfo s_unsafeAddChar = typeof(Unsafe).GetMethod("Add", new Type[] { Type.MakeGenericMethodParameter(0).MakeByRefType(), typeof(int) })!.MakeGenericMethod(typeof(char)); - private static readonly MethodInfo s_unsafeReadUnalignedInt32 = typeof(Unsafe).GetMethod("ReadUnaligned", new Type[] { typeof(byte).MakeByRefType() })!.MakeGenericMethod(typeof(int)); - private static readonly MethodInfo s_unsafeReadUnalignedInt64 = typeof(Unsafe).GetMethod("ReadUnaligned", new Type[] { typeof(byte).MakeByRefType() })!.MakeGenericMethod(typeof(long)); #if DEBUG private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!; #endif @@ -328,6 +324,15 @@ private void Ldc(int i) /// A macro for _ilg.Emit(OpCodes.Ldind_U2). private void LdindU2() => _ilg!.Emit(OpCodes.Ldind_U2); + /// A macro for _ilg.Emit(OpCodes.Ldind_I4). + private void LdindI4() => _ilg!.Emit(OpCodes.Ldind_I4); + + /// A macro for _ilg.Emit(OpCodes.Ldind_I8). + private void LdindI8() => _ilg!.Emit(OpCodes.Ldind_I8); + + /// A macro for _ilg.Emit(OpCodes.Unaligned). + private void Unaligned(byte alignment) => _ilg!.Emit(OpCodes.Unaligned, alignment); + /// A macro for _ilg.Emit(OpCodes.Stloc_S). private void Stloc(LocalBuilder lt) => _ilg!.Emit(OpCodes.Stloc_S, lt); @@ -1718,6 +1723,18 @@ void EmitSpanLengthCheck(int requiredLength, LocalBuilder? dynamicRequiredLength BgeUnFar(doneLabel); } + // Emits code to get ref textSpan[textSpanPos] + void EmitTextSpanOffset() + { + Ldloc(textSpanLocal); + Call(s_memoryMarshalGetReference); + if (textSpanPos > 0) + { + Ldc(textSpanPos * sizeof(char)); + Add(); + } + } + void TransferTextSpanPosToRunTextPos() { if (textSpanPos > 0) @@ -2156,19 +2173,6 @@ void EmitMultiChar(RegexNode node) // the amount of IL and asm that results from this unrolling. if (!IsCaseInsensitive(node)) { - void EmitTextSpanOffsetAsByte() - { - // Unsafe.As(ref Unsafe.Add(ref MemoryMarshal.GetReference(textSpan), textSpanPos)) - Ldloc(textSpanLocal); - Call(s_memoryMarshalGetReference); - if (textSpanPos > 0) - { - Ldc(textSpanPos); - Call(s_unsafeAddChar); - } - Call(s_unsafeCharAsByte); - } - // TODO https://github.com/dotnet/corefx/issues/39227: // If/when we implement CompileToAssembly, this code will either need to be special-cased // to not be used when saving out the assembly, or it'll need to be augmented to emit an @@ -2182,9 +2186,10 @@ void EmitTextSpanOffsetAsByte() const int CharsPerInt64 = 4; while (s.Length >= CharsPerInt64) { - // if (Unsafe.ReadUnaligned(ref text) != value) goto doneLabel; - EmitTextSpanOffsetAsByte(); - Call(s_unsafeReadUnalignedInt64); + // if (Unsafe.ReadUnaligned(ref Unsafe.Add(ref MemoryMarshal.GetReference(textSpan), textSpanPos)) != value) goto doneLabel; + EmitTextSpanOffset(); + Unaligned(1); + LdindI8(); LdcI8(MemoryMarshal.Read(MemoryMarshal.AsBytes(s))); BneFar(doneLabel); textSpanPos += CharsPerInt64; @@ -2196,9 +2201,10 @@ void EmitTextSpanOffsetAsByte() const int CharsPerInt32 = 2; while (s.Length >= CharsPerInt32) { - // if (Unsafe.ReadUnaligned(ref text) != value) goto doneLabel; - EmitTextSpanOffsetAsByte(); - Call(s_unsafeReadUnalignedInt32); + // if (Unsafe.ReadUnaligned(ref Unsafe.Add(ref MemoryMarshal.GetReference(textSpan), textSpanPos)) != value) goto doneLabel; + EmitTextSpanOffset(); + Unaligned(1); + LdindI4(); Ldc(MemoryMarshal.Read(MemoryMarshal.AsBytes(s))); BneFar(doneLabel); textSpanPos += CharsPerInt32; @@ -2209,10 +2215,9 @@ void EmitTextSpanOffsetAsByte() // Finally, process all of the remaining characters one by one. for (int i = 0; i < s.Length; i++) { - // if (s[i] != textSpan[textSpanPos+i]) goto doneLabel; - Ldloca(textSpanLocal); - Ldc(textSpanPos++); - Call(s_spanGetItemMethod); + // if (s[i] != textSpan[textSpanPos++]) goto doneLabel; + EmitTextSpanOffset(); + textSpanPos++; LdindU2(); if (IsCaseInsensitive(node)) CallToLower(); Ldc(s[i]); diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 6566129ed7f92..689aa64a91425 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -377,6 +377,24 @@ public void Match(string pattern, string input, RegexOptions options, int beginn VerifyMatch(new Regex(pattern, options).Match(input, beginning, length), expectedSuccess, expectedValue); } + [Fact] + public void Match_VaryingLengthStrings() + { + for (int length = 2; length < 32; length++) + { + for (int prechars = 0; prechars < 4; prechars++) + { + var r = new Random(42); + string pattern = string.Concat(Enumerable.Range(0, length).Select(i => (char)r.Next(0, 1 + char.MaxValue))); + string input = new string('#', prechars) + pattern; + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled }) + { + Match(pattern, input, options, prechars, length, expectedSuccess: true, expectedValue: pattern); + } + } + } + } + private static void VerifyMatch(Match match, bool expectedSuccess, string expectedValue) { Assert.Equal(expectedSuccess, match.Success); From 7af70b03eb620e10069011e259c41099dca4925d Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 13 Jan 2020 10:29:19 -0500 Subject: [PATCH 3/5] Use StartsWith for long strings in EmitMultiChar Avoids potentially very long generated Go methods and the resulting long JIT times and potentially stack overflows at invocation time. Prior to this change, a test added for 100K-long string takes a very long time to run and then stack overflows, whereas with this change it's very fast. --- .../Text/RegularExpressions/RegexCompiler.cs | 73 ++++++++++++++----- .../tests/Regex.Match.Tests.cs | 15 ++-- .../tests/RegexCultureTests.cs | 10 ++- 3 files changed, 67 insertions(+), 31 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 58e4ed2d013fe..8136208c0e7f9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -47,25 +47,28 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_dumpStateM = RegexRunnerMethod("DumpState"); #endif - private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!; - private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!; private static readonly MethodInfo s_charIsDigitMethod = typeof(char).GetMethod("IsDigit", new Type[] { typeof(char) })!; private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!; - private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; - private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; - private static readonly MethodInfo s_stringIndexOf = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!; + private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!; + private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!; + private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; +#if DEBUG + private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!; +#endif + private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan).GetMethod("get_Item", new Type[] { typeof(int) })!; + private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan).GetMethod("get_Length")!; + private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOf = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); - private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan).GetMethod("get_Item", new Type[] { typeof(int) })!; - private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan).GetMethod("get_Length")!; private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; - private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; - private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); -#if DEBUG - private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!; -#endif + private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanStartsWithComparison = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan), typeof(ReadOnlySpan), typeof(StringComparison) })!; + private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!; + private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; + private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; + private static readonly MethodInfo s_stringIndexOf = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!; protected ILGenerator? _ilg; @@ -1322,7 +1325,7 @@ protected void GenerateFindFirstChar() Ldloc(_runtextendLocal); Ldloc(_runtextposLocal); Sub(); - Call(s_stringAsSpanMethod); + Call(s_stringAsSpanIntIntMethod); Ldc(setChars[0]); Ldc(setChars[1]); if (setCharsCount == 3) @@ -1376,7 +1379,7 @@ protected void GenerateFindFirstChar() Ldloc(_runtextendLocal); Ldloc(_runtextposLocal); Sub(); - Call(s_stringAsSpanMethod); + Call(s_stringAsSpanIntIntMethod); Stloc(textSpanLocal); // for (int i = 0; @@ -1660,7 +1663,7 @@ void LoadTextSpanLocal() Ldthisfld(s_runtextendField); Ldloc(runtextposLocal); Sub(); - Call(s_stringAsSpanMethod); + Call(s_stringAsSpanIntIntMethod); Stloc(textSpanLocal); } @@ -2161,17 +2164,51 @@ void EmitAnchors(RegexNode node) // Emits the code to handle a multiple-character match. void EmitMultiChar(RegexNode node) { - ReadOnlySpan s = node.Str; + bool caseInsensitive = IsCaseInsensitive(node); + + // If the multi string's length exceeds the maximum length we want to unroll, instead generate a call to StartsWith. + // Each character that we unroll results in code generation that increases the size of both the IL and the resulting asm, + // and with a large enough string, that can cause significant overhead as well as even risk stack overflow due to + // having an obscenely long method. Such long string lengths in a pattern are generally quite rare. However, we also + // want to unroll for shorter strings, because the overhead of invoking StartsWith instead of doing a few simple + // inline comparisons is very measurable, especially if we're doing a culture-sensitive comparison and StartsWith + // accesses CultureInfo.CurrentCulture on each call. We need to be cognizant not only of the cost if the whole + // string matches, but also the cost when the comparison fails early on, and thus we pay for the call overhead + // but don't reap the benefits of all the vectorization StartsWith can do. + const int MaxUnrollLength = 64; + if (node.Str!.Length > MaxUnrollLength) + { + // if (!textSpan.Slice(textSpanPos).StartsWith("..."[, StringComparison.XxIgnoreCase])) goto doneLabel; + Ldloca(textSpanLocal); + Ldc(textSpanPos); + Call(s_spanSliceIntMethod); + Ldstr(node.Str); + Call(s_stringAsSpanMethod); + if (!caseInsensitive) + { + Call(s_spanStartsWith); + } + else + { + Ldc((int)(UseToLowerInvariant ? StringComparison.InvariantCultureIgnoreCase : StringComparison.CurrentCultureIgnoreCase)); + Call(s_spanStartsWithComparison); + } + BrfalseFar(doneLabel); + textSpanPos += node.Str.Length; + + return; + } // Emit the length check for the whole string. If the generated code gets past this point, // we know the span is at least textSpanPos + s.Length long. + ReadOnlySpan s = node.Str; EmitSpanLengthCheck(s.Length); // If we're doing a case-insensitive comparison, we need to lower case each character, // so we just go character-by-character. But if we're not, we try to process multiple // characters at a time; this is helpful not only for throughput but also in reducing // the amount of IL and asm that results from this unrolling. - if (!IsCaseInsensitive(node)) + if (!caseInsensitive) { // TODO https://github.com/dotnet/corefx/issues/39227: // If/when we implement CompileToAssembly, this code will either need to be special-cased @@ -2219,7 +2256,7 @@ void EmitMultiChar(RegexNode node) EmitTextSpanOffset(); textSpanPos++; LdindU2(); - if (IsCaseInsensitive(node)) CallToLower(); + if (caseInsensitive) CallToLower(); Ldc(s[i]); BneFar(doneLabel); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 689aa64a91425..978d5d51cfd40 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -380,17 +380,14 @@ public void Match(string pattern, string input, RegexOptions options, int beginn [Fact] public void Match_VaryingLengthStrings() { - for (int length = 2; length < 32; length++) + foreach (int length in new[] { 2, 3, 4, 5, 6, 7, 8, 9, 31, 32, 33, 63, 64, 65, 100_000 }) { - for (int prechars = 0; prechars < 4; prechars++) + string text = string.Concat(Enumerable.Range(0, length).Select(i => (char)('A' + (i % 26)))); + string pattern = "[123]" + text; + string input = "2" + text; + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled, RegexOptions.Compiled | RegexOptions.IgnoreCase, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant }) { - var r = new Random(42); - string pattern = string.Concat(Enumerable.Range(0, length).Select(i => (char)r.Next(0, 1 + char.MaxValue))); - string input = new string('#', prechars) + pattern; - foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled }) - { - Match(pattern, input, options, prechars, length, expectedSuccess: true, expectedValue: pattern); - } + Match(pattern, input, options, 0, input.Length, expectedSuccess: true, expectedValue: input); } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs index 253bc098d6c7c..6ee819dfba0d4 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs @@ -3,8 +3,8 @@ // See the LICENSE file in the project root for more information. using System.Globalization; +using System.Linq; using System.Tests; -using Microsoft.DotNet.RemoteExecutor; using Xunit; namespace System.Text.RegularExpressions.Tests @@ -14,11 +14,13 @@ public class RegexCultureTests /// /// See https://en.wikipedia.org/wiki/Dotted_and_dotless_I /// - [Fact] - public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture() + [Theory] + [InlineData(2)] + [InlineData(256)] + public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture(int length) { var turkish = new CultureInfo("tr-TR"); - string input = "I\u0131\u0130i"; + string input = string.Concat(Enumerable.Repeat("I\u0131\u0130i", length / 2)); Regex[] cultInvariantRegex = Create(input, CultureInfo.InvariantCulture, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant); Regex[] turkishRegex = Create(input, turkish, RegexOptions.IgnoreCase); From 155135a76fe1c6664147fe7d75e69ffe2f970ebb Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 13 Jan 2020 13:51:01 -0500 Subject: [PATCH 4/5] Address PR feedback --- .../Text/RegularExpressions/RegexCompiler.cs | 17 ++++----------- .../tests/Regex.Match.Tests.cs | 21 ++++++++++++------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 8136208c0e7f9..125a155140d05 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -64,7 +64,6 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); - private static readonly MethodInfo s_spanStartsWithComparison = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan), typeof(ReadOnlySpan), typeof(StringComparison) })!; private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!; private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; @@ -2176,26 +2175,18 @@ void EmitMultiChar(RegexNode node) // string matches, but also the cost when the comparison fails early on, and thus we pay for the call overhead // but don't reap the benefits of all the vectorization StartsWith can do. const int MaxUnrollLength = 64; - if (node.Str!.Length > MaxUnrollLength) + if (!caseInsensitive && // StartsWith(..., XxIgnoreCase) won't necessarily be the same as char-by-char comparison + node.Str!.Length > MaxUnrollLength) { - // if (!textSpan.Slice(textSpanPos).StartsWith("..."[, StringComparison.XxIgnoreCase])) goto doneLabel; + // if (!textSpan.Slice(textSpanPos).StartsWith("...") goto doneLabel; Ldloca(textSpanLocal); Ldc(textSpanPos); Call(s_spanSliceIntMethod); Ldstr(node.Str); Call(s_stringAsSpanMethod); - if (!caseInsensitive) - { - Call(s_spanStartsWith); - } - else - { - Ldc((int)(UseToLowerInvariant ? StringComparison.InvariantCultureIgnoreCase : StringComparison.CurrentCultureIgnoreCase)); - Call(s_spanStartsWithComparison); - } + Call(s_spanStartsWith); BrfalseFar(doneLabel); textSpanPos += node.Str.Length; - return; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 978d5d51cfd40..1858f57cb728a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -377,18 +377,25 @@ public void Match(string pattern, string input, RegexOptions options, int beginn VerifyMatch(new Regex(pattern, options).Match(input, beginning, length), expectedSuccess, expectedValue); } - [Fact] - public void Match_VaryingLengthStrings() + [Theory] + [InlineData(RegexOptions.None)] + [InlineData(RegexOptions.Compiled)] + [InlineData(RegexOptions.Compiled | RegexOptions.IgnoreCase)] + [InlineData(RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)] + public void Match_VaryingLengthStrings(RegexOptions options) { - foreach (int length in new[] { 2, 3, 4, 5, 6, 7, 8, 9, 31, 32, 33, 63, 64, 65, 100_000 }) + var lengths = new List() { 2, 3, 4, 5, 6, 7, 8, 9, 31, 32, 33, 63, 64, 65 }; + if ((options & RegexOptions.IgnoreCase) == 0) + { + lengths.Add(100_000); // currently produces too large a compiled method for case-insensitive + } + + foreach (int length in lengths) { string text = string.Concat(Enumerable.Range(0, length).Select(i => (char)('A' + (i % 26)))); string pattern = "[123]" + text; string input = "2" + text; - foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled, RegexOptions.Compiled | RegexOptions.IgnoreCase, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant }) - { - Match(pattern, input, options, 0, input.Length, expectedSuccess: true, expectedValue: input); - } + Match(pattern, input, options, 0, input.Length, expectedSuccess: true, expectedValue: input); } } From 360c9c85c925c1834d4ec122023b4c45063793e5 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 13 Jan 2020 16:35:33 -0500 Subject: [PATCH 5/5] Tweak test per PR feedback --- .../tests/Regex.Match.Tests.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 1858f57cb728a..bbbdd5ed0c83c 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -390,11 +390,11 @@ public void Match_VaryingLengthStrings(RegexOptions options) lengths.Add(100_000); // currently produces too large a compiled method for case-insensitive } + bool caseInsensitive = (options & RegexOptions.IgnoreCase) != 0; foreach (int length in lengths) { - string text = string.Concat(Enumerable.Range(0, length).Select(i => (char)('A' + (i % 26)))); - string pattern = "[123]" + text; - string input = "2" + text; + string pattern = "[123]" + string.Concat(Enumerable.Range(0, length).Select(i => (char)('A' + (i % 26)))); + string input = "2" + string.Concat(Enumerable.Range(0, length).Select(i => (char)((caseInsensitive ? 'a' : 'A') + (i % 26)))); Match(pattern, input, options, 0, input.Length, expectedSuccess: true, expectedValue: input); } }