diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 222c68f6c57d6..125a155140d05 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -8,6 +8,8 @@ using System.Globalization; using System.Reflection; using System.Reflection.Emit; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace System.Text.RegularExpressions { @@ -45,24 +47,27 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_dumpStateM = RegexRunnerMethod("DumpState"); #endif - private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!; - private static readonly MethodInfo s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!; private static readonly MethodInfo s_charIsDigitMethod = typeof(char).GetMethod("IsDigit", new Type[] { typeof(char) })!; private static readonly MethodInfo s_charIsWhiteSpaceMethod = typeof(char).GetMethod("IsWhiteSpace", new Type[] { typeof(char) })!; - private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; - private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; - private static readonly MethodInfo s_stringIndexOf = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!; + private static readonly MethodInfo s_charToLowerMethod = typeof(char).GetMethod("ToLower", new Type[] { typeof(char), typeof(CultureInfo) })!; + private static readonly MethodInfo 
s_charToLowerInvariantMethod = typeof(char).GetMethod("ToLowerInvariant", new Type[] { typeof(char) })!; + private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; +#if DEBUG + private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!; +#endif + private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan).GetMethod("get_Item", new Type[] { typeof(int) })!; + private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan).GetMethod("get_Length")!; + private static readonly MethodInfo s_memoryMarshalGetReference = typeof(MemoryMarshal).GetMethod("GetReference", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOf = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); - private static readonly MethodInfo s_spanGetItemMethod = typeof(ReadOnlySpan).GetMethod("get_Item", new Type[] { typeof(int) })!; - private static readonly MethodInfo s_spanGetLengthMethod = 
typeof(ReadOnlySpan).GetMethod("get_Length")!; private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; - private static readonly MethodInfo s_cultureInfoGetCurrentCultureMethod = typeof(CultureInfo).GetMethod("get_CurrentCulture")!; -#if DEBUG - private static readonly MethodInfo s_debugWriteLine = typeof(Debug).GetMethod("WriteLine", new Type[] { typeof(string) })!; -#endif + private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!; + private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; + private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; + private static readonly MethodInfo s_stringIndexOf = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!; protected ILGenerator? _ilg; @@ -264,6 +269,9 @@ private void Ldc(int i) } } + /// A macro for _ilg.Emit(OpCodes.Ldc_I8). + private void LdcI8(long i) => _ilg!.Emit(OpCodes.Ldc_I8, i); + /// A macro for _ilg.Emit(OpCodes.Dup). private void Dup() => _ilg!.Emit(OpCodes.Dup); @@ -318,6 +326,15 @@ private void Ldc(int i) /// A macro for _ilg.Emit(OpCodes.Ldind_U2). private void LdindU2() => _ilg!.Emit(OpCodes.Ldind_U2); + /// A macro for _ilg.Emit(OpCodes.Ldind_I4). 
+ private void LdindI4() => _ilg!.Emit(OpCodes.Ldind_I4); + + /// A macro for _ilg.Emit(OpCodes.Ldind_I8). + private void LdindI8() => _ilg!.Emit(OpCodes.Ldind_I8); + + /// A macro for _ilg.Emit(OpCodes.Unaligned). + private void Unaligned(byte alignment) => _ilg!.Emit(OpCodes.Unaligned, alignment); + /// A macro for _ilg.Emit(OpCodes.Stloc_S). private void Stloc(LocalBuilder lt) => _ilg!.Emit(OpCodes.Stloc_S, lt); @@ -1307,7 +1324,7 @@ protected void GenerateFindFirstChar() Ldloc(_runtextendLocal); Ldloc(_runtextposLocal); Sub(); - Call(s_stringAsSpanMethod); + Call(s_stringAsSpanIntIntMethod); Ldc(setChars[0]); Ldc(setChars[1]); if (setCharsCount == 3) @@ -1361,7 +1378,7 @@ protected void GenerateFindFirstChar() Ldloc(_runtextendLocal); Ldloc(_runtextposLocal); Sub(); - Call(s_stringAsSpanMethod); + Call(s_stringAsSpanIntIntMethod); Stloc(textSpanLocal); // for (int i = 0; @@ -1645,7 +1662,7 @@ void LoadTextSpanLocal() Ldthisfld(s_runtextendField); Ldloc(runtextposLocal); Sub(); - Call(s_stringAsSpanMethod); + Call(s_stringAsSpanIntIntMethod); Stloc(textSpanLocal); } @@ -1708,6 +1725,18 @@ void EmitSpanLengthCheck(int requiredLength, LocalBuilder? dynamicRequiredLength BgeUnFar(doneLabel); } + // Emits code to get ref textSpan[textSpanPos] + void EmitTextSpanOffset() + { + Ldloc(textSpanLocal); + Call(s_memoryMarshalGetReference); + if (textSpanPos > 0) + { + Ldc(textSpanPos * sizeof(char)); + Add(); + } + } + void TransferTextSpanPosToRunTextPos() { if (textSpanPos > 0) @@ -2134,23 +2163,94 @@ void EmitAnchors(RegexNode node) // Emits the code to handle a multiple-character match. void EmitMultiChar(RegexNode node) { - // if (textSpanPos + node.Str.Length >= textSpan.Length) goto doneLabel; - // if (node.Str[0] != textSpan[textSpanPos]) goto doneLabel; - // if (node.Str[1] != textSpan[textSpanPos+1]) goto doneLabel; - // ... 
- EmitSpanLengthCheck(node.Str!.Length); - for (int i = 0; i < node.Str!.Length; i++) + bool caseInsensitive = IsCaseInsensitive(node); + + // If the multi string's length exceeds the maximum length we want to unroll, instead generate a call to StartsWith. + // Each character that we unroll results in code generation that increases the size of both the IL and the resulting asm, + // and with a large enough string, that can cause significant overhead as well as even risk stack overflow due to + // having an obscenely long method. Such long string lengths in a pattern are generally quite rare. However, we also + // want to unroll for shorter strings, because the overhead of invoking StartsWith instead of doing a few simple + // inline comparisons is very measurable, especially if we're doing a culture-sensitive comparison and StartsWith + // accesses CultureInfo.CurrentCulture on each call. We need to be cognizant not only of the cost if the whole + // string matches, but also the cost when the comparison fails early on, and thus we pay for the call overhead + // but don't reap the benefits of all the vectorization StartsWith can do. + const int MaxUnrollLength = 64; + if (!caseInsensitive && // StartsWith(..., XxIgnoreCase) won't necessarily be the same as char-by-char comparison + node.Str!.Length > MaxUnrollLength) { + // if (!textSpan.Slice(textSpanPos).StartsWith("...")) goto doneLabel; Ldloca(textSpanLocal); - Ldc(textSpanPos + i); - Call(s_spanGetItemMethod); + Ldc(textSpanPos); + Call(s_spanSliceIntMethod); + Ldstr(node.Str); + Call(s_stringAsSpanMethod); + Call(s_spanStartsWith); + BrfalseFar(doneLabel); + textSpanPos += node.Str.Length; + return; + } + + // Emit the length check for the whole string. If the generated code gets past this point, + // we know the span is at least textSpanPos + s.Length long. 
+ ReadOnlySpan s = node.Str; + EmitSpanLengthCheck(s.Length); + + // If we're doing a case-insensitive comparison, we need to lower case each character, + // so we just go character-by-character. But if we're not, we try to process multiple + // characters at a time; this is helpful not only for throughput but also in reducing + // the amount of IL and asm that results from this unrolling. + if (!caseInsensitive) + { + // TODO https://github.com/dotnet/corefx/issues/39227: + // If/when we implement CompileToAssembly, this code will either need to be special-cased + // to not be used when saving out the assembly, or it'll need to be augmented to emit an + // endianness check into the IL. The code below is creating int/long constants based on + // reading the comparison string at compile time, and the machine doing the compilation + // could be of a different endianness than the machine running the compiled assembly. + + // On 64-bit, process 4 characters at a time until the string isn't at least 4 characters long. + if (IntPtr.Size == 8) + { + const int CharsPerInt64 = 4; + while (s.Length >= CharsPerInt64) + { + // if (Unsafe.ReadUnaligned(ref Unsafe.Add(ref MemoryMarshal.GetReference(textSpan), textSpanPos)) != value) goto doneLabel; + EmitTextSpanOffset(); + Unaligned(1); + LdindI8(); + LdcI8(MemoryMarshal.Read(MemoryMarshal.AsBytes(s))); + BneFar(doneLabel); + textSpanPos += CharsPerInt64; + s = s.Slice(CharsPerInt64); + } + } + + // Of what remains, process 2 characters at a time until the string isn't at least 2 characters long. 
+ const int CharsPerInt32 = 2; + while (s.Length >= CharsPerInt32) + { + // if (Unsafe.ReadUnaligned(ref Unsafe.Add(ref MemoryMarshal.GetReference(textSpan), textSpanPos)) != value) goto doneLabel; + EmitTextSpanOffset(); + Unaligned(1); + LdindI4(); + Ldc(MemoryMarshal.Read(MemoryMarshal.AsBytes(s))); + BneFar(doneLabel); + textSpanPos += CharsPerInt32; + s = s.Slice(CharsPerInt32); + } + } + + // Finally, process all of the remaining characters one by one. + for (int i = 0; i < s.Length; i++) + { + // if (s[i] != textSpan[textSpanPos++]) goto doneLabel; + EmitTextSpanOffset(); + textSpanPos++; LdindU2(); - if (IsCaseInsensitive(node)) CallToLower(); - Ldc(node.Str[i]); + if (caseInsensitive) CallToLower(); + Ldc(s[i]); BneFar(doneLabel); } - - textSpanPos += node.Str.Length; } // Emits the code to handle a loop (repeater) with a fixed number of iterations. @@ -4244,9 +4344,11 @@ private void EmitCallCharInClass(string charClass, bool caseInsensitive, LocalBu // We use a const string instead of a byte[] / static data property because // it lets IL emit handle all the gory details for us. It also is ok from an // endianness perspective because the compilation happens on the same machine - // that runs the compiled code. If that were to ever change, this would need - // to be revisited. String length is 8 chars == 16 bytes == 128 bits. - string bitVectorString = string.Create(8, (charClass, invariant), (dest, state) => + // that runs the compiled code. + // TODO https://github.com/dotnet/corefx/issues/39227: If that were to ever change, + // this would need to be revisited, such as by doubling the string length, and using + // just the lower byte of the char, e.g. (byte)lookup[x]. + string bitVectorString = string.Create(8, (charClass, invariant), (dest, state) => // String length is 8 chars == 16 bytes == 128 bits. 
{ for (int i = 0; i < 128; i++) { diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 6566129ed7f92..bbbdd5ed0c83c 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -377,6 +377,28 @@ public void Match(string pattern, string input, RegexOptions options, int beginn VerifyMatch(new Regex(pattern, options).Match(input, beginning, length), expectedSuccess, expectedValue); } + [Theory] + [InlineData(RegexOptions.None)] + [InlineData(RegexOptions.Compiled)] + [InlineData(RegexOptions.Compiled | RegexOptions.IgnoreCase)] + [InlineData(RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)] + public void Match_VaryingLengthStrings(RegexOptions options) + { + var lengths = new List() { 2, 3, 4, 5, 6, 7, 8, 9, 31, 32, 33, 63, 64, 65 }; + if ((options & RegexOptions.IgnoreCase) == 0) + { + lengths.Add(100_000); // currently produces too large a compiled method for case-insensitive + } + + bool caseInsensitive = (options & RegexOptions.IgnoreCase) != 0; + foreach (int length in lengths) + { + string pattern = "[123]" + string.Concat(Enumerable.Range(0, length).Select(i => (char)('A' + (i % 26)))); + string input = "2" + string.Concat(Enumerable.Range(0, length).Select(i => (char)((caseInsensitive ? 
'a' : 'A') + (i % 26)))); + Match(pattern, input, options, 0, input.Length, expectedSuccess: true, expectedValue: input); + } + } + private static void VerifyMatch(Match match, bool expectedSuccess, string expectedValue) { Assert.Equal(expectedSuccess, match.Success); diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs index 253bc098d6c7c..6ee819dfba0d4 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexCultureTests.cs @@ -3,8 +3,8 @@ // See the LICENSE file in the project root for more information. using System.Globalization; +using System.Linq; using System.Tests; -using Microsoft.DotNet.RemoteExecutor; using Xunit; namespace System.Text.RegularExpressions.Tests @@ -14,11 +14,13 @@ public class RegexCultureTests /// /// See https://en.wikipedia.org/wiki/Dotted_and_dotless_I /// - [Fact] - public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture() + [Theory] + [InlineData(2)] + [InlineData(256)] + public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture(int length) { var turkish = new CultureInfo("tr-TR"); - string input = "I\u0131\u0130i"; + string input = string.Concat(Enumerable.Repeat("I\u0131\u0130i", length / 2)); Regex[] cultInvariantRegex = Create(input, CultureInfo.InvariantCulture, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant); Regex[] turkishRegex = Create(input, turkish, RegexOptions.IgnoreCase);