From 80ebdc4fc43c89ee08e59800271747b185d88181 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 29 Jul 2022 11:21:07 +0200 Subject: [PATCH 01/14] port WidenFourAsciiBytesToUtf16AndWriteToBuffer --- .../src/System/Text/ASCIIUtility.cs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index 5cba2be3143e0..1859ab12c9d39 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1889,17 +1889,10 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB { Debug.Assert(AllBytesInUInt32AreAscii(value)); - if (Sse2.X64.IsSupported) - { - Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian."); - Vector128 vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte(); - Vector128 vecWide = Sse2.UnpackLow(vecNarrow, Vector128.Zero).AsUInt64(); - Unsafe.WriteUnaligned(ref Unsafe.As(ref outputBuffer), Sse2.X64.ConvertToUInt64(vecWide)); - } - else if (AdvSimd.Arm64.IsSupported) + if (Vector128.IsHardwareAccelerated) { - Vector128 vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte(); - Vector128 vecWide = AdvSimd.Arm64.ZipLow(vecNarrow, Vector128.Zero).AsUInt64(); + Vector128 vecNarrow = Vector128.CreateScalar(value).AsByte(); + Vector128 vecWide = Vector128.WidenLower(vecNarrow).AsUInt64(); Unsafe.WriteUnaligned(ref Unsafe.As(ref outputBuffer), vecWide.ToScalar()); } else From c37ef6124f535cc67a13d8e6f33b98693a24be7b Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 29 Jul 2022 11:25:21 +0200 Subject: [PATCH 02/14] port WidenAsciiToUtf16* --- .../src/System/Text/ASCIIUtility.cs | 117 +++--------------- 1 file changed, 15 insertions(+), 102 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index 1859ab12c9d39..dd6ebe8a375a8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1569,12 +1569,7 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf // Intrinsified in mono interpreter nuint currentOffset = 0; - // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized - // code below. This has two benefits: (a) we can take advantage of specific instructions like - // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while - // this method is running. - - if (Sse2.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) + if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) { if (elementCount >= 2 * (uint)Unsafe.SizeOf>()) { @@ -1710,80 +1705,38 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool ContainsNonAsciiByte(Vector128 value) - { - if (!AdvSimd.Arm64.IsSupported) - { - throw new PlatformNotSupportedException(); - } - value = AdvSimd.Arm64.MaxPairwise(value, value); - return (value.AsUInt64().ToScalar() & 0x8080808080808080) != 0; - } + private static bool ContainsNonAsciiByte(Vector128 asciiVector) => asciiVector.ExtractMostSignificantBits() != 0; private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) { // JIT turns the below into constants - uint SizeOfVector128 = (uint)Unsafe.SizeOf>(); + uint SizeOfVector128 = (uint)Vector128.Count; nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1); // This method is written such that control generally flows top-to-bottom, avoiding // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII // data, we jump out of the hot paths to targets at the end of the method. - Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported); + Debug.Assert(Vector128.IsHardwareAccelerated); Debug.Assert(BitConverter.IsLittleEndian); Debug.Assert(elementCount >= 2 * SizeOfVector128); // We're going to get the best performance when we have aligned writes, so we'll take the // hit of potentially unaligned reads in order to hit this sweet spot. - Vector128 asciiVector; - Vector128 utf16FirstHalfVector; - bool containsNonAsciiBytes; - // First, perform an unaligned read of the first part of the input buffer. - - if (Sse2.IsSupported) - { - asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load - containsNonAsciiBytes = (uint)Sse2.MoveMask(asciiVector) != 0; - } - else if (AdvSimd.Arm64.IsSupported) - { - asciiVector = AdvSimd.LoadVector128(pAsciiBuffer); - containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector); - } - else - { - throw new PlatformNotSupportedException(); - } + Vector128 asciiVector = Vector128.Load(pAsciiBuffer); // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do. - + bool containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector); if (containsNonAsciiBytes) { return 0; } - // Then perform an unaligned write of the first part of the input buffer. - - Vector128 zeroVector = Vector128.Zero; - - if (Sse2.IsSupported) - { - utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector); - Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned - } - else if (AdvSimd.IsSupported) - { - utf16FirstHalfVector = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower()).AsByte(); - AdvSimd.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned - } - else - { - throw new PlatformNotSupportedException(); - } + Vector128 utf16FirstHalfVector = Vector128.WidenLower(asciiVector); + utf16FirstHalfVector.Store((ushort*)pUtf16Buffer); // Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment // point, then use that as the base offset going forward. Remember the >> 1 to account for @@ -1804,21 +1757,8 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c do { // In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors. - - if (Sse2.IsSupported) - { - asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load - containsNonAsciiBytes = (uint)Sse2.MoveMask(asciiVector) != 0; - } - else if (AdvSimd.Arm64.IsSupported) - { - asciiVector = AdvSimd.LoadVector128(pAsciiBuffer + currentOffset); - containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector); - } - else - { - throw new PlatformNotSupportedException(); - } + asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); + containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector); if (containsNonAsciiBytes) { @@ -1826,24 +1766,9 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c goto NonAsciiDataSeenInInnerLoop; } - if (Sse2.IsSupported) - { - Vector128 low = Sse2.UnpackLow(asciiVector, zeroVector); - Sse2.StoreAligned((byte*)pCurrentWriteAddress, low); - - Vector128 high = Sse2.UnpackHigh(asciiVector, zeroVector); - Sse2.StoreAligned((byte*)pCurrentWriteAddress + SizeOfVector128, high); - } - else if (AdvSimd.Arm64.IsSupported) - { - Vector128 low = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower()); - Vector128 high = AdvSimd.ZeroExtendWideningUpper(asciiVector); - AdvSimd.Arm64.StorePair((ushort*)pCurrentWriteAddress, low, high); - } - else - { - throw new PlatformNotSupportedException(); - } + (Vector128 low, Vector128 high) = Vector128.Widen(asciiVector); + low.StoreAligned((ushort*)pCurrentWriteAddress); + high.StoreAligned((ushort*)pCurrentWriteAddress + Vector128.Count); currentOffset += SizeOfVector128; pCurrentWriteAddress += SizeOfVector128; @@ -1860,20 +1785,8 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c if (!containsNonAsciiBytes) { // First part was all ASCII, widen - if (Sse2.IsSupported) - { - utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector); - Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector); - } - else if (AdvSimd.Arm64.IsSupported) - { - Vector128 lower = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower()); - AdvSimd.Store((ushort*)(pUtf16Buffer + currentOffset), lower); - } - else - { - throw new PlatformNotSupportedException(); - } + utf16FirstHalfVector = Vector128.WidenLower(asciiVector); + utf16FirstHalfVector.StoreAligned((ushort*)(pUtf16Buffer + currentOffset)); currentOffset += SizeOfVector128 / 2; } From 53cdf1d00b3e589d31df75fc40ec4671efd81f2b Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 29 Jul 2022 11:34:49 +0200 Subject: [PATCH 03/14] enforce the inlining of Vector128.WidenVector128) to remove performance regression --- .../src/System/Runtime/Intrinsics/Vector128.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 498f54e3afeed..fb8594be42842 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -3618,6 +3618,7 @@ public static bool TryCopyTo(this Vector128 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) => (WidenLower(source), WidenUpper(source)); From a3b10cb16c6a00c094164b69462e9d68bf1c76df Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 29 Jul 2022 13:23:28 +0200 Subject: [PATCH 04/14] don't use Vector128.Widen as it less optimal on ARM64 --- .../src/System/Text/ASCIIUtility.cs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index dd6ebe8a375a8..03dc13d20c6a0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1735,8 +1735,8 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c return 0; } - Vector128 utf16FirstHalfVector = Vector128.WidenLower(asciiVector); - utf16FirstHalfVector.Store((ushort*)pUtf16Buffer); + Vector128 utf16HalfVector = Vector128.WidenLower(asciiVector); + utf16HalfVector.Store((ushort*)pUtf16Buffer); // Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment // point, then use that as the base offset going forward. Remember the >> 1 to account for @@ -1766,9 +1766,11 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c goto NonAsciiDataSeenInInnerLoop; } - (Vector128 low, Vector128 high) = Vector128.Widen(asciiVector); - low.StoreAligned((ushort*)pCurrentWriteAddress); - high.StoreAligned((ushort*)pCurrentWriteAddress + Vector128.Count); + // Vector128.Widen is not used here as it less performant on ARM64 + utf16HalfVector = Vector128.WidenLower(asciiVector); + utf16HalfVector.StoreAligned((ushort*)pCurrentWriteAddress); + utf16HalfVector = Vector128.WidenUpper(asciiVector); + utf16HalfVector.StoreAligned((ushort*)pCurrentWriteAddress + Vector128.Count); currentOffset += SizeOfVector128; pCurrentWriteAddress += SizeOfVector128; @@ -1785,8 +1787,8 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c if (!containsNonAsciiBytes) { // First part was all ASCII, widen - utf16FirstHalfVector = Vector128.WidenLower(asciiVector); - utf16FirstHalfVector.StoreAligned((ushort*)(pUtf16Buffer + currentOffset)); + utf16HalfVector = Vector128.WidenLower(asciiVector); + utf16HalfVector.StoreAligned((ushort*)(pUtf16Buffer + currentOffset)); currentOffset += SizeOfVector128 / 2; } From 865721c8f4423b3cc4ab755c7bb2f06f3145f065 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 29 Jul 2022 13:28:49 +0200 Subject: [PATCH 05/14] remove dead code: the condition was impossible to meet (jump to NonAsciiDataSeenInInnerLoop was performed only when containsNonAsciiBytes was true) --- .../src/System/Text/ASCIIUtility.cs | 23 +++---------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index 03dc13d20c6a0..1f4920c34b46e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1762,37 +1762,20 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c if (containsNonAsciiBytes) { - // non-ASCII byte somewhere - goto NonAsciiDataSeenInInnerLoop; + break; } // Vector128.Widen is not used here as it less performant on ARM64 utf16HalfVector = Vector128.WidenLower(asciiVector); - utf16HalfVector.StoreAligned((ushort*)pCurrentWriteAddress); + utf16HalfVector.Store((ushort*)pCurrentWriteAddress); utf16HalfVector = Vector128.WidenUpper(asciiVector); - utf16HalfVector.StoreAligned((ushort*)pCurrentWriteAddress + Vector128.Count); + utf16HalfVector.Store((ushort*)pCurrentWriteAddress + Vector128.Count); currentOffset += SizeOfVector128; pCurrentWriteAddress += SizeOfVector128; } while (currentOffset <= finalOffsetWhereCanRunLoop); - Finish: - return currentOffset; - - NonAsciiDataSeenInInnerLoop: - - // Can we at least widen the first part of the vector? - - if (!containsNonAsciiBytes) - { - // First part was all ASCII, widen - utf16HalfVector = Vector128.WidenLower(asciiVector); - utf16HalfVector.StoreAligned((ushort*)(pUtf16Buffer + currentOffset)); - currentOffset += SizeOfVector128 / 2; - } - - goto Finish; } /// From 349aca7d7d9c06be58bca3f60ebc92e4c15c734e Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 29 Jul 2022 13:48:40 +0200 Subject: [PATCH 06/14] simplify the code and get perf boost on both x64 and ARM64 --- .../src/System/Text/ASCIIUtility.cs | 57 ++++--------------- 1 file changed, 11 insertions(+), 46 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index 1f4920c34b46e..9192ce3df88fb 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1709,70 +1709,35 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) { - // JIT turns the below into constants - - uint SizeOfVector128 = (uint)Vector128.Count; - nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1); - - // This method is written such that control generally flows top-to-bottom, avoiding - // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII - // data, we jump out of the hot paths to targets at the end of the method. - Debug.Assert(Vector128.IsHardwareAccelerated); Debug.Assert(BitConverter.IsLittleEndian); - Debug.Assert(elementCount >= 2 * SizeOfVector128); - - // We're going to get the best performance when we have aligned writes, so we'll take the - // hit of potentially unaligned reads in order to hit this sweet spot. + Debug.Assert(elementCount >= 2 * (uint)Vector128.Count); - // First, perform an unaligned read of the first part of the input buffer. - Vector128 asciiVector = Vector128.Load(pAsciiBuffer); - - // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do. - bool containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector); - if (containsNonAsciiBytes) - { - return 0; - } - - Vector128 utf16HalfVector = Vector128.WidenLower(asciiVector); - utf16HalfVector.Store((ushort*)pUtf16Buffer); - - // Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment - // point, then use that as the base offset going forward. Remember the >> 1 to account for - // that we wrote chars, not bytes. This means we may re-read data in the next iteration of - // the loop, but this is ok. - - nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1)); - Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char)); - - nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128; + nuint currentOffset = 0; + ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; // Calculating the destination address outside the loop results in significant // perf wins vs. relying on the JIT to fold memory addressing logic into the // write instructions. See: https://github.com/dotnet/runtime/issues/33002 - - char* pCurrentWriteAddress = pUtf16Buffer + currentOffset; + nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128.Count; do { - // In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors. - asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); - containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector); + Vector128 asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); - if (containsNonAsciiBytes) + if (asciiVector.ExtractMostSignificantBits() != 0) { break; } // Vector128.Widen is not used here as it less performant on ARM64 - utf16HalfVector = Vector128.WidenLower(asciiVector); - utf16HalfVector.Store((ushort*)pCurrentWriteAddress); + Vector128 utf16HalfVector = Vector128.WidenLower(asciiVector); + utf16HalfVector.Store(pCurrentWriteAddress); utf16HalfVector = Vector128.WidenUpper(asciiVector); - utf16HalfVector.Store((ushort*)pCurrentWriteAddress + Vector128.Count); + utf16HalfVector.Store(pCurrentWriteAddress + Vector128.Count); - currentOffset += SizeOfVector128; - pCurrentWriteAddress += SizeOfVector128; + currentOffset += (nuint)Vector128.Count; + pCurrentWriteAddress += (nuint)Vector128.Count; } while (currentOffset <= finalOffsetWhereCanRunLoop); return currentOffset; From 764d03859271274063590a3cf6d8823999c4599e Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 29 Jul 2022 13:54:53 +0200 Subject: [PATCH 07/14] add Vector256 code path --- .../System/Runtime/Intrinsics/Vector256.cs | 1 + .../src/System/Text/ASCIIUtility.cs | 48 +++++++++++++++---- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 06d4be47e178d..cc22690786f89 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -3722,6 +3722,7 @@ public static bool TryCopyTo(this Vector256 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) => (WidenLower(source), WidenUpper(source)); diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index 9192ce3df88fb..2e246c0f8176a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1569,12 +1569,11 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf // Intrinsified in mono interpreter nuint currentOffset = 0; - if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) + if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector.Count) { - if (elementCount >= 2 * (uint)Unsafe.SizeOf>()) - { - currentOffset = WidenAsciiToUtf16_Intrinsified(pAsciiBuffer, pUtf16Buffer, elementCount); - } + currentOffset = Vector256.IsHardwareAccelerated + ? WidenAsciiToUtf16_Vector256(pAsciiBuffer, pUtf16Buffer, elementCount) + : WidenAsciiToUtf16_Vector128(pAsciiBuffer, pUtf16Buffer, elementCount); } else if (Vector.IsHardwareAccelerated) { @@ -1704,10 +1703,7 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf goto Finish; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool ContainsNonAsciiByte(Vector128 asciiVector) => asciiVector.ExtractMostSignificantBits() != 0; - - private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) + private static unsafe nuint WidenAsciiToUtf16_Vector128(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) { Debug.Assert(Vector128.IsHardwareAccelerated); Debug.Assert(BitConverter.IsLittleEndian); @@ -1743,6 +1739,40 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c return currentOffset; } + private static unsafe nuint WidenAsciiToUtf16_Vector256(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) + { + Debug.Assert(Vector256.IsHardwareAccelerated); + Debug.Assert(BitConverter.IsLittleEndian); + Debug.Assert(elementCount >= 2 * (uint)Vector256.Count); + + nuint currentOffset = 0; + ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; + + // Calculating the destination address outside the loop results in significant + // perf wins vs. relying on the JIT to fold memory addressing logic into the + // write instructions. See: https://github.com/dotnet/runtime/issues/33002 + nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256.Count; + + do + { + Vector256 asciiVector = Vector256.Load(pAsciiBuffer + currentOffset); + + if (asciiVector.ExtractMostSignificantBits() != 0) + { + break; + } + + (Vector256 low, Vector256 upper) = Vector256.Widen(asciiVector); + low.Store(pCurrentWriteAddress); + upper.Store(pCurrentWriteAddress + Vector256.Count); + + currentOffset += (nuint)Vector256.Count; + pCurrentWriteAddress += (nuint)Vector256.Count; + } while (currentOffset <= finalOffsetWhereCanRunLoop); + + return currentOffset; + } + /// /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and /// writes them to the output buffer with machine endianness. From 5d7ef96375e689c4deb562df49d67f42e34ca4a0 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 19 Aug 2022 10:14:01 +0200 Subject: [PATCH 08/14] don't use Vector128.ExtractMostSignificantBits as it's expensive on ARM --- .../src/System/Text/ASCIIUtility.cs | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index c360b68721580..334a7fcc7888d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1370,6 +1370,27 @@ public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBu goto Finish; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) + { + // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii + + // prefer architecture specific intrinsic as they offer better perf + if (Sse41.IsSupported) + { + return !Sse41.TestZ(asciiVector, Vector128.Create((byte)0x80)); + } + if (AdvSimd.Arm64.IsSupported) + { + Vector128 maxBytes = AdvSimd.Arm64.MaxPairwise(asciiVector, asciiVector); + return (maxBytes.AsUInt64().ToScalar() & 0x8080808080808080) != 0; + } + else + { + return asciiVector.ExtractMostSignificantBits() != 0; + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) { @@ -1709,7 +1730,7 @@ private static unsafe nuint WidenAsciiToUtf16_Vector128(byte* pAsciiBuffer, char { Vector128 asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); - if (asciiVector.ExtractMostSignificantBits() != 0) + if (VectorContainsNonAsciiChar(asciiVector)) { break; } From a84c7129a1d60bee70c0e05c5b6305208ac76123 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 19 Aug 2022 11:43:05 +0200 Subject: [PATCH 09/14] these methods are now getting inlined --- .../src/System/Runtime/Intrinsics/Vector128.cs | 1 - .../src/System/Runtime/Intrinsics/Vector256.cs | 1 - 2 files changed, 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index fb8594be42842..498f54e3afeed 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -3618,7 +3618,6 @@ public static bool TryCopyTo(this Vector128 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) => (WidenLower(source), WidenUpper(source)); diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index cc22690786f89..06d4be47e178d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -3722,7 +3722,6 @@ public static bool TryCopyTo(this Vector256 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) => (WidenLower(source), WidenUpper(source)); From e44f2a783bc85050232e69b939fc2d025a0f05e8 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 19 Aug 2022 11:44:11 +0200 Subject: [PATCH 10/14] fix the size check --- .../src/System/Text/ASCIIUtility.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index 334a7fcc7888d..b390093b43968 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1578,11 +1578,13 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf // Intrinsified in mono interpreter nuint currentOffset = 0; - if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector.Count) + if (BitConverter.IsLittleEndian && Vector256.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector256.Count) { - currentOffset = Vector256.IsHardwareAccelerated - ? WidenAsciiToUtf16_Vector256(pAsciiBuffer, pUtf16Buffer, elementCount) - : WidenAsciiToUtf16_Vector128(pAsciiBuffer, pUtf16Buffer, elementCount); + currentOffset = WidenAsciiToUtf16_Vector256(pAsciiBuffer, pUtf16Buffer, elementCount); + } + else if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector128.Count) + { + currentOffset = WidenAsciiToUtf16_Vector128(pAsciiBuffer, pUtf16Buffer, elementCount); } else if (Vector.IsHardwareAccelerated) { From 20a2f2313c34d2c48052336167895f66916e9cb5 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 19 Aug 2022 12:18:16 +0200 Subject: [PATCH 11/14] prefer arm-specific instructions to avoid regressions for smaller inputs on ARM --- .../src/System/Text/ASCIIUtility.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index b390093b43968..3379e75776ae9 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1380,7 +1380,7 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) { return !Sse41.TestZ(asciiVector, Vector128.Create((byte)0x80)); } - if (AdvSimd.Arm64.IsSupported) + else if (AdvSimd.Arm64.IsSupported) { Vector128 maxBytes = AdvSimd.Arm64.MaxPairwise(asciiVector, asciiVector); return (maxBytes.AsUInt64().ToScalar() & 0x8080808080808080) != 0; @@ -1793,7 +1793,13 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB { Debug.Assert(AllBytesInUInt32AreAscii(value)); - if (Vector128.IsHardwareAccelerated) + if (AdvSimd.Arm64.IsSupported) + { + Vector128 vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte(); + Vector128 vecWide = AdvSimd.Arm64.ZipLow(vecNarrow, Vector128.Zero).AsUInt64(); + Unsafe.WriteUnaligned(ref Unsafe.As(ref outputBuffer), vecWide.ToScalar()); + } + else if (Vector128.IsHardwareAccelerated) { Vector128 vecNarrow = Vector128.CreateScalar(value).AsByte(); Vector128 vecWide = Vector128.WidenLower(vecNarrow).AsUInt64(); From bc0b8be760673db96ce9c6d7ed3edf6745573455 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Fri, 19 Aug 2022 17:13:47 +0200 Subject: [PATCH 12/14] comment out the problematic assert --- .../System.Private.CoreLib/src/System/Text/ASCIIUtility.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index 3379e75776ae9..d76e7554b5cce 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1752,7 +1752,7 @@ private static unsafe nuint WidenAsciiToUtf16_Vector128(byte* pAsciiBuffer, char private static unsafe nuint WidenAsciiToUtf16_Vector256(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) { - Debug.Assert(Vector256.IsHardwareAccelerated); + // Debug.Assert(Vector256.IsHardwareAccelerated); currently commented out due to R2R bug: #74253 Debug.Assert(BitConverter.IsLittleEndian); Debug.Assert(elementCount >= 2 * (uint)Vector256.Count); From 5325199485d1f8fa4e9a20a926b7b7d6cd354ea9 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Mon, 22 Aug 2022 21:29:29 +0200 Subject: [PATCH 13/14] it's beneficial to do it for fewer elements as well --- .../src/System/Text/ASCIIUtility.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index d76e7554b5cce..a802cf71725d6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1578,11 +1578,11 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf // Intrinsified in mono interpreter nuint currentOffset = 0; - if (BitConverter.IsLittleEndian && Vector256.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector256.Count) + if (BitConverter.IsLittleEndian && Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) { currentOffset = WidenAsciiToUtf16_Vector256(pAsciiBuffer, pUtf16Buffer, elementCount); } - else if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector128.Count) + else if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) { currentOffset = WidenAsciiToUtf16_Vector128(pAsciiBuffer, pUtf16Buffer, elementCount); } @@ -1718,7 +1718,7 @@ private static unsafe nuint WidenAsciiToUtf16_Vector128(byte* pAsciiBuffer, char { Debug.Assert(Vector128.IsHardwareAccelerated); Debug.Assert(BitConverter.IsLittleEndian); - Debug.Assert(elementCount >= 2 * (uint)Vector128.Count); + Debug.Assert(elementCount >= (uint)Vector128.Count); nuint currentOffset = 0; ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; @@ -1754,7 +1754,7 @@ private static unsafe nuint WidenAsciiToUtf16_Vector256(byte* pAsciiBuffer, char { // Debug.Assert(Vector256.IsHardwareAccelerated); currently commented out due to R2R bug: #74253 Debug.Assert(BitConverter.IsLittleEndian); - Debug.Assert(elementCount >= 2 * (uint)Vector256.Count); + Debug.Assert(elementCount >= (uint)Vector256.Count); nuint currentOffset = 0; ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; From 5501301fa130837fa6b554abc9e31284f648bef0 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Tue, 23 Aug 2022 14:33:19 +0200 Subject: [PATCH 14/14] inline the helper methods to simply get better perf and avoid R2R issue --- .../src/System/Text/ASCIIUtility.cs | 129 +++++++----------- 1 file changed, 53 insertions(+), 76 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index a802cf71725d6..118fdecc55786 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -1578,13 +1578,60 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf // Intrinsified in mono interpreter nuint currentOffset = 0; - if (BitConverter.IsLittleEndian && Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) + if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) { - currentOffset = WidenAsciiToUtf16_Vector256(pAsciiBuffer, pUtf16Buffer, elementCount); - } - else if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) - { - currentOffset = WidenAsciiToUtf16_Vector128(pAsciiBuffer, pUtf16Buffer, elementCount); + ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; + + if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) + { + // Calculating the destination address outside the loop results in significant + // perf wins vs. relying on the JIT to fold memory addressing logic into the + // write instructions. See: https://github.com/dotnet/runtime/issues/33002 + nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256.Count; + + do + { + Vector256 asciiVector = Vector256.Load(pAsciiBuffer + currentOffset); + + if (asciiVector.ExtractMostSignificantBits() != 0) + { + break; + } + + (Vector256 low, Vector256 upper) = Vector256.Widen(asciiVector); + low.Store(pCurrentWriteAddress); + upper.Store(pCurrentWriteAddress + Vector256.Count); + + currentOffset += (nuint)Vector256.Count; + pCurrentWriteAddress += (nuint)Vector256.Count; + } while (currentOffset <= finalOffsetWhereCanRunLoop); + } + else + { + // Calculating the destination address outside the loop results in significant + // perf wins vs. relying on the JIT to fold memory addressing logic into the + // write instructions. See: https://github.com/dotnet/runtime/issues/33002 + nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128.Count; + + do + { + Vector128 asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); + + if (VectorContainsNonAsciiChar(asciiVector)) + { + break; + } + + // Vector128.Widen is not used here as it less performant on ARM64 + Vector128 utf16HalfVector = Vector128.WidenLower(asciiVector); + utf16HalfVector.Store(pCurrentWriteAddress); + utf16HalfVector = Vector128.WidenUpper(asciiVector); + utf16HalfVector.Store(pCurrentWriteAddress + Vector128.Count); + + currentOffset += (nuint)Vector128.Count; + pCurrentWriteAddress += (nuint)Vector128.Count; + } while (currentOffset <= finalOffsetWhereCanRunLoop); + } } else if (Vector.IsHardwareAccelerated) { @@ -1714,76 +1761,6 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf goto Finish; } - private static unsafe nuint WidenAsciiToUtf16_Vector128(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) - { - Debug.Assert(Vector128.IsHardwareAccelerated); - Debug.Assert(BitConverter.IsLittleEndian); - Debug.Assert(elementCount >= (uint)Vector128.Count); - - nuint currentOffset = 0; - ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; - - // Calculating the destination address outside the loop results in significant - // perf wins vs. relying on the JIT to fold memory addressing logic into the - // write instructions. See: https://github.com/dotnet/runtime/issues/33002 - nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128.Count; - - do - { - Vector128 asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); - - if (VectorContainsNonAsciiChar(asciiVector)) - { - break; - } - - // Vector128.Widen is not used here as it less performant on ARM64 - Vector128 utf16HalfVector = Vector128.WidenLower(asciiVector); - utf16HalfVector.Store(pCurrentWriteAddress); - utf16HalfVector = Vector128.WidenUpper(asciiVector); - utf16HalfVector.Store(pCurrentWriteAddress + Vector128.Count); - - currentOffset += (nuint)Vector128.Count; - pCurrentWriteAddress += (nuint)Vector128.Count; - } while (currentOffset <= finalOffsetWhereCanRunLoop); - - return currentOffset; - } - - private static unsafe nuint WidenAsciiToUtf16_Vector256(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) - { - // Debug.Assert(Vector256.IsHardwareAccelerated); currently commented out due to R2R bug: #74253 - Debug.Assert(BitConverter.IsLittleEndian); - Debug.Assert(elementCount >= (uint)Vector256.Count); - - nuint currentOffset = 0; - ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; - - // Calculating the destination address outside the loop results in significant - // perf wins vs. relying on the JIT to fold memory addressing logic into the - // write instructions. See: https://github.com/dotnet/runtime/issues/33002 - nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256.Count; - - do - { - Vector256 asciiVector = Vector256.Load(pAsciiBuffer + currentOffset); - - if (asciiVector.ExtractMostSignificantBits() != 0) - { - break; - } - - (Vector256 low, Vector256 upper) = Vector256.Widen(asciiVector); - low.Store(pCurrentWriteAddress); - upper.Store(pCurrentWriteAddress + Vector256.Count); - - currentOffset += (nuint)Vector256.Count; - pCurrentWriteAddress += (nuint)Vector256.Count; - } while (currentOffset <= finalOffsetWhereCanRunLoop); - - return currentOffset; - } - /// /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and /// writes them to the output buffer with machine endianness.