From 12bb8928f104f138221e4f542af82115722f86af Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 5 Jun 2023 15:13:54 -0700 Subject: [PATCH 1/9] Lib upgrade for ToUtf16 --- .../src/System/Text/Ascii.Utility.cs | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 69b3cc125c08..c0fb439c2132 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1654,7 +1654,31 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B { ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; - if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) + if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512.Count) + { + // Calculating the destination address outside the loop results in significant + // perf wins vs. relying on the JIT to fold memory addressing logic into the + // write instructions. See: https://github.com/dotnet/runtime/issues/33002 + nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512.Count; + + do + { + Vector512 asciiVector = Vector512.Load(pAsciiBuffer + currentOffset); + + if (asciiVector.ExtractMostSignificantBits() != 0) + { + break; + } + + (Vector512 low, Vector512 upper) = Vector512.Widen(asciiVector); + low.Store(pCurrentWriteAddress); + upper.Store(pCurrentWriteAddress + Vector512.Count); + + currentOffset += (nuint)Vector512.Count; + pCurrentWriteAddress += (nuint)Vector512.Count; + } while (currentOffset <= finalOffsetWhereCanRunLoop); + } + else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) { // Calculating the destination address outside the loop results in significant // perf wins vs. 
relying on the JIT to fold memory addressing logic into the From ff185d0db08bc359bcd73e3b7a77c9daf015bfc2 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 30 Jun 2023 11:40:44 -0700 Subject: [PATCH 2/9] Upgrade NarrowUtf16ToAscii with Vector512 --- .../System/Runtime/Intrinsics/Vector512.cs | 17 ++ .../src/System/Text/Ascii.Utility.cs | 167 +++++++++++++++++- 2 files changed, 183 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs index 5b740db43009..fad4f94bd660 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs @@ -2670,6 +2670,23 @@ public static void StoreUnsafe(this Vector512 source, ref T destination) ref byte address = ref Unsafe.As(ref destination); Unsafe.WriteUnaligned(ref address, source); } + /// + /// Stores to lower 256 bits of to memory destination of [] + /// + /// The type of the elements in the vector. + /// The vector that will be stored. + /// The destination to which will be added before the vector will be stored. + /// The element offset from from which the vector will be stored. + /// + /// Uses double instead of long to get a single instruction instead of storing temps on general porpose register (or stack) + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void StoreLowerUnsafe(this Vector512 source, ref T destination, nuint elementOffset = 0) + { + ThrowHelper.ThrowForUnsupportedIntrinsicsVector512BaseType(); + ref byte address = ref Unsafe.As(ref Unsafe.Add(ref destination, elementOffset)); + Unsafe.WriteUnaligned(ref address, source._lower); + } /// Stores a vector at the given destination. /// The type of the elements in the vector. 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index c0fb439c2132..3b72bb32ae6c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1172,7 +1172,36 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0; ulong utf16Data64Bits = 0; - if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) + if (Vector512.IsHardwareAccelerated && BitConverter.IsLittleEndian) + { + if (elementCount >= 2 * (uint)Vector512.Count) + { + // Since there's overhead to setting up the vectorized code path, we only want to + // call into it after a quick probe to ensure the next immediate characters really are ASCII. + // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method. 
+ + if (IntPtr.Size >= 8) + { + utf16Data64Bits = Unsafe.ReadUnaligned(pUtf16Buffer); + if (!AllCharsInUInt64AreAscii(utf16Data64Bits)) + { + goto FoundNonAsciiDataIn64BitRead; + } + } + else + { + utf16Data32BitsHigh = Unsafe.ReadUnaligned(pUtf16Buffer); + utf16Data32BitsLow = Unsafe.ReadUnaligned(pUtf16Buffer + 4 / sizeof(char)); + if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow)) + { + goto FoundNonAsciiDataIn64BitRead; + } + } + + currentOffset = NarrowUtf16ToAscii_Intrinsified_512(pUtf16Buffer, pAsciiBuffer, elementCount); + } + } + else if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) { if (elementCount >= 2 * (uint)Vector128.Count) { @@ -1444,6 +1473,13 @@ private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector512 utf16Vector) + { + ulong MostSignificantBitIsSet = Vector512.ExtractMostSignificantBits(utf16Vector); + return MostSignificantBitIsSet != 0; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 vector) where T : unmanaged @@ -1520,6 +1556,16 @@ private static Vector128 ExtractAsciiVector(Vector128 vectorFirst, } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector512 ExtractAsciiVector(Vector512 vectorFirst, Vector512 vectorSecond) + { + // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ] + // to a vector of bytes [ b7 ... b0 b7' ... b0']. 
+ + // prefer architecture specific intrinsic as they don't perform additional AND like Vector128.Narrow does + return Vector512.Narrow(vectorFirst, vectorSecond); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) { @@ -1639,6 +1685,125 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, goto Finish; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) + { + // This method contains logic optimized using vector instructions for x64 only. + // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support. + + // JIT turns the below into constants + + uint SizeOfVector512 = (uint)Vector512.Count; + nuint MaskOfAllBitsInVector512 = (nuint)(SizeOfVector512 - 1); + + // This method is written such that control generally flows top-to-bottom, avoiding + // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII + // data, we jump out of the hot paths to targets at the end of the method. + + Debug.Assert(Vector512.IsHardwareAccelerated, "Vector512 is required."); + Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian."); + Debug.Assert(elementCount >= 2 * SizeOfVector512); + + // First, perform an unaligned read of the first part of the input buffer. + ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer; + Vector512 utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer); + + // If there's non-ASCII data in the first 32 elements of the vector, there's nothing we can do. + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + return 0; + } + + // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. 
+ + ref byte asciiBuffer = ref *pAsciiBuffer; + Vector512 asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0); // how to store the lower part of a avx512 + nuint currentOffsetInElements = SizeOfVector512 / 2; // we processed 32 elements so far + + // We're going to get the best performance when we have aligned writes, so we'll take the + // hit of potentially unaligned reads in order to hit this sweet spot. + + // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote + // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote + // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In + // that case we can immediately back up to the previous aligned boundary and start the main loop. + // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at + // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump + // just past the next aligned boundary address. + + if (((uint)pAsciiBuffer & (SizeOfVector512 / 2)) == 0) + { + // We need to perform one more partial vector write before we can get the alignment we want. + + utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); + + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + goto Finish; + } + + // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. + asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); + } + + // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment + // point, then use that as the base offset going forward. 
+ + currentOffsetInElements = SizeOfVector512 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector512); + + Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector512, "We wrote at least 1 byte but no more than a whole vector."); + Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer."); + Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector512, "We should be able to run at least one whole vector."); + + nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector512; + do + { + // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector. + + utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); + Vector512 utf16VectorSecond = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + SizeOfVector512 / sizeof(short)); + Vector512 combinedVector = utf16VectorFirst | utf16VectorSecond; + + if (VectorContainsNonAsciiChar(combinedVector)) + { + goto FoundNonAsciiDataInLoop; + } + + // Build up the ASCII vector and perform the store. + + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector512 == 0, "Write should be aligned."); + asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorSecond); + asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + + currentOffsetInElements += SizeOfVector512; + } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop); + + Finish: + + // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain. + return currentOffsetInElements; + + FoundNonAsciiDataInLoop: + + // Can we at least narrow the high vector? + // See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works. + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + goto Finish; + } + + // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector. 
+ + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); + asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); + currentOffsetInElements += SizeOfVector512 / 2; + + goto Finish; + } + /// /// Copies as many ASCII bytes (00..7F) as possible from /// to , stopping when the first non-ASCII byte is encountered From 9d45ff881e0f8a5dcf781bd9cbed466c7340c4fd Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 5 Jul 2023 16:53:13 -0700 Subject: [PATCH 3/9] Complete the upgrade in NarrowUtf16ToAscii method with Vector512 and Vector256 APIs. --- .../System/Runtime/Intrinsics/Vector256.cs | 18 ++ .../src/System/Text/Ascii.Utility.cs | 215 ++++++++++++++---- 2 files changed, 184 insertions(+), 49 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 758d0291af49..e36c41f50338 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -2634,6 +2634,24 @@ public static void StoreUnsafe(this Vector256 source, ref T destination, n Unsafe.WriteUnaligned(ref Unsafe.As(ref destination), source); } + /// + /// Stores to lower 128 bits of to memory destination of [] + /// + /// The type of the elements in the vector. + /// The vector that will be stored. + /// The destination to which will be added before the vector will be stored. + /// The element offset from from which the vector will be stored. 
+ /// + /// Uses double instead of long to get a single instruction instead of storing temps on general porpose register (or stack) + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void StoreLowerUnsafe(this Vector256 source, ref T destination, nuint elementOffset = 0) + { + ThrowHelper.ThrowForUnsupportedIntrinsicsVector256BaseType(); + ref byte address = ref Unsafe.As(ref Unsafe.Add(ref destination, elementOffset)); + Unsafe.WriteUnaligned(ref address, source._lower); + } + /// Subtracts two vectors to compute their difference. /// The vector from which will be subtracted. /// The vector to subtract from . diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 3b72bb32ae6c..8b2943c5136b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1172,61 +1172,39 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0; ulong utf16Data64Bits = 0; - if (Vector512.IsHardwareAccelerated && BitConverter.IsLittleEndian) + if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian && elementCount >= 2 * (uint)Vector128.Count) { - if (elementCount >= 2 * (uint)Vector512.Count) - { - // Since there's overhead to setting up the vectorized code path, we only want to - // call into it after a quick probe to ensure the next immediate characters really are ASCII. - // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method. + // Since there's overhead to setting up the vectorized code path, we only want to + // call into it after a quick probe to ensure the next immediate characters really are ASCII. + // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method. 
- if (IntPtr.Size >= 8) + if (IntPtr.Size >= 8) + { + utf16Data64Bits = Unsafe.ReadUnaligned(pUtf16Buffer); + if (!AllCharsInUInt64AreAscii(utf16Data64Bits)) { - utf16Data64Bits = Unsafe.ReadUnaligned(pUtf16Buffer); - if (!AllCharsInUInt64AreAscii(utf16Data64Bits)) - { - goto FoundNonAsciiDataIn64BitRead; - } + goto FoundNonAsciiDataIn64BitRead; } - else + } + else + { + utf16Data32BitsHigh = Unsafe.ReadUnaligned(pUtf16Buffer); + utf16Data32BitsLow = Unsafe.ReadUnaligned(pUtf16Buffer + 4 / sizeof(char)); + if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow)) { - utf16Data32BitsHigh = Unsafe.ReadUnaligned(pUtf16Buffer); - utf16Data32BitsLow = Unsafe.ReadUnaligned(pUtf16Buffer + 4 / sizeof(char)); - if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow)) - { - goto FoundNonAsciiDataIn64BitRead; - } + goto FoundNonAsciiDataIn64BitRead; } - + } + if (Vector512.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector512.Count) + { currentOffset = NarrowUtf16ToAscii_Intrinsified_512(pUtf16Buffer, pAsciiBuffer, elementCount); } - } - else if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) - { - if (elementCount >= 2 * (uint)Vector128.Count) + else if (Vector256.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector256.Count) + { + currentOffset = NarrowUtf16ToAscii_Intrinsified_256(pUtf16Buffer, pAsciiBuffer, elementCount); + } + else { - // Since there's overhead to setting up the vectorized code path, we only want to - // call into it after a quick probe to ensure the next immediate characters really are ASCII. - // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method. 
- - if (IntPtr.Size >= 8) - { - utf16Data64Bits = Unsafe.ReadUnaligned(pUtf16Buffer); - if (!AllCharsInUInt64AreAscii(utf16Data64Bits)) - { - goto FoundNonAsciiDataIn64BitRead; - } - } - else - { - utf16Data32BitsHigh = Unsafe.ReadUnaligned(pUtf16Buffer); - utf16Data32BitsLow = Unsafe.ReadUnaligned(pUtf16Buffer + 4 / sizeof(char)); - if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow)) - { - goto FoundNonAsciiDataIn64BitRead; - } - } - currentOffset = NarrowUtf16ToAscii_Intrinsified(pUtf16Buffer, pAsciiBuffer, elementCount); } } @@ -1473,11 +1451,22 @@ private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector256 utf16Vector) + { + const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80 + Vector256 zeroIsAscii = utf16Vector & Vector256.Create(asciiMask); + // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data. + return zeroIsAscii != Vector256.Zero; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector512 utf16Vector) { - ulong MostSignificantBitIsSet = Vector512.ExtractMostSignificantBits(utf16Vector); - return MostSignificantBitIsSet != 0; + const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80 + Vector512 zeroIsAscii = utf16Vector & Vector512.Create(asciiMask); + // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data. + return zeroIsAscii != Vector512.Zero; } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -1556,13 +1545,23 @@ private static Vector128 ExtractAsciiVector(Vector128 vectorFirst, } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 ExtractAsciiVector(Vector256 vectorFirst, Vector256 vectorSecond) + { + // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ] + // to a vector of bytes [ b7 ... b0 b7' ... b0']. 
+ + // prefer architecture specific intrinsic as they don't perform additional AND like Vector512.Narrow does + return Vector256.Narrow(vectorFirst, vectorSecond); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector512 ExtractAsciiVector(Vector512 vectorFirst, Vector512 vectorSecond) { // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ] // to a vector of bytes [ b7 ... b0 b7' ... b0']. - // prefer architecture specific intrinsic as they don't perform additional AND like Vector128.Narrow does + // prefer architecture specific intrinsic as they don't perform additional AND like Vector512.Narrow does return Vector512.Narrow(vectorFirst, vectorSecond); } @@ -1685,6 +1684,124 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, goto Finish; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) + { + // This method contains logic optimized using vector instructions for x64 only. + // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support. + + // JIT turns the below into constants + + uint SizeOfVector256 = (uint)Vector256.Count; + nuint MaskOfAllBitsInVector256 = (nuint)(SizeOfVector256 - 1); + + // This method is written such that control generally flows top-to-bottom, avoiding + // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII + // data, we jump out of the hot paths to targets at the end of the method. + + Debug.Assert(Vector256.IsHardwareAccelerated, "Vector256 is required."); + Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian."); + Debug.Assert(elementCount >= 2 * SizeOfVector256); + + // First, perform an unaligned read of the first part of the input buffer. 
+ ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer; + Vector256 utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer); + + // If there's non-ASCII data in the first 16 elements of the vector, there's nothing we can do. + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + return 0; + } + + // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination. + + ref byte asciiBuffer = ref *pAsciiBuffer; + Vector256 asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0); + nuint currentOffsetInElements = SizeOfVector256 / 2; // we processed 16 elements so far + + // We're going to get the best performance when we have aligned writes, so we'll take the + // hit of potentially unaligned reads in order to hit this sweet spot. + + // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote + // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote + // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In + // that case we can immediately back up to the previous aligned boundary and start the main loop. + // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at + // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump + // just past the next aligned boundary address. + if (((uint)pAsciiBuffer & (SizeOfVector256 / 2)) == 0) + { + // We need to perform one more partial vector write before we can get the alignment we want. + + utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); + + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + goto Finish; + } + + // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. 
+ asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); + } + + // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment + // point, then use that as the base offset going forward. + + currentOffsetInElements = SizeOfVector256 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector256); + + Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector256, "We wrote at least 1 byte but no more than a whole vector."); + Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer."); + Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector256, "We should be able to run at least one whole vector."); + + nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector256; + do + { + // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector. + + utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); + Vector256 utf16VectorSecond = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + SizeOfVector256 / sizeof(short)); + Vector256 combinedVector = utf16VectorFirst | utf16VectorSecond; + + if (VectorContainsNonAsciiChar(combinedVector)) + { + goto FoundNonAsciiDataInLoop; + } + + // Build up the ASCII vector and perform the store. + + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector256 == 0, "Write should be aligned."); + asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorSecond); + asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + + currentOffsetInElements += SizeOfVector256; + } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop); + + Finish: + + // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain. 
+ return currentOffsetInElements; + + FoundNonAsciiDataInLoop: + + // Can we at least narrow the high vector? + // See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works. + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + goto Finish; + } + + // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector. + + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); + asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); + currentOffsetInElements += SizeOfVector256 / 2; + + goto Finish; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) { From 09b8e3117d062be85299b6a284c44901016476f2 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 5 Jul 2023 17:05:22 -0700 Subject: [PATCH 4/9] Adding VectorXX paths to `GetIndexOfFirstNonAscii` functions. --- .../src/System/Text/Ascii.Utility.cs | 465 +++++++++++++++++- .../tests/Ascii/FromUtf16Tests.cs | 16 +- .../tests/Ascii/ToUtf16Tests.cs | 16 +- 3 files changed, 475 insertions(+), 22 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 8b2943c5136b..5a3be66531a9 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -101,9 +101,241 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. 
- return (Sse2.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) - ? GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength) - : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength); + if (Vector512.IsHardwareAccelerated || Vector256.IsHardwareAccelerated) + { + return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength); + } + else if (Sse2.IsSupported || (AdvSimd.IsSupported && BitConverter.IsLittleEndian)) + { + return GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength); + } + else + { + return GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength); + } + } + + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nuint bufferLength) + { + // Squirrel away the original buffer reference. This method works by determining the exact + // byte reference where non-ASCII data begins, so we need this base value to perform the + // final subtraction at the end of the method to get the index into the original buffer. + + byte* pOriginalBuffer = pBuffer; + + // Before we drain off byte-by-byte, try a generic vectorized loop. + // Only run the loop if we have at least two vectors we can pull out. + // Note use of SBYTE instead of BYTE below; we're using the two's-complement + // representation of negative integers to act as a surrogate for "is ASCII?". + + if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) + { + uint SizeOfVector512InBytes = (uint)Vector512.Count; // JIT will make this a const + + if (Unsafe.ReadUnaligned>(pBuffer).ExtractMostSignificantBits() == 0) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ + byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InBytes; + pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector512InBytes) & ~(nuint)(SizeOfVector512InBytes - 1)); + +#if DEBUG + long numBytesRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector512InBytes, "We should've made forward progress of at least one byte."); + Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % SizeOfVector512InBytes == 0, "Vector read should be aligned."); + if (Unsafe.Read>(pBuffer).ExtractMostSignificantBits() != 0) + { + break; // found non-ASCII data + } + + pBuffer += SizeOfVector512InBytes; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= (nuint)pBuffer; + bufferLength += (nuint)pOriginalBuffer; + } + } + else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) + { + uint SizeOfVector256InBytes = (uint)Vector256.Count; // JIT will make this a const + + if (Unsafe.ReadUnaligned>(pBuffer).ExtractMostSignificantBits() == 0) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ + byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InBytes; + pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector256InBytes) & ~(nuint)(SizeOfVector256InBytes - 1)); + +#if DEBUG + long numBytesRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector256InBytes, "We should've made forward progress of at least one byte."); + Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % SizeOfVector256InBytes == 0, "Vector read should be aligned."); + if (Unsafe.Read>(pBuffer).ExtractMostSignificantBits() != 0) + { + break; // found non-ASCII data + } + + pBuffer += SizeOfVector256InBytes; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= (nuint)pBuffer; + bufferLength += (nuint)pOriginalBuffer; + } + } + else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) + { + uint SizeOfVector128InBytes = (uint)Vector128.Count; // JIT will make this a const + + if (Unsafe.ReadUnaligned>(pBuffer).ExtractMostSignificantBits() == 0) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ + byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InBytes; + pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1)); + +#if DEBUG + long numBytesRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128InBytes, "We should've made forward progress of at least one byte."); + Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % SizeOfVector128InBytes == 0, "Vector read should be aligned."); + if (Unsafe.Read>(pBuffer).ExtractMostSignificantBits() != 0) + { + break; // found non-ASCII data + } + + pBuffer += SizeOfVector128InBytes; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= (nuint)pBuffer; + bufferLength += (nuint)pOriginalBuffer; + } + } + + // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform + // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code + // path to drain any remaining ASCII bytes. + // + // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads. + // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII bytes. + + uint currentUInt32; + + // Try reading 64 bits at a time in a loop. + + for (; bufferLength >= 8; bufferLength -= 8) + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + uint nextUInt32 = Unsafe.ReadUnaligned(pBuffer + 4); + + if (!AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32)) + { + // One of these two values contains non-ASCII bytes. + // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes. 
+ + if (AllBytesInUInt32AreAscii(currentUInt32)) + { + currentUInt32 = nextUInt32; + pBuffer += 4; + } + + goto FoundNonAsciiData; + } + + pBuffer += 8; // consumed 8 ASCII bytes + } + + // From this point forward we don't need to update bufferLength. + // Try reading 32 bits. + + if ((bufferLength & 4) != 0) + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + if (!AllBytesInUInt32AreAscii(currentUInt32)) + { + goto FoundNonAsciiData; + } + + pBuffer += 4; + } + + // Try reading 16 bits. + + if ((bufferLength & 2) != 0) + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + if (!AllBytesInUInt32AreAscii(currentUInt32)) + { + if (!BitConverter.IsLittleEndian) + { + currentUInt32 <<= 16; + } + goto FoundNonAsciiData; + } + + pBuffer += 2; + } + + // Try reading 8 bits + + if ((bufferLength & 1) != 0) + { + // If the buffer contains non-ASCII data, the comparison below will fail, and + // we'll end up not incrementing the buffer reference. + + if (*(sbyte*)pBuffer >= 0) + { + pBuffer++; + } + } + + Finish: + + nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer; + return totalNumBytesRead; + + FoundNonAsciiData: + + Debug.Assert(!AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input."); + + // The method being called doesn't bother looking at whether the high byte is ASCII. There are only + // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before + // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be + // non-ASCII. In both cases we only care about the low 24 bits. 
+ + pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32); + goto Finish; } private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength) @@ -637,9 +869,230 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. - return ((Sse2.IsSupported || AdvSimd.IsSupported) && BitConverter.IsLittleEndian) - ? GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength) - : GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength); + if (Vector512.IsHardwareAccelerated || Vector256.IsHardwareAccelerated) + { + return GetIndexOfFirstNonAsciiChar_Vector(pBuffer, bufferLength); + } + else if (Sse2.IsSupported || (AdvSimd.IsSupported && BitConverter.IsLittleEndian)) + { + return GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength); + } + else + { + return GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength); + } + } + + private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nuint bufferLength /* in chars */) + { + // Squirrel away the original buffer reference.This method works by determining the exact + // char reference where non-ASCII data begins, so we need this base value to perform the + // final subtraction at the end of the method to get the index into the original buffer. + char* pOriginalBuffer = pBuffer; + +#if SYSTEM_PRIVATE_CORELIB + Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char)); +#endif + + // Before we drain off char-by-char, try a generic vectorized loop. + // Only run the loop if we have at least two vectors we can pull out. 
+ if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) + { + uint SizeOfVector512InChars = (uint)Vector512.Count; // JIT will make this a const + uint SizeOfVector512InBytes = (uint)Vector512.Count; // JIT will make this a const + + Vector512 asciiMask = Vector512.Create((ushort) 0xFF80); + + if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned>(pBuffer))) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. + + char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InChars; + pBuffer = (char*)(((nuint)pBuffer + SizeOfVector512InBytes) & ~(nuint)(SizeOfVector512InBytes - 1)); + +#if DEBUG + long numCharsRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector512InChars, "We should've made forward progress of at least one char."); + Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % SizeOfVector512InChars == 0, "Vector read should be aligned."); + if (VectorContainsNonAsciiChar(Unsafe.Read>(pBuffer))) + { + break; // found non-ASCII data + } + pBuffer += SizeOfVector512InChars; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. 
+ + bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); + } + } + else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) + { + uint SizeOfVector256InChars = (uint)Vector256.Count; // JIT will make this a const + uint SizeOfVector256InBytes = (uint)Vector256.Count; // JIT will make this a const + + Vector256 asciiMask = Vector256.Create((ushort) 0xFF80); + + if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned>(pBuffer))) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. + + char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InChars; + pBuffer = (char*)(((nuint)pBuffer + SizeOfVector256InBytes) & ~(nuint)(SizeOfVector256InBytes - 1)); + +#if DEBUG + long numCharsRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector256InChars, "We should've made forward progress of at least one char."); + Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % SizeOfVector256InChars == 0, "Vector read should be aligned."); + if (VectorContainsNonAsciiChar(Unsafe.Read>(pBuffer))) + { + break; // found non-ASCII data + } + pBuffer += SizeOfVector256InChars; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. 
+ + bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); + } + } + else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) + { + uint SizeOfVector128InChars = (uint)Vector128.Count; // JIT will make this a const + uint SizeOfVector128InBytes = (uint)Vector128.Count; // JIT will make this a const + + Vector128 asciiMask = Vector128.Create((ushort) 0xFF80); + + if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned>(pBuffer))) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. + char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InChars; + pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1)); + +#if DEBUG + long numCharsRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char."); + Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % SizeOfVector128InChars == 0, "Vector read should be aligned."); + if (VectorContainsNonAsciiChar(Unsafe.Read>(pBuffer))) + { + break; // found non-ASCII data + } + pBuffer += SizeOfVector128InChars; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); + } + } + + + // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform + // a vectorized search and encountered non-ASCII data. 
In either case go down a non-vectorized code + // path to drain any remaining ASCII chars. + // + // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads. + // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII chars. + + uint currentUInt32; + + // Try reading 64 bits at a time in a loop. + + for (; bufferLength >= 4; bufferLength -= 4) // 64 bits = 4 * 16-bit chars + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + uint nextUInt32 = Unsafe.ReadUnaligned(pBuffer + 4 / sizeof(char)); + + if (!AllCharsInUInt32AreAscii(currentUInt32 | nextUInt32)) + { + // One of these two values contains non-ASCII chars. + // Figure out which one it is, then put it in 'current' so that we can drain the ASCII chars. + + if (AllCharsInUInt32AreAscii(currentUInt32)) + { + currentUInt32 = nextUInt32; + pBuffer += 2; + } + + goto FoundNonAsciiData; + } + + pBuffer += 4; // consumed 4 ASCII chars + } + + // From this point forward we don't need to keep track of the remaining buffer length. + // Try reading 32 bits. + + if ((bufferLength & 2) != 0) // 32 bits = 2 * 16-bit chars + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + if (!AllCharsInUInt32AreAscii(currentUInt32)) + { + goto FoundNonAsciiData; + } + + pBuffer += 2; + } + + // Try reading 16 bits. + // No need to try an 8-bit read after this since we're working with chars. + + if ((bufferLength & 1) != 0) + { + // If the buffer contains non-ASCII data, the comparison below will fail, and + // we'll end up not incrementing the buffer reference. 
+ + if (*pBuffer <= 0x007F) + { + pBuffer++; + } + } + + Finish: + + nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer; + Debug.Assert(totalNumBytesRead % sizeof(char) == 0, "Total number of bytes read should be even since we're working with chars."); + return totalNumBytesRead / sizeof(char); // convert byte count -> char count before returning + + FoundNonAsciiData: + + Debug.Assert(!AllCharsInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input."); + + // We don't bother looking at the second char - only the first char. + + if (FirstCharInUInt32IsAscii(currentUInt32)) + { + pBuffer++; + } + + goto Finish; } private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, nuint bufferLength /* in chars */) diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs index 80a70042abb8..2f9186c4e9ec 100644 --- a/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs +++ b/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs @@ -19,15 +19,15 @@ public static unsafe void EmptyInputs() [Fact] public static void AllAsciiInput() { - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(256); + using BoundedMemory asciiMem = BoundedMemory.Allocate(256); // Fill source with 00 .. 7F. Span utf16Span = utf16Mem.Span; for (int i = 0; i < utf16Span.Length; i++) { - utf16Span[i] = (char)i; + utf16Span[i] = (char)(i % 128); } utf16Mem.MakeReadonly(); @@ -42,11 +42,11 @@ public static void AllAsciiInput() // First, validate that the workhorse saw the incoming data as all-ASCII. 
Assert.Equal(OperationStatus.Done, Ascii.FromUtf16(utf16Span.Slice(i), asciiSpan.Slice(i), out int bytesWritten)); - Assert.Equal(128 - i, bytesWritten); + Assert.Equal(256 - i, bytesWritten); // Then, validate that the data was transcoded properly. - for (int j = i; j < 128; j++) + for (int j = i; j < 256; j++) { Assert.Equal((ushort)utf16Span[i], (ushort)asciiSpan[i]); } @@ -56,15 +56,15 @@ public static void AllAsciiInput() [Fact] public static void SomeNonAsciiInput() { - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(256); + using BoundedMemory asciiMem = BoundedMemory.Allocate(256); // Fill source with 00 .. 7F. Span utf16Span = utf16Mem.Span; for (int i = 0; i < utf16Span.Length; i++) { - utf16Span[i] = (char)i; + utf16Span[i] = (char)(i % 128); } // We'll write to the ASCII span. diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs index be9c71e14fbb..ec5c186973e3 100644 --- a/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs +++ b/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs @@ -20,15 +20,15 @@ public static void EmptyInputs() [Fact] public static void AllAsciiInput() { - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); + using BoundedMemory asciiMem = BoundedMemory.Allocate(256); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(256); // Fill source with 00 .. 7F, then trap future writes. Span asciiSpan = asciiMem.Span; for (int i = 0; i < asciiSpan.Length; i++) { - asciiSpan[i] = (byte)i; + asciiSpan[i] = (byte)(i % 128); } asciiMem.MakeReadonly(); @@ -44,11 +44,11 @@ public static void AllAsciiInput() // First, validate that the workhorse saw the incoming data as all-ASCII. 
Assert.Equal(OperationStatus.Done, Ascii.ToUtf16(asciiSpan.Slice(i), utf16Span.Slice(i), out int charsWritten)); - Assert.Equal(128 - i, charsWritten); + Assert.Equal(256 - i, charsWritten); // Then, validate that the data was transcoded properly. - for (int j = i; j < 128; j++) + for (int j = i; j < 256; j++) { Assert.Equal((ushort)asciiSpan[i], (ushort)utf16Span[i]); } @@ -58,15 +58,15 @@ public static void AllAsciiInput() [Fact] public static void SomeNonAsciiInput() { - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); + using BoundedMemory asciiMem = BoundedMemory.Allocate(256); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(256); // Fill source with 00 .. 7F, then trap future writes. Span asciiSpan = asciiMem.Span; for (int i = 0; i < asciiSpan.Length; i++) { - asciiSpan[i] = (byte)i; + asciiSpan[i] = (byte)(i % 128); } // We'll write to the UTF-16 span. From 01a43e7e2b96d2ed6f9a78a9f8fa753516e010de Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Fri, 7 Jul 2023 09:38:37 -0700 Subject: [PATCH 5/9] Adding optimization to Vector256 VectorContainsNonAsciiChar method.
--- .../src/System/Text/Ascii.Utility.cs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 5a3be66531a9..5afd401d9c85 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1907,10 +1907,18 @@ private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector256 utf16Vector) { - const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80 - Vector256 zeroIsAscii = utf16Vector & Vector256.Create(asciiMask); - // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data. - return zeroIsAscii != Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 asciiMaskForTestZ = Vector256.Create((ushort)0xFF80); + return !Avx.TestZ(utf16Vector.AsInt16(), asciiMaskForTestZ.AsInt16()); + } + else + { + const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80 + Vector256 zeroIsAscii = utf16Vector & Vector256.Create(asciiMask); + // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data. + return zeroIsAscii != Vector256.Zero; + } } [MethodImpl(MethodImplOptions.AggressiveInlining)] From ea913dbfcb2c626eabc104b55263842cc4850415 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 12 Jul 2023 06:52:18 -0700 Subject: [PATCH 6/9] Code path refactoring and cleanup. 
--- .../src/System/Text/Ascii.Utility.cs | 72 ++++++++++++++----- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 5afd401d9c85..e82ccb5755b3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -130,9 +130,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) { - uint SizeOfVector512InBytes = (uint)Vector512.Count; // JIT will make this a const + uint SizeOfVector512InBytes = (uint)Vector512.Size; // JIT will make this a const - if (Unsafe.ReadUnaligned>(pBuffer).ExtractMostSignificantBits() == 0) + if (!VectorContainsNonAsciiChar(Vector512.Load(pBuffer))) { // The first several elements of the input buffer were ASCII. 
Bump up the pointer to the // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII @@ -152,7 +152,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { Debug.Assert((nuint)pBuffer % SizeOfVector512InBytes == 0, "Vector read should be aligned."); - if (Unsafe.Read>(pBuffer).ExtractMostSignificantBits() != 0) + if (VectorContainsNonAsciiChar(Vector512.LoadAligned(pBuffer))) { break; // found non-ASCII data } @@ -168,9 +168,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu } else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) { - uint SizeOfVector256InBytes = (uint)Vector256.Count; // JIT will make this a const + uint SizeOfVector256InBytes = (uint)Vector256.Size; // JIT will make this a const - if (Unsafe.ReadUnaligned>(pBuffer).ExtractMostSignificantBits() == 0) + if (!VectorContainsNonAsciiChar(Vector256.Load(pBuffer))) { // The first several elements of the input buffer were ASCII. 
Bump up the pointer to the // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII @@ -190,7 +190,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { Debug.Assert((nuint)pBuffer % SizeOfVector256InBytes == 0, "Vector read should be aligned."); - if (Unsafe.Read>(pBuffer).ExtractMostSignificantBits() != 0) + if (VectorContainsNonAsciiChar(Vector256.LoadAligned(pBuffer))) { break; // found non-ASCII data } @@ -206,9 +206,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu } else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) { - uint SizeOfVector128InBytes = (uint)Vector128.Count; // JIT will make this a const + uint SizeOfVector128InBytes = (uint)Vector128.Size; // JIT will make this a const - if (Unsafe.ReadUnaligned>(pBuffer).ExtractMostSignificantBits() == 0) + if (!VectorContainsNonAsciiChar(Vector128.Load(pBuffer))) { // The first several elements of the input buffer were ASCII. 
Bump up the pointer to the // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII @@ -228,7 +228,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { Debug.Assert((nuint)pBuffer % SizeOfVector128InBytes == 0, "Vector read should be aligned."); - if (Unsafe.Read>(pBuffer).ExtractMostSignificantBits() != 0) + if (VectorContainsNonAsciiChar(Vector128.LoadAligned(pBuffer))) { break; // found non-ASCII data } @@ -899,11 +899,11 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) { uint SizeOfVector512InChars = (uint)Vector512.Count; // JIT will make this a const - uint SizeOfVector512InBytes = (uint)Vector512.Count; // JIT will make this a const + uint SizeOfVector512InBytes = (uint)Vector512.Size; // JIT will make this a const Vector512 asciiMask = Vector512.Create((ushort) 0xFF80); - if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned>(pBuffer))) + if (!VectorContainsNonAsciiChar(Vector512.Load((ushort*)pBuffer))) { // The first several elements of the input buffer were ASCII. 
Bump up the pointer to the // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII @@ -923,7 +923,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu do { Debug.Assert((nuint)pBuffer % SizeOfVector512InChars == 0, "Vector read should be aligned."); - if (VectorContainsNonAsciiChar(Unsafe.Read>(pBuffer))) + if (VectorContainsNonAsciiChar(Vector512.LoadAligned((ushort*)pBuffer))) { break; // found non-ASCII data } @@ -938,11 +938,11 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) { uint SizeOfVector256InChars = (uint)Vector256.Count; // JIT will make this a const - uint SizeOfVector256InBytes = (uint)Vector256.Count; // JIT will make this a const + uint SizeOfVector256InBytes = (uint)Vector256.Size; // JIT will make this a const Vector256 asciiMask = Vector256.Create((ushort) 0xFF80); - if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned>(pBuffer))) + if (!VectorContainsNonAsciiChar(Vector256.Load((ushort*)pBuffer))) { // The first several elements of the input buffer were ASCII. 
Bump up the pointer to the // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII @@ -962,7 +962,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu do { Debug.Assert((nuint)pBuffer % SizeOfVector256InChars == 0, "Vector read should be aligned."); - if (VectorContainsNonAsciiChar(Unsafe.Read>(pBuffer))) + if (VectorContainsNonAsciiChar(Vector256.LoadAligned((ushort*)pBuffer))) { break; // found non-ASCII data } @@ -977,11 +977,11 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) { uint SizeOfVector128InChars = (uint)Vector128.Count; // JIT will make this a const - uint SizeOfVector128InBytes = (uint)Vector128.Count; // JIT will make this a const + uint SizeOfVector128InBytes = (uint)Vector128.Size; // JIT will make this a const Vector128 asciiMask = Vector128.Create((ushort) 0xFF80); - if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned>(pBuffer))) + if (!VectorContainsNonAsciiChar(Vector128.Load((ushort*)pBuffer))) { // The first several elements of the input buffer were ASCII. 
Bump up the pointer to the // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII @@ -1000,7 +1000,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu do { Debug.Assert((nuint)pBuffer % SizeOfVector128InChars == 0, "Vector read should be aligned."); - if (VectorContainsNonAsciiChar(Unsafe.Read>(pBuffer))) + if (VectorContainsNonAsciiChar(Vector128.LoadAligned((ushort*)pBuffer))) { break; // found non-ASCII data } @@ -1867,6 +1867,20 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector256 asciiVector) + { + // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii + return asciiVector.ExtractMostSignificantBits() != 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector512 asciiVector) + { + // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii + return asciiVector.ExtractMostSignificantBits() != 0; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) { @@ -1941,6 +1955,28 @@ private static bool VectorContainsNonAsciiChar(Vector128 vector) : VectorContainsNonAsciiChar(vector.AsUInt16()); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector256 vector) + where T : unmanaged + { + Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(ushort)); + + return typeof(T) == typeof(byte) + ? 
VectorContainsNonAsciiChar(vector.AsByte()) + : VectorContainsNonAsciiChar(vector.AsUInt16()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector512 vector) + where T : unmanaged + { + Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(ushort)); + + return typeof(T) == typeof(byte) + ? VectorContainsNonAsciiChar(vector.AsByte()) + : VectorContainsNonAsciiChar(vector.AsUInt16()); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool AllCharsInVectorAreAscii(Vector128 vector) where T : unmanaged From 5cb1efc99d8d533966aaf55b4a6346fecd03731d Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 12 Jul 2023 15:25:49 -0700 Subject: [PATCH 7/9] Code changes based on the review: 1. turn some variables into explicitly specified const. 2. removed some helper functions and inlined them. --- .../System/Runtime/Intrinsics/Vector256.cs | 18 -- .../System/Runtime/Intrinsics/Vector512.cs | 17 -- .../src/System/Text/Ascii.Utility.cs | 207 ++++++------------ 3 files changed, 71 insertions(+), 171 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index e36c41f50338..758d0291af49 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -2634,24 +2634,6 @@ public static void StoreUnsafe(this Vector256 source, ref T destination, n Unsafe.WriteUnaligned(ref Unsafe.As(ref destination), source); } - /// - /// Stores to lower 128 bits of to memory destination of [] - /// - /// The type of the elements in the vector. - /// The vector that will be stored. - /// The destination to which will be added before the vector will be stored. - /// The element offset from from which the vector will be stored. 
- /// - /// Uses double instead of long to get a single instruction instead of storing temps on general porpose register (or stack) - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static void StoreLowerUnsafe(this Vector256 source, ref T destination, nuint elementOffset = 0) - { - ThrowHelper.ThrowForUnsupportedIntrinsicsVector256BaseType(); - ref byte address = ref Unsafe.As(ref Unsafe.Add(ref destination, elementOffset)); - Unsafe.WriteUnaligned(ref address, source._lower); - } - /// Subtracts two vectors to compute their difference. /// The vector from which will be subtracted. /// The vector to subtract from . diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs index fad4f94bd660..5b740db43009 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs @@ -2670,23 +2670,6 @@ public static void StoreUnsafe(this Vector512 source, ref T destination) ref byte address = ref Unsafe.As(ref destination); Unsafe.WriteUnaligned(ref address, source); } - /// - /// Stores to lower 256 bits of to memory destination of [] - /// - /// The type of the elements in the vector. - /// The vector that will be stored. - /// The destination to which will be added before the vector will be stored. - /// The element offset from from which the vector will be stored. 
- /// - /// Uses double instead of long to get a single instruction instead of storing temps on general porpose register (or stack) - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static void StoreLowerUnsafe(this Vector512 source, ref T destination, nuint elementOffset = 0) - { - ThrowHelper.ThrowForUnsupportedIntrinsicsVector512BaseType(); - ref byte address = ref Unsafe.As(ref Unsafe.Add(ref destination, elementOffset)); - Unsafe.WriteUnaligned(ref address, source._lower); - } /// Stores a vector at the given destination. /// The type of the elements in the vector. diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index e82ccb5755b3..7d74bfb6d8fa 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -130,20 +130,19 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) { - uint SizeOfVector512InBytes = (uint)Vector512.Size; // JIT will make this a const - if (!VectorContainsNonAsciiChar(Vector512.Load(pBuffer))) + if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0) { // The first several elements of the input buffer were ASCII. Bump up the pointer to the // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
- byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InBytes; - pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector512InBytes) & ~(nuint)(SizeOfVector512InBytes - 1)); + byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector512.Size; + pBuffer = (byte*)(((nuint)pBuffer + Vector512.Size) & ~(nuint)(Vector512.Size - 1)); #if DEBUG long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector512InBytes, "We should've made forward progress of at least one byte."); + Debug.Assert(0 < numBytesRead && numBytesRead <= Vector512.Size, "We should've made forward progress of at least one byte."); Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); #endif @@ -151,13 +150,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { - Debug.Assert((nuint)pBuffer % SizeOfVector512InBytes == 0, "Vector read should be aligned."); - if (VectorContainsNonAsciiChar(Vector512.LoadAligned(pBuffer))) + Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned."); + if (Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) { break; // found non-ASCII data } - pBuffer += SizeOfVector512InBytes; + pBuffer += Vector512.Size; } while (pBuffer <= pFinalVectorReadPos); // Adjust the remaining buffer length for the number of elements we just consumed. @@ -168,20 +167,19 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu } else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) { - uint SizeOfVector256InBytes = (uint)Vector256.Size; // JIT will make this a const - if (!VectorContainsNonAsciiChar(Vector256.Load(pBuffer))) + if (Vector256.Load(pBuffer).ExtractMostSignificantBits() == 0) { // The first several elements of the input buffer were ASCII. 
Bump up the pointer to the // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. - byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InBytes; - pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector256InBytes) & ~(nuint)(SizeOfVector256InBytes - 1)); + byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector256.Size; + pBuffer = (byte*)(((nuint)pBuffer + Vector256.Size) & ~(nuint)(Vector256.Size - 1)); #if DEBUG long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector256InBytes, "We should've made forward progress of at least one byte."); + Debug.Assert(0 < numBytesRead && numBytesRead <= Vector256.Size, "We should've made forward progress of at least one byte."); Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); #endif @@ -189,13 +187,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { - Debug.Assert((nuint)pBuffer % SizeOfVector256InBytes == 0, "Vector read should be aligned."); - if (VectorContainsNonAsciiChar(Vector256.LoadAligned(pBuffer))) + Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned."); + if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) { break; // found non-ASCII data } - pBuffer += SizeOfVector256InBytes; + pBuffer += Vector256.Size; } while (pBuffer <= pFinalVectorReadPos); // Adjust the remaining buffer length for the number of elements we just consumed. 
@@ -206,7 +204,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu } else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) { - uint SizeOfVector128InBytes = (uint)Vector128.Size; // JIT will make this a const if (!VectorContainsNonAsciiChar(Vector128.Load(pBuffer))) { @@ -214,12 +211,12 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. - byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InBytes; - pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1)); + byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector128.Size; + pBuffer = (byte*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1)); #if DEBUG long numBytesRead = pBuffer - pOriginalBuffer; - Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128InBytes, "We should've made forward progress of at least one byte."); + Debug.Assert(0 < numBytesRead && numBytesRead <= Vector128.Size, "We should've made forward progress of at least one byte."); Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); #endif @@ -227,13 +224,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { - Debug.Assert((nuint)pBuffer % SizeOfVector128InBytes == 0, "Vector read should be aligned."); + Debug.Assert((nuint)pBuffer % Vector128.Size == 0, "Vector read should be aligned."); if (VectorContainsNonAsciiChar(Vector128.LoadAligned(pBuffer))) { break; // found non-ASCII data } - pBuffer += SizeOfVector128InBytes; + pBuffer += Vector128.Size; } while (pBuffer <= pFinalVectorReadPos); // Adjust the remaining buffer length for the number of elements we just consumed. 
@@ -898,8 +895,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu // Only run the loop if we have at least two vectors we can pull out. if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) { - uint SizeOfVector512InChars = (uint)Vector512.Count; // JIT will make this a const - uint SizeOfVector512InBytes = (uint)Vector512.Size; // JIT will make this a const + const uint SizeOfVector512InChars = Vector512.Size / sizeof(ushort); Vector512 asciiMask = Vector512.Create((ushort) 0xFF80); @@ -910,7 +906,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InChars; - pBuffer = (char*)(((nuint)pBuffer + SizeOfVector512InBytes) & ~(nuint)(SizeOfVector512InBytes - 1)); + pBuffer = (char*)(((nuint)pBuffer + Vector512.Size) & ~(nuint)(Vector512.Size - 1)); #if DEBUG long numCharsRead = pBuffer - pOriginalBuffer; @@ -937,8 +933,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu } else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) { - uint SizeOfVector256InChars = (uint)Vector256.Count; // JIT will make this a const - uint SizeOfVector256InBytes = (uint)Vector256.Size; // JIT will make this a const + const uint SizeOfVector256InChars = Vector256.Size / sizeof(ushort); Vector256 asciiMask = Vector256.Create((ushort) 0xFF80); @@ -949,7 +944,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InChars; - pBuffer = (char*)(((nuint)pBuffer + SizeOfVector256InBytes) & ~(nuint)(SizeOfVector256InBytes - 1)); + pBuffer = (char*)(((nuint)pBuffer + Vector256.Size) & ~(nuint)(Vector256.Size - 1)); #if DEBUG long numCharsRead = pBuffer - pOriginalBuffer; @@ -976,8 +971,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu } else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) { - uint SizeOfVector128InChars = (uint)Vector128.Count; // JIT will make this a const - uint SizeOfVector128InBytes = (uint)Vector128.Size; // JIT will make this a const + const uint SizeOfVector128InChars = Vector128.Size / sizeof(ushort); // JIT will make this a const Vector128 asciiMask = Vector128.Create((ushort) 0xFF80); @@ -987,7 +981,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InChars; - pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1)); + pBuffer = (char*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1)); #if DEBUG long numCharsRead = pBuffer - pOriginalBuffer; @@ -1245,8 +1239,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff // JIT turns the below into constants - uint SizeOfVector128InBytes = (uint)sizeof(Vector128); - uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char); + uint SizeOfVector128InChars = Vector128.Size / sizeof(char); Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Should've been checked by caller."); Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 assumes little-endian."); @@ -1284,14 +1277,14 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff bufferLength <<= 1; // chars to bytes - if (bufferLength < 2 * SizeOfVector128InBytes) + if (bufferLength < 2 * Vector128.Size) { goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead; } // Now adjust the read pointer so that future reads are aligned. - pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1)); + pBuffer = (char*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1)); #if DEBUG long numCharsRead = pBuffer - pOriginalBuffer; @@ -1306,9 +1299,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff // The buffer is now properly aligned. // Read 2 vectors at a time if possible. - if (bufferLength >= 2 * SizeOfVector128InBytes) + if (bufferLength >= 2 * Vector128.Size) { - char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes); + char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * Vector128.Size); // After this point, we no longer need to update the bufferLength value. 
do @@ -1337,7 +1330,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff // If there is fewer than one vector length remaining, skip the next aligned read. // Remember, at this point bufferLength is measured in bytes, not chars. - if ((bufferLength & SizeOfVector128InBytes) == 0) + if ((bufferLength & Vector128.Size) == 0) { goto DoFinalUnalignedVectorRead; } @@ -1357,12 +1350,12 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff DoFinalUnalignedVectorRead: - if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0) + if (((byte)bufferLength & (Vector128.Size - 1)) != 0) { // Perform an unaligned read of the last vector. // We need to adjust the pointer because we're re-reading data. - pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes); + pBuffer = (char*)((byte*)pBuffer + (bufferLength & (Vector128.Size - 1)) - Vector128.Size); firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer); if (VectorContainsNonAsciiChar(firstVector)) { @@ -1867,20 +1860,6 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool VectorContainsNonAsciiChar(Vector256 asciiVector) - { - // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii - return asciiVector.ExtractMostSignificantBits() != 0; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool VectorContainsNonAsciiChar(Vector512 asciiVector) - { - // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii - return asciiVector.ExtractMostSignificantBits() != 0; - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) { @@ -1955,28 +1934,6 @@ private static bool VectorContainsNonAsciiChar(Vector128 vector) : 
VectorContainsNonAsciiChar(vector.AsUInt16()); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool VectorContainsNonAsciiChar(Vector256 vector) - where T : unmanaged - { - Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(ushort)); - - return typeof(T) == typeof(byte) - ? VectorContainsNonAsciiChar(vector.AsByte()) - : VectorContainsNonAsciiChar(vector.AsUInt16()); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool VectorContainsNonAsciiChar(Vector512 vector) - where T : unmanaged - { - Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(ushort)); - - return typeof(T) == typeof(byte) - ? VectorContainsNonAsciiChar(vector.AsByte()) - : VectorContainsNonAsciiChar(vector.AsUInt16()); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool AllCharsInVectorAreAscii(Vector128 vector) where T : unmanaged @@ -2042,26 +1999,6 @@ private static Vector128 ExtractAsciiVector(Vector128 vectorFirst, } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 ExtractAsciiVector(Vector256 vectorFirst, Vector256 vectorSecond) - { - // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ] - // to a vector of bytes [ b7 ... b0 b7' ... b0']. - - // prefer architecture specific intrinsic as they don't perform additional AND like Vector512.Narrow does - return Vector256.Narrow(vectorFirst, vectorSecond); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector512 ExtractAsciiVector(Vector512 vectorFirst, Vector512 vectorSecond) - { - // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ] - // to a vector of bytes [ b7 ... b0 b7' ... b0']. 
- - // prefer architecture specific intrinsic as they don't perform additional AND like Vector512.Narrow does - return Vector512.Narrow(vectorFirst, vectorSecond); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) { @@ -2189,8 +2126,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff // JIT turns the below into constants - uint SizeOfVector256 = (uint)Vector256.Count; - nuint MaskOfAllBitsInVector256 = (nuint)(SizeOfVector256 - 1); + const nuint MaskOfAllBitsInVector256 = (nuint)(Vector256.Size - 1); // This method is written such that control generally flows top-to-bottom, avoiding // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII @@ -2198,7 +2134,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff Debug.Assert(Vector256.IsHardwareAccelerated, "Vector256 is required."); Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian."); - Debug.Assert(elementCount >= 2 * SizeOfVector256); + Debug.Assert(elementCount >= 2 * Vector256.Size); // First, perform an unaligned read of the first part of the input buffer. ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer; @@ -2213,9 +2149,9 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination. 
ref byte asciiBuffer = ref *pAsciiBuffer; - Vector256 asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0); - nuint currentOffsetInElements = SizeOfVector256 / 2; // we processed 16 elements so far + Vector256 asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, 0); + nuint currentOffsetInElements = Vector256.Size / 2; // we processed 16 elements so far // We're going to get the best performance when we have aligned writes, so we'll take the // hit of potentially unaligned reads in order to hit this sweet spot. @@ -2227,7 +2163,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump // just past the next aligned boundary address. - if (((uint)pAsciiBuffer & (SizeOfVector256 / 2)) == 0) + if (((uint)pAsciiBuffer & (Vector256.Size / 2)) == 0) { // We need to perform one more partial vector write before we can get the alignment we want. @@ -2239,26 +2175,26 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff } // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. - asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); + asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); } // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment // point, then use that as the base offset going forward. 
- currentOffsetInElements = SizeOfVector256 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector256); + currentOffsetInElements = Vector256.Size - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector256); - Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector256, "We wrote at least 1 byte but no more than a whole vector."); + Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= Vector256.Size, "We wrote at least 1 byte but no more than a whole vector."); Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer."); - Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector256, "We should be able to run at least one whole vector."); + Debug.Assert(elementCount - currentOffsetInElements >= Vector256.Size, "We should be able to run at least one whole vector."); - nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector256; + nuint finalOffsetWhereCanRunLoop = elementCount - Vector256.Size; do { // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector. utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); - Vector256 utf16VectorSecond = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + SizeOfVector256 / sizeof(short)); + Vector256 utf16VectorSecond = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + Vector256.Size / sizeof(short)); Vector256 combinedVector = utf16VectorFirst | utf16VectorSecond; if (VectorContainsNonAsciiChar(combinedVector)) @@ -2268,11 +2204,11 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff // Build up the ASCII vector and perform the store. 
- Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector256 == 0, "Write should be aligned."); - asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorSecond); + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector256.Size == 0, "Write should be aligned."); + asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorSecond); asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements); - currentOffsetInElements += SizeOfVector256; + currentOffsetInElements += Vector256.Size; } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop); Finish: @@ -2292,9 +2228,9 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector. Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); - asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); - currentOffsetInElements += SizeOfVector256 / 2; + asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + currentOffsetInElements += Vector256.Size / 2; goto Finish; } @@ -2307,8 +2243,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff // JIT turns the below into constants - uint SizeOfVector512 = (uint)Vector512.Count; - nuint MaskOfAllBitsInVector512 = (nuint)(SizeOfVector512 - 1); + const nuint MaskOfAllBitsInVector512 = (nuint)(Vector512.Size - 1); // This method is written such that control generally flows top-to-bottom, avoiding // jumps as much as possible in the optimistic case of "all ASCII". 
If we see non-ASCII @@ -2316,7 +2251,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff Debug.Assert(Vector512.IsHardwareAccelerated, "Vector512 is required."); Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian."); - Debug.Assert(elementCount >= 2 * SizeOfVector512); + Debug.Assert(elementCount >= 2 * Vector512.Size); // First, perform an unaligned read of the first part of the input buffer. ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer; @@ -2331,9 +2266,9 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. ref byte asciiBuffer = ref *pAsciiBuffer; - Vector512 asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0); // how to store the lower part of a avx512 - nuint currentOffsetInElements = SizeOfVector512 / 2; // we processed 32 elements so far + Vector512 asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, 0); // how to store the lower part of a avx512 + nuint currentOffsetInElements = Vector512.Size / 2; // we processed 32 elements so far // We're going to get the best performance when we have aligned writes, so we'll take the // hit of potentially unaligned reads in order to hit this sweet spot. @@ -2346,7 +2281,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump // just past the next aligned boundary address. - if (((uint)pAsciiBuffer & (SizeOfVector512 / 2)) == 0) + if (((uint)pAsciiBuffer & (Vector512.Size / 2)) == 0) { // We need to perform one more partial vector write before we can get the alignment we want. 
@@ -2358,26 +2293,26 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff } // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. - asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); + asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); } // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment // point, then use that as the base offset going forward. - currentOffsetInElements = SizeOfVector512 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector512); + currentOffsetInElements = Vector512.Size - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector512); - Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector512, "We wrote at least 1 byte but no more than a whole vector."); + Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= Vector512.Size, "We wrote at least 1 byte but no more than a whole vector."); Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer."); - Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector512, "We should be able to run at least one whole vector."); + Debug.Assert(elementCount - currentOffsetInElements >= Vector512.Size, "We should be able to run at least one whole vector."); - nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector512; + nuint finalOffsetWhereCanRunLoop = elementCount - Vector512.Size; do { // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector. 
utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); - Vector512 utf16VectorSecond = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + SizeOfVector512 / sizeof(short)); + Vector512 utf16VectorSecond = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + Vector512.Size / sizeof(short)); Vector512 combinedVector = utf16VectorFirst | utf16VectorSecond; if (VectorContainsNonAsciiChar(combinedVector)) @@ -2387,11 +2322,11 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff // Build up the ASCII vector and perform the store. - Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector512 == 0, "Write should be aligned."); - asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorSecond); + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector512.Size == 0, "Write should be aligned."); + asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorSecond); asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements); - currentOffsetInElements += SizeOfVector512; + currentOffsetInElements += Vector512.Size; } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop); Finish: @@ -2411,9 +2346,9 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector. 
Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); - asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); - currentOffsetInElements += SizeOfVector512 / 2; + asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + currentOffsetInElements += Vector512.Size / 2; goto Finish; } From 5d06c67e42331736f114eb879aed3dfb7f54bebb Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 13 Jul 2023 20:13:51 -0700 Subject: [PATCH 8/9] Resolve comments --- .../src/System/Text/Ascii.Utility.cs | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 7d74bfb6d8fa..9aa79da62211 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -151,9 +151,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned."); - if (Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) + ulong msbMask = Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits(); + if (msbMask != 0) { - break; // found non-ASCII data + return (nuint)pBuffer - (nuint)pOriginalBuffer + (nuint)BitOperations.TrailingZeroCount(msbMask); // found non-ASCII data } pBuffer += Vector512.Size; @@ -188,9 +189,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned."); - if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) + uint msbMask = 
Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits(); + if (msbMask != 0) { - break; // found non-ASCII data + return (nuint)pBuffer - (nuint)pOriginalBuffer + (nuint)BitOperations.TrailingZeroCount(msbMask); // found non-ASCII data } pBuffer += Vector256.Size; @@ -897,8 +899,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu { const uint SizeOfVector512InChars = Vector512.Size / sizeof(ushort); - Vector512 asciiMask = Vector512.Create((ushort) 0xFF80); - if (!VectorContainsNonAsciiChar(Vector512.Load((ushort*)pBuffer))) { // The first several elements of the input buffer were ASCII. Bump up the pointer to the @@ -918,7 +918,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu do { - Debug.Assert((nuint)pBuffer % SizeOfVector512InChars == 0, "Vector read should be aligned."); + Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned."); if (VectorContainsNonAsciiChar(Vector512.LoadAligned((ushort*)pBuffer))) { break; // found non-ASCII data @@ -935,8 +935,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu { const uint SizeOfVector256InChars = Vector256.Size / sizeof(ushort); - Vector256 asciiMask = Vector256.Create((ushort) 0xFF80); - if (!VectorContainsNonAsciiChar(Vector256.Load((ushort*)pBuffer))) { // The first several elements of the input buffer were ASCII. 
Bump up the pointer to the @@ -956,7 +954,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu do { - Debug.Assert((nuint)pBuffer % SizeOfVector256InChars == 0, "Vector read should be aligned."); + Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned."); if (VectorContainsNonAsciiChar(Vector256.LoadAligned((ushort*)pBuffer))) { break; // found non-ASCII data @@ -973,8 +971,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu { const uint SizeOfVector128InChars = Vector128.Size / sizeof(ushort); // JIT will make this a const - Vector128 asciiMask = Vector128.Create((ushort) 0xFF80); - if (!VectorContainsNonAsciiChar(Vector128.Load((ushort*)pBuffer))) { // The first several elements of the input buffer were ASCII. Bump up the pointer to the @@ -993,7 +989,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu do { - Debug.Assert((nuint)pBuffer % SizeOfVector128InChars == 0, "Vector read should be aligned."); + Debug.Assert((nuint)pBuffer % Vector128.Size == 0, "Vector read should be aligned."); if (VectorContainsNonAsciiChar(Vector128.LoadAligned((ushort*)pBuffer))) { break; // found non-ASCII data @@ -1130,7 +1126,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, n do { - Debug.Assert((nuint)pBuffer % SizeOfVectorInChars == 0, "Vector read should be aligned."); + Debug.Assert((nuint)pBuffer % SizeOfVectorInBytes == 0, "Vector read should be aligned."); if (Vector.GreaterThanAny(Unsafe.Read>(pBuffer), maxAscii)) { break; // found non-ASCII data @@ -2157,11 +2153,11 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff // hit of potentially unaligned reads in order to hit this sweet spot. // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote - // the 8 bytes previously. 
If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote - // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In + // the 16 bytes previously. If the 0x10 bit is set at the pinned address, then the 16 bytes we wrote + // previously mean that the 0x10 bit is *not* set at address &pAsciiBuffer[SizeOfVector256 / 2]. In // that case we can immediately back up to the previous aligned boundary and start the main loop. - // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at - // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump + // If the 0x10 bit is *not* set at the pinned address, then it means the 0x10 bit *is* set at + // address &pAsciiBuffer[SizeOfVector256 / 2], and we should perform one more 16-byte write to bump // just past the next aligned boundary address. if (((uint)pAsciiBuffer & (Vector256.Size / 2)) == 0) { @@ -2174,7 +2170,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff goto Finish; } - // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. + // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination. asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst); asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); } @@ -2227,7 +2223,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector. 
- Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector128.Size == 0, "Destination should be 128-bit-aligned."); asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst); asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); currentOffsetInElements += Vector256.Size / 2; @@ -2274,11 +2270,11 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff // hit of potentially unaligned reads in order to hit this sweet spot. // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote - // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote - // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In + // the 32 bytes previously. If the 0x20 bit is set at the pinned address, then the 32 bytes we wrote + // previously mean that the 0x20 bit is *not* set at address &pAsciiBuffer[SizeOfVector512 / 2]. In // that case we can immediately back up to the previous aligned boundary and start the main loop. - // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at - // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump + // If the 0x20 bit is *not* set at the pinned address, then it means the 0x20 bit *is* set at + // address &pAsciiBuffer[SizeOfVector512 / 2], and we should perform one more 32-byte write to bump // just past the next aligned boundary address. if (((uint)pAsciiBuffer & (Vector512.Size / 2)) == 0) @@ -2345,7 +2341,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector. 
- Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector256.Size == 0, "Destination should be 256-bit-aligned."); asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst); asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); currentOffsetInElements += Vector512.Size / 2; @@ -2384,9 +2380,9 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B break; } - (Vector512 low, Vector512 upper) = Vector512.Widen(asciiVector); - low.Store(pCurrentWriteAddress); - upper.Store(pCurrentWriteAddress + Vector512.Count); + (Vector512 utf16LowVector, Vector512 utf16HighVector) = Vector512.Widen(asciiVector); + utf16LowVector.Store(pCurrentWriteAddress); + utf16HighVector.Store(pCurrentWriteAddress + Vector512.Count); currentOffset += (nuint)Vector512.Count; pCurrentWriteAddress += (nuint)Vector512.Count; From 17d3b28368e9afc69efcc6da3d50fef448bf7c98 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 14 Jul 2023 10:00:59 -0700 Subject: [PATCH 9/9] revert the changes at GetIndexOfFirstNonAsciiByte --- .../src/System/Text/Ascii.Utility.cs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 9aa79da62211..e30b3d06c781 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -151,10 +151,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned."); - ulong msbMask = Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits(); - if (msbMask != 0) + if 
(Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) { - return (nuint)pBuffer - (nuint)pOriginalBuffer + (nuint)BitOperations.TrailingZeroCount(msbMask); // found non-ASCII data + break; // found non-ASCII data } pBuffer += Vector512.Size; @@ -189,10 +188,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu do { Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned."); - uint msbMask = Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits(); - if (msbMask != 0) + if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) { - return (nuint)pBuffer - (nuint)pOriginalBuffer + (nuint)BitOperations.TrailingZeroCount(msbMask); // found non-ASCII data + break; // found non-ASCII data } pBuffer += Vector256.Size;