From 12bb8928f104f138221e4f542af82115722f86af Mon Sep 17 00:00:00 2001
From: Ruihan-Yin <ruihan.yin@intel.com>
Date: Mon, 5 Jun 2023 15:13:54 -0700
Subject: [PATCH 1/9] Lib upgrade for ToUtf16

---
 .../src/System/Text/Ascii.Utility.cs          | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index 69b3cc125c08..c0fb439c2132 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -1654,7 +1654,31 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
             {
                 ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;
 
-                if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256<byte>.Count)
+                if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512<byte>.Count)
+                {
+                    // Calculating the destination address outside the loop results in significant
+                    // perf wins vs. relying on the JIT to fold memory addressing logic into the
+                    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
+                    nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512<byte>.Count;
+
+                    do
+                    {
+                        Vector512<byte> asciiVector = Vector512.Load(pAsciiBuffer + currentOffset);
+
+                        if (asciiVector.ExtractMostSignificantBits() != 0)
+                        {
+                            break;
+                        }
+
+                        (Vector512<ushort> low, Vector512<ushort> upper) = Vector512.Widen(asciiVector);
+                        low.Store(pCurrentWriteAddress);
+                        upper.Store(pCurrentWriteAddress + Vector512<ushort>.Count);
+
+                        currentOffset += (nuint)Vector512<byte>.Count;
+                        pCurrentWriteAddress += (nuint)Vector512<byte>.Count;
+                    } while (currentOffset <= finalOffsetWhereCanRunLoop);
+                }
+                else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256<byte>.Count)
                 {
                     // Calculating the destination address outside the loop results in significant
                     // perf wins vs. relying on the JIT to fold memory addressing logic into the

From ff185d0db08bc359bcd73e3b7a77c9daf015bfc2 Mon Sep 17 00:00:00 2001
From: Ruihan-Yin <ruihan.yin@intel.com>
Date: Fri, 30 Jun 2023 11:40:44 -0700
Subject: [PATCH 2/9] Upgrade NarrowUtf16ToAscii with Vector512

---
 .../System/Runtime/Intrinsics/Vector512.cs    |  17 ++
 .../src/System/Text/Ascii.Utility.cs          | 167 +++++++++++++++++-
 2 files changed, 183 insertions(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
index 5b740db43009..fad4f94bd660 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
@@ -2670,6 +2670,23 @@ public static void StoreUnsafe<T>(this Vector512<T> source, ref T destination)
             ref byte address = ref Unsafe.As<T, byte>(ref destination);
             Unsafe.WriteUnaligned(ref address, source);
         }
+        /// <summary>
+        /// Stores the lower 256 bits of <paramref name="source"/> to memory at <paramref name="destination"/>[<paramref name="elementOffset"/>].
+        /// </summary>
+        /// <typeparam name="T">The type of the elements in the vector.</typeparam>
+        /// <param name="source">The vector whose lower half will be stored.</param>
+        /// <param name="destination">The destination to which <paramref name="elementOffset" /> will be added before the lower half is stored.</param>
+        /// <param name="elementOffset">The element offset from <paramref name="destination" /> at which the lower half will be stored.</param>
+        /// <remarks>
+        /// Writes the lower half of the vector with a single unaligned write; the target address does not need to be aligned.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void StoreLowerUnsafe<T>(this Vector512<T> source, ref T destination, nuint elementOffset = 0)
+        {
+            ThrowHelper.ThrowForUnsupportedIntrinsicsVector512BaseType<T>();
+            ref byte address = ref Unsafe.As<T, byte>(ref Unsafe.Add(ref destination, elementOffset));
+            Unsafe.WriteUnaligned(ref address, source._lower);
+        }
 
         /// <summary>Stores a vector at the given destination.</summary>
         /// <typeparam name="T">The type of the elements in the vector.</typeparam>
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index c0fb439c2132..3b72bb32ae6c 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -1172,7 +1172,36 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii
             uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
             ulong utf16Data64Bits = 0;
 
-            if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)
+            if (Vector512.IsHardwareAccelerated && BitConverter.IsLittleEndian)
+            {
+                if (elementCount >= 2 * (uint)Vector512<byte>.Count)
+                {
+                    // Since there's overhead to setting up the vectorized code path, we only want to
+                    // call into it after a quick probe to ensure the next immediate characters really are ASCII.
+                    // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
+
+                    if (IntPtr.Size >= 8)
+                    {
+                        utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
+                        if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
+                        {
+                            goto FoundNonAsciiDataIn64BitRead;
+                        }
+                    }
+                    else
+                    {
+                        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
+                        utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
+                        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
+                        {
+                            goto FoundNonAsciiDataIn64BitRead;
+                        }
+                    }
+
+                    currentOffset = NarrowUtf16ToAscii_Intrinsified_512(pUtf16Buffer, pAsciiBuffer, elementCount);
+                }
+            }
+            else if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)
             {
                 if (elementCount >= 2 * (uint)Vector128<byte>.Count)
                 {
@@ -1444,6 +1473,13 @@ private static bool VectorContainsNonAsciiChar(Vector128<ushort> utf16Vector)
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool VectorContainsNonAsciiChar(Vector512<ushort> utf16Vector)
+        {
+            ulong MostSignificantBitIsSet = Vector512.ExtractMostSignificantBits(utf16Vector);
+            return MostSignificantBitIsSet != 0;
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool VectorContainsNonAsciiChar<T>(Vector128<T> vector)
             where T : unmanaged
@@ -1520,6 +1556,16 @@ private static Vector128<byte> ExtractAsciiVector(Vector128<ushort> vectorFirst,
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector512<byte> ExtractAsciiVector(Vector512<ushort> vectorFirst, Vector512<ushort> vectorSecond)
+        {
+            // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ]
+            // to a vector of bytes [ b7 ... b0 b7' ... b0'].
+
+            // prefer architecture specific intrinsic as they don't perform additional AND like Vector128.Narrow does
+            return Vector512.Narrow(vectorFirst, vectorSecond);
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
         {
@@ -1639,6 +1685,125 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer,
             goto Finish;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
+        {
+            // This method contains logic optimized using vector instructions for x64 only.
+            // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support.
+
+            // JIT turns the below into constants
+
+            uint SizeOfVector512 = (uint)Vector512<byte>.Count;
+            nuint MaskOfAllBitsInVector512 = (nuint)(SizeOfVector512 - 1);
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
+            // data, we jump out of the hot paths to targets at the end of the method.
+
+            Debug.Assert(Vector512.IsHardwareAccelerated, "Vector512 is required.");
+            Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian.");
+            Debug.Assert(elementCount >= 2 * SizeOfVector512);
+
+            // First, perform an unaligned read of the first part of the input buffer.
+            ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer;
+            Vector512<ushort> utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer);
+
+            // If there's non-ASCII data in the first 32 elements of the vector, there's nothing we can do.
+            if (VectorContainsNonAsciiChar(utf16VectorFirst))
+            {
+                return 0;
+            }
+
+            // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination.
+
+            ref byte asciiBuffer = ref *pAsciiBuffer;
+            Vector512<byte> asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
+            asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0); // write only the lower half (32 bytes) of the narrowed vector
+            nuint currentOffsetInElements = SizeOfVector512 / 2; // we processed 32 elements so far
+
+            // We're going to get the best performance when we have aligned writes, so we'll take the
+            // hit of potentially unaligned reads in order to hit this sweet spot.
+
+            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
+            // the 32 bytes previously. If the 0x20 bit is set at the pinned address, then the 32 bytes we wrote
+            // previously mean that the 0x20 bit is *not* set at address &pAsciiBuffer[SizeOfVector512 / 2]. In
+            // that case we can immediately back up to the previous aligned boundary and start the main loop.
+            // If the 0x20 bit is *not* set at the pinned address, then it means the 0x20 bit *is* set at
+            // address &pAsciiBuffer[SizeOfVector512 / 2], and we should perform one more 32-byte write to bump
+            // just past the next aligned boundary address.
+
+            if (((uint)pAsciiBuffer & (SizeOfVector512 / 2)) == 0)
+            {
+                // We need to perform one more partial vector write before we can get the alignment we want.
+
+                utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements);
+
+                if (VectorContainsNonAsciiChar(utf16VectorFirst))
+                {
+                    goto Finish;
+                }
+
+                // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination.
+                asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
+                asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
+            }
+
+            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
+            // point, then use that as the base offset going forward.
+
+            currentOffsetInElements = SizeOfVector512 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector512);
+
+            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector512, "We wrote at least 1 byte but no more than a whole vector.");
+            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
+            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector512, "We should be able to run at least one whole vector.");
+
+            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector512;
+            do
+            {
+                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
+
+                utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements);
+                Vector512<ushort> utf16VectorSecond = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + SizeOfVector512 / sizeof(short));
+                Vector512<ushort> combinedVector = utf16VectorFirst | utf16VectorSecond;
+
+                if (VectorContainsNonAsciiChar(combinedVector))
+                {
+                    goto FoundNonAsciiDataInLoop;
+                }
+
+                // Build up the ASCII vector and perform the store.
+
+                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector512 == 0, "Write should be aligned.");
+                asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorSecond);
+                asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
+
+                currentOffsetInElements += SizeOfVector512;
+            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
+
+        Finish:
+
+            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
+            return currentOffsetInElements;
+
+        FoundNonAsciiDataInLoop:
+
+            // Can we at least narrow the high vector?
+            // See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works.
+            if (VectorContainsNonAsciiChar(utf16VectorFirst))
+            {
+                goto Finish;
+            }
+
+            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
+
+            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
+            asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
+            asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
+            currentOffsetInElements += SizeOfVector512 / 2;
+
+            goto Finish;
+        }
+
         /// <summary>
         /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
         /// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered

From 9d45ff881e0f8a5dcf781bd9cbed466c7340c4fd Mon Sep 17 00:00:00 2001
From: Ruihan-Yin <ruihan.yin@intel.com>
Date: Wed, 5 Jul 2023 16:53:13 -0700
Subject: [PATCH 3/9] Complete the upgrade in NarrowUtf16ToAscii method with
 Vector512 and Vector256 APIs.

---
 .../System/Runtime/Intrinsics/Vector256.cs    |  18 ++
 .../src/System/Text/Ascii.Utility.cs          | 215 ++++++++++++++----
 2 files changed, 184 insertions(+), 49 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
index 758d0291af49..e36c41f50338 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
@@ -2634,6 +2634,24 @@ public static void StoreUnsafe<T>(this Vector256<T> source, ref T destination, n
             Unsafe.WriteUnaligned(ref Unsafe.As<T, byte>(ref destination), source);
         }
 
+        /// <summary>
+        /// Stores the lower 128 bits of <paramref name="source"/> to memory at <paramref name="destination"/>[<paramref name="elementOffset"/>].
+        /// </summary>
+        /// <typeparam name="T">The type of the elements in the vector.</typeparam>
+        /// <param name="source">The vector whose lower half will be stored.</param>
+        /// <param name="destination">The destination to which <paramref name="elementOffset" /> will be added before the lower half is stored.</param>
+        /// <param name="elementOffset">The element offset from <paramref name="destination" /> at which the lower half will be stored.</param>
+        /// <remarks>
+        /// Writes the lower half of the vector with a single unaligned write; the target address does not need to be aligned.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void StoreLowerUnsafe<T>(this Vector256<T> source, ref T destination, nuint elementOffset = 0)
+        {
+            ThrowHelper.ThrowForUnsupportedIntrinsicsVector256BaseType<T>();
+            ref byte address = ref Unsafe.As<T, byte>(ref Unsafe.Add(ref destination, elementOffset));
+            Unsafe.WriteUnaligned(ref address, source._lower);
+        }
+
         /// <summary>Subtracts two vectors to compute their difference.</summary>
         /// <param name="left">The vector from which <paramref name="right" /> will be subtracted.</param>
         /// <param name="right">The vector to subtract from <paramref name="left" />.</param>
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index 3b72bb32ae6c..8b2943c5136b 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -1172,61 +1172,39 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii
             uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
             ulong utf16Data64Bits = 0;
 
-            if (Vector512.IsHardwareAccelerated && BitConverter.IsLittleEndian)
+            if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian && elementCount >= 2 * (uint)Vector128<byte>.Count)
             {
-                if (elementCount >= 2 * (uint)Vector512<byte>.Count)
-                {
-                    // Since there's overhead to setting up the vectorized code path, we only want to
-                    // call into it after a quick probe to ensure the next immediate characters really are ASCII.
-                    // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
+                // Since there's overhead to setting up the vectorized code path, we only want to
+                // call into it after a quick probe to ensure the next immediate characters really are ASCII.
+                // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
 
-                    if (IntPtr.Size >= 8)
+                if (IntPtr.Size >= 8)
+                {
+                    utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
+                    if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
                     {
-                        utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
-                        if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
+                        goto FoundNonAsciiDataIn64BitRead;
                     }
-                    else
+                }
+                else
+                {
+                    utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
+                    utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
+                    if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
                     {
-                        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
-                        utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
-                        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
+                        goto FoundNonAsciiDataIn64BitRead;
                     }
-
+                }
+                if (Vector512.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector512<byte>.Count)
+                {
                     currentOffset = NarrowUtf16ToAscii_Intrinsified_512(pUtf16Buffer, pAsciiBuffer, elementCount);
                 }
-            }
-            else if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)
-            {
-                if (elementCount >= 2 * (uint)Vector128<byte>.Count)
+                else if (Vector256.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector256<byte>.Count)
+                {
+                    currentOffset = NarrowUtf16ToAscii_Intrinsified_256(pUtf16Buffer, pAsciiBuffer, elementCount);
+                }
+                else
                 {
-                    // Since there's overhead to setting up the vectorized code path, we only want to
-                    // call into it after a quick probe to ensure the next immediate characters really are ASCII.
-                    // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
-
-                    if (IntPtr.Size >= 8)
-                    {
-                        utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
-                        if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-                    }
-                    else
-                    {
-                        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
-                        utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
-                        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-                    }
-
                     currentOffset = NarrowUtf16ToAscii_Intrinsified(pUtf16Buffer, pAsciiBuffer, elementCount);
                 }
             }
@@ -1473,11 +1451,22 @@ private static bool VectorContainsNonAsciiChar(Vector128<ushort> utf16Vector)
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool VectorContainsNonAsciiChar(Vector256<ushort> utf16Vector)
+        {
+            const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80
+            Vector256<ushort> zeroIsAscii = utf16Vector & Vector256.Create(asciiMask);
+            // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data.
+            return zeroIsAscii != Vector256<ushort>.Zero;
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool VectorContainsNonAsciiChar(Vector512<ushort> utf16Vector)
         {
-            ulong MostSignificantBitIsSet = Vector512.ExtractMostSignificantBits(utf16Vector);
-            return MostSignificantBitIsSet != 0;
+            const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80
+            Vector512<ushort> zeroIsAscii = utf16Vector & Vector512.Create(asciiMask);
+            // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data.
+            return zeroIsAscii != Vector512<ushort>.Zero;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -1556,13 +1545,23 @@ private static Vector128<byte> ExtractAsciiVector(Vector128<ushort> vectorFirst,
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> ExtractAsciiVector(Vector256<ushort> vectorFirst, Vector256<ushort> vectorSecond)
+        {
+            // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ]
+            // to a vector of bytes [ b7 ... b0 b7' ... b0'].
+
+            // No architecture-specific intrinsic is used here; this relies on the cross-platform Vector256.Narrow helper.
+            return Vector256.Narrow(vectorFirst, vectorSecond);
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector512<byte> ExtractAsciiVector(Vector512<ushort> vectorFirst, Vector512<ushort> vectorSecond)
         {
             // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ]
             // to a vector of bytes [ b7 ... b0 b7' ... b0'].
 
-            // prefer architecture specific intrinsic as they don't perform additional AND like Vector128.Narrow does
+            // No architecture-specific intrinsic is used here; this relies on the cross-platform Vector512.Narrow helper.
             return Vector512.Narrow(vectorFirst, vectorSecond);
         }
 
@@ -1685,6 +1684,124 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer,
             goto Finish;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
+        {
+            // This method contains logic optimized using vector instructions for x64 only.
+            // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support.
+
+            // JIT turns the below into constants
+
+            uint SizeOfVector256 = (uint)Vector256<byte>.Count;
+            nuint MaskOfAllBitsInVector256 = (nuint)(SizeOfVector256 - 1);
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
+            // data, we jump out of the hot paths to targets at the end of the method.
+
+            Debug.Assert(Vector256.IsHardwareAccelerated, "Vector256 is required.");
+            Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian.");
+            Debug.Assert(elementCount >= 2 * SizeOfVector256);
+
+            // First, perform an unaligned read of the first part of the input buffer.
+            ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer;
+            Vector256<ushort> utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer);
+
+            // If there's non-ASCII data in the first 16 elements of the vector, there's nothing we can do.
+            if (VectorContainsNonAsciiChar(utf16VectorFirst))
+            {
+                return 0;
+            }
+
+            // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination.
+
+            ref byte asciiBuffer = ref *pAsciiBuffer;
+            Vector256<byte> asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
+            asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0);
+            nuint currentOffsetInElements = SizeOfVector256 / 2; // we processed 16 elements so far
+
+            // We're going to get the best performance when we have aligned writes, so we'll take the
+            // hit of potentially unaligned reads in order to hit this sweet spot.
+
+            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
+            // the 16 bytes previously. If the 0x10 bit is set at the pinned address, then the 16 bytes we wrote
+            // previously mean that the 0x10 bit is *not* set at address &pAsciiBuffer[SizeOfVector256 / 2]. In
+            // that case we can immediately back up to the previous aligned boundary and start the main loop.
+            // If the 0x10 bit is *not* set at the pinned address, then it means the 0x10 bit *is* set at
+            // address &pAsciiBuffer[SizeOfVector256 / 2], and we should perform one more 16-byte write to bump
+            // just past the next aligned boundary address.
+            if (((uint)pAsciiBuffer & (SizeOfVector256 / 2)) == 0)
+            {
+                // We need to perform one more partial vector write before we can get the alignment we want.
+
+                utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements);
+
+                if (VectorContainsNonAsciiChar(utf16VectorFirst))
+                {
+                    goto Finish;
+                }
+
+                // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination.
+                asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
+                asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
+            }
+
+            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
+            // point, then use that as the base offset going forward.
+
+            currentOffsetInElements = SizeOfVector256 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector256);
+
+            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector256, "We wrote at least 1 byte but no more than a whole vector.");
+            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
+            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector256, "We should be able to run at least one whole vector.");
+
+            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector256;
+            do
+            {
+                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
+
+                utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements);
+                Vector256<ushort> utf16VectorSecond = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + SizeOfVector256 / sizeof(short));
+                Vector256<ushort> combinedVector = utf16VectorFirst | utf16VectorSecond;
+
+                if (VectorContainsNonAsciiChar(combinedVector))
+                {
+                    goto FoundNonAsciiDataInLoop;
+                }
+
+                // Build up the ASCII vector and perform the store.
+
+                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector256 == 0, "Write should be aligned.");
+                asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorSecond);
+                asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
+
+                currentOffsetInElements += SizeOfVector256;
+            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
+
+        Finish:
+
+            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
+            return currentOffsetInElements;
+
+        FoundNonAsciiDataInLoop:
+
+            // Can we at least narrow the high vector?
+            // See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works.
+            if (VectorContainsNonAsciiChar(utf16VectorFirst))
+            {
+                goto Finish;
+            }
+
+            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
+
+            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
+            asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
+            asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
+            currentOffsetInElements += SizeOfVector256 / 2;
+
+            goto Finish;
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
         {

From 09b8e3117d062be85299b6a284c44901016476f2 Mon Sep 17 00:00:00 2001
From: "Canino, Anthony" <anthony.canino@intel.com>
Date: Wed, 5 Jul 2023 17:05:22 -0700
Subject: [PATCH 4/9] Adding VectorXX paths to `GetIndexOfFirstNonAscii`
 functions.

---
 .../src/System/Text/Ascii.Utility.cs          | 465 +++++++++++++++++-
 .../tests/Ascii/FromUtf16Tests.cs             |  16 +-
 .../tests/Ascii/ToUtf16Tests.cs               |  16 +-
 3 files changed, 475 insertions(+), 22 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index 8b2943c5136b..5a3be66531a9 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -101,9 +101,241 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu
             // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
             // this method is running.
 
-            return (Sse2.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
-                ? GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength)
-                : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
+            if (Vector512.IsHardwareAccelerated || Vector256.IsHardwareAccelerated)
+            {
+                return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength);
+            }
+            else if (Sse2.IsSupported || (AdvSimd.IsSupported && BitConverter.IsLittleEndian))
+            {
+                return GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength);
+            }
+            else
+            {
+                return GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
+            }
+        }
+
+        private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nuint bufferLength)
+        {
+            // Squirrel away the original buffer reference. This method works by determining the exact
+            // byte reference where non-ASCII data begins, so we need this base value to perform the
+            // final subtraction at the end of the method to get the index into the original buffer.
+
+            byte* pOriginalBuffer = pBuffer;
+
+            // Before we drain off byte-by-byte, try a generic vectorized loop.
+            // Only run the loop if we have at least two vectors we can pull out.
+            // Note: the vector paths below test the most significant bit of every byte
+            // (ExtractMostSignificantBits != 0) as the surrogate for "contains non-ASCII data".
+
+            if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512<byte>.Count)
+            {
+                uint SizeOfVector512InBytes = (uint)Vector512<byte>.Count; // JIT will make this a const
+
+                if (Unsafe.ReadUnaligned<Vector512<byte>>(pBuffer).ExtractMostSignificantBits() == 0)
+                {
+                    // The first several elements of the input buffer were ASCII. Bump up the pointer to the
+                    // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
+                    // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
+
+                    byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InBytes;
+                    pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector512InBytes) & ~(nuint)(SizeOfVector512InBytes - 1));
+
+#if DEBUG
+                    long numBytesRead = pBuffer - pOriginalBuffer;
+                    Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector512InBytes, "We should've made forward progress of at least one byte.");
+                    Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+                    Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
+
+                    do
+                    {
+                        Debug.Assert((nuint)pBuffer % SizeOfVector512InBytes == 0, "Vector read should be aligned.");
+                        if (Unsafe.Read<Vector512<byte>>(pBuffer).ExtractMostSignificantBits() != 0)
+                        {
+                            break; // found non-ASCII data
+                        }
+
+                        pBuffer += SizeOfVector512InBytes;
+                    } while (pBuffer <= pFinalVectorReadPos);
+
+                    // Adjust the remaining buffer length for the number of elements we just consumed.
+
+                    bufferLength -= (nuint)pBuffer;
+                    bufferLength += (nuint)pOriginalBuffer;
+                }
+            }
+            else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256<byte>.Count)
+            {
+                uint SizeOfVector256InBytes = (uint)Vector256<byte>.Count; // JIT will make this a const
+
+                if (Unsafe.ReadUnaligned<Vector256<byte>>(pBuffer).ExtractMostSignificantBits() == 0)
+                {
+                    // The first several elements of the input buffer were ASCII. Bump up the pointer to the
+                    // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
+                    // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
+
+                    byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InBytes;
+                    pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector256InBytes) & ~(nuint)(SizeOfVector256InBytes - 1));
+
+#if DEBUG
+                    long numBytesRead = pBuffer - pOriginalBuffer;
+                    Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector256InBytes, "We should've made forward progress of at least one byte.");
+                    Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+                    Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
+
+                    do
+                    {
+                        Debug.Assert((nuint)pBuffer % SizeOfVector256InBytes == 0, "Vector read should be aligned.");
+                        if (Unsafe.Read<Vector256<byte>>(pBuffer).ExtractMostSignificantBits() != 0)
+                        {
+                            break; // found non-ASCII data
+                        }
+
+                        pBuffer += SizeOfVector256InBytes;
+                    } while (pBuffer <= pFinalVectorReadPos);
+
+                    // Adjust the remaining buffer length for the number of elements we just consumed.
+
+                    bufferLength -= (nuint)pBuffer;
+                    bufferLength += (nuint)pOriginalBuffer;
+                }
+            }
+            else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128<byte>.Count)
+            {
+                uint SizeOfVector128InBytes = (uint)Vector128<byte>.Count; // JIT will make this a const
+
+                if (Unsafe.ReadUnaligned<Vector128<byte>>(pBuffer).ExtractMostSignificantBits() == 0)
+                {
+                    // The first several elements of the input buffer were ASCII. Bump up the pointer to the
+                    // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
+                    // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
+
+                    byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InBytes;
+                    pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
+
+#if DEBUG
+                    long numBytesRead = pBuffer - pOriginalBuffer;
+                    Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128InBytes, "We should've made forward progress of at least one byte.");
+                    Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+                    Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
+
+                    do
+                    {
+                        Debug.Assert((nuint)pBuffer % SizeOfVector128InBytes == 0, "Vector read should be aligned.");
+                        if (Unsafe.Read<Vector128<byte>>(pBuffer).ExtractMostSignificantBits() != 0)
+                        {
+                            break; // found non-ASCII data
+                        }
+
+                        pBuffer += SizeOfVector128InBytes;
+                    } while (pBuffer <= pFinalVectorReadPos);
+
+                    // Adjust the remaining buffer length for the number of elements we just consumed.
+
+                    bufferLength -= (nuint)pBuffer;
+                    bufferLength += (nuint)pOriginalBuffer;
+                }
+            }
+
+            // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
+            // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
+            // path to drain any remaining ASCII bytes.
+            //
+            // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
+            // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII bytes.
+
+            uint currentUInt32;
+
+            // Try consuming 64 bits per iteration, performed as two unaligned 32-bit reads.
+
+            for (; bufferLength >= 8; bufferLength -= 8)
+            {
+                currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
+                uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
+
+                if (!AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32))
+                {
+                    // One of these two values contains non-ASCII bytes.
+                    // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes.
+
+                    if (AllBytesInUInt32AreAscii(currentUInt32))
+                    {
+                        currentUInt32 = nextUInt32;
+                        pBuffer += 4;
+                    }
+
+                    goto FoundNonAsciiData;
+                }
+
+                pBuffer += 8; // consumed 8 ASCII bytes
+            }
+
+            // From this point forward we don't need to update bufferLength.
+            // Try reading 32 bits.
+
+            if ((bufferLength & 4) != 0)
+            {
+                currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
+                if (!AllBytesInUInt32AreAscii(currentUInt32))
+                {
+                    goto FoundNonAsciiData;
+                }
+
+                pBuffer += 4;
+            }
+
+            // Try reading 16 bits.
+
+            if ((bufferLength & 2) != 0)
+            {
+                currentUInt32 = Unsafe.ReadUnaligned<ushort>(pBuffer);
+                if (!AllBytesInUInt32AreAscii(currentUInt32))
+                {
+                    if (!BitConverter.IsLittleEndian)
+                    {
+                        currentUInt32 <<= 16;
+                    }
+                    goto FoundNonAsciiData;
+                }
+
+                pBuffer += 2;
+            }
+
+            // Try reading 8 bits
+
+            if ((bufferLength & 1) != 0)
+            {
+                // If the buffer contains non-ASCII data, the comparison below will fail, and
+                // we'll end up not incrementing the buffer reference.
+
+                if (*(sbyte*)pBuffer >= 0)
+                {
+                    pBuffer++;
+                }
+            }
+
+        Finish:
+
+            nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
+            return totalNumBytesRead;
+
+        FoundNonAsciiData:
+
+            Debug.Assert(!AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
+
+            // The method being called doesn't bother looking at whether the high byte is ASCII. There are only
+            // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before
+            // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be
+            // non-ASCII. In both cases we only care about the low 24 bits.
+
+            pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32);
+            goto Finish;
         }
 
         private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength)
@@ -637,9 +869,230 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu
             // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
             // this method is running.
 
-            return ((Sse2.IsSupported || AdvSimd.IsSupported) && BitConverter.IsLittleEndian)
-                ? GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength)
-                : GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
+            if (Vector512.IsHardwareAccelerated || Vector256.IsHardwareAccelerated)
+            {
+                return GetIndexOfFirstNonAsciiChar_Vector(pBuffer, bufferLength);
+            }
+            else if (Sse2.IsSupported || (AdvSimd.IsSupported && BitConverter.IsLittleEndian))
+            {
+                return GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength);
+            }
+            else
+            {
+                return GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
+            }
+        }
+
+        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nuint bufferLength /* in chars */)
+        {
+            // Squirrel away the original buffer reference. This method works by determining the exact
+            // char reference where non-ASCII data begins, so we need this base value to perform the
+            // final subtraction at the end of the method to get the index into the original buffer.
+            char* pOriginalBuffer = pBuffer;
+
+#if SYSTEM_PRIVATE_CORELIB
+            Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
+#endif
+
+            // Before we drain off char-by-char, try a generic vectorized loop.
+            // Only run the loop if we have at least two vectors we can pull out.
+            if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512<ushort>.Count)
+            {
+                uint SizeOfVector512InChars = (uint)Vector512<ushort>.Count; // JIT will make this a const
+                uint SizeOfVector512InBytes = (uint)Vector512<byte>.Count; // JIT will make this a const
+
+                // No mask is materialized here: VectorContainsNonAsciiChar performs the non-ASCII test itself.
+
+                if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned<Vector512<ushort>>(pBuffer)))
+                {
+                    // The first several elements of the input buffer were ASCII. Bump up the pointer to the
+                    // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
+                    // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
+
+                    char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InChars;
+                    pBuffer = (char*)(((nuint)pBuffer + SizeOfVector512InBytes) & ~(nuint)(SizeOfVector512InBytes - 1));
+
+#if DEBUG
+                    long numCharsRead = pBuffer - pOriginalBuffer;
+                    Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector512InChars, "We should've made forward progress of at least one char.");
+                    Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+                    Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
+
+                    do
+                    {
+                        Debug.Assert((nuint)pBuffer % SizeOfVector512InBytes == 0, "Vector read should be aligned.");
+                        if (VectorContainsNonAsciiChar(Unsafe.Read<Vector512<ushort>>(pBuffer)))
+                        {
+                            break; // found non-ASCII data
+                        }
+                        pBuffer += SizeOfVector512InChars;
+                    } while (pBuffer <= pFinalVectorReadPos);
+
+                    // Adjust the remaining buffer length for the number of elements we just consumed.
+
+                    bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
+                }
+            }
+            else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256<ushort>.Count)
+            {
+                uint SizeOfVector256InChars = (uint)Vector256<ushort>.Count; // JIT will make this a const
+                uint SizeOfVector256InBytes = (uint)Vector256<byte>.Count; // JIT will make this a const
+
+                // No mask is materialized here: VectorContainsNonAsciiChar performs the non-ASCII test itself.
+
+                if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned<Vector256<ushort>>(pBuffer)))
+                {
+                    // The first several elements of the input buffer were ASCII. Bump up the pointer to the
+                    // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
+                    // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
+
+                    char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InChars;
+                    pBuffer = (char*)(((nuint)pBuffer + SizeOfVector256InBytes) & ~(nuint)(SizeOfVector256InBytes - 1));
+
+#if DEBUG
+                    long numCharsRead = pBuffer - pOriginalBuffer;
+                    Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector256InChars, "We should've made forward progress of at least one char.");
+                    Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+                    Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
+
+                    do
+                    {
+                        Debug.Assert((nuint)pBuffer % SizeOfVector256InBytes == 0, "Vector read should be aligned.");
+                        if (VectorContainsNonAsciiChar(Unsafe.Read<Vector256<ushort>>(pBuffer)))
+                        {
+                            break; // found non-ASCII data
+                        }
+                        pBuffer += SizeOfVector256InChars;
+                    } while (pBuffer <= pFinalVectorReadPos);
+
+                    // Adjust the remaining buffer length for the number of elements we just consumed.
+
+                    bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
+                }
+            }
+            else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128<ushort>.Count)
+            {
+                uint SizeOfVector128InChars = (uint)Vector128<ushort>.Count; // JIT will make this a const
+                uint SizeOfVector128InBytes = (uint)Vector128<byte>.Count; // JIT will make this a const
+
+                // No mask is materialized here: VectorContainsNonAsciiChar performs the non-ASCII test itself.
+
+                if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned<Vector128<ushort>>(pBuffer)))
+                {
+                    // The first several elements of the input buffer were ASCII. Bump up the pointer to the
+                    // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
+                    // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
+                    char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InChars;
+                    pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
+
+#if DEBUG
+                    long numCharsRead = pBuffer - pOriginalBuffer;
+                    Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
+                    Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+                    Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
+
+                    do
+                    {
+                        Debug.Assert((nuint)pBuffer % SizeOfVector128InBytes == 0, "Vector read should be aligned.");
+                        if (VectorContainsNonAsciiChar(Unsafe.Read<Vector128<ushort>>(pBuffer)))
+                        {
+                            break; // found non-ASCII data
+                        }
+                        pBuffer += SizeOfVector128InChars;
+                    } while (pBuffer <= pFinalVectorReadPos);
+
+                    // Adjust the remaining buffer length for the number of elements we just consumed.
+
+                    bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
+                }
+            }
+
+
+            // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
+            // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
+            // path to drain any remaining ASCII chars.
+            //
+            // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
+            // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII chars.
+
+            uint currentUInt32;
+
+            // Try consuming 64 bits per iteration, performed as two unaligned 32-bit reads.
+
+            for (; bufferLength >= 4; bufferLength -= 4) // 64 bits = 4 * 16-bit chars
+            {
+                currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
+                uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
+
+                if (!AllCharsInUInt32AreAscii(currentUInt32 | nextUInt32))
+                {
+                    // One of these two values contains non-ASCII chars.
+                    // Figure out which one it is, then put it in 'current' so that we can drain the ASCII chars.
+
+                    if (AllCharsInUInt32AreAscii(currentUInt32))
+                    {
+                        currentUInt32 = nextUInt32;
+                        pBuffer += 2;
+                    }
+
+                    goto FoundNonAsciiData;
+                }
+
+                pBuffer += 4; // consumed 4 ASCII chars
+            }
+
+            // From this point forward we don't need to keep track of the remaining buffer length.
+            // Try reading 32 bits.
+
+            if ((bufferLength & 2) != 0) // 32 bits = 2 * 16-bit chars
+            {
+                currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
+                if (!AllCharsInUInt32AreAscii(currentUInt32))
+                {
+                    goto FoundNonAsciiData;
+                }
+
+                pBuffer += 2;
+            }
+
+            // Try reading 16 bits.
+            // No need to try an 8-bit read after this since we're working with chars.
+
+            if ((bufferLength & 1) != 0)
+            {
+                // If the buffer contains non-ASCII data, the comparison below will fail, and
+                // we'll end up not incrementing the buffer reference.
+
+                if (*pBuffer <= 0x007F)
+                {
+                    pBuffer++;
+                }
+            }
+
+        Finish:
+
+            nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
+            Debug.Assert(totalNumBytesRead % sizeof(char) == 0, "Total number of bytes read should be even since we're working with chars.");
+            return totalNumBytesRead / sizeof(char); // convert byte count -> char count before returning
+
+        FoundNonAsciiData:
+
+            Debug.Assert(!AllCharsInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
+
+            // We don't bother looking at the second char - only the first char.
+
+            if (FirstCharInUInt32IsAscii(currentUInt32))
+            {
+                pBuffer++;
+            }
+
+            goto Finish;
         }
 
         private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, nuint bufferLength /* in chars */)
diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs
index 80a70042abb8..2f9186c4e9ec 100644
--- a/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs
+++ b/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs
@@ -19,15 +19,15 @@ public static unsafe void EmptyInputs()
         [Fact]
         public static void AllAsciiInput()
         {
-            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(128);
-            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(128);
+            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(256);
+            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(256);
 
             // Fill source with 00 .. 7F.
 
             Span<char> utf16Span = utf16Mem.Span;
             for (int i = 0; i < utf16Span.Length; i++)
             {
-                utf16Span[i] = (char)i;
+                utf16Span[i] = (char)(i % 128);
             }
             utf16Mem.MakeReadonly();
 
@@ -42,11 +42,11 @@ public static void AllAsciiInput()
 
                 // First, validate that the workhorse saw the incoming data as all-ASCII.
                 Assert.Equal(OperationStatus.Done, Ascii.FromUtf16(utf16Span.Slice(i), asciiSpan.Slice(i), out int bytesWritten));
-                Assert.Equal(128 - i, bytesWritten);
+                Assert.Equal(256 - i, bytesWritten);
 
                 // Then, validate that the data was transcoded properly.
 
-                for (int j = i; j < 128; j++)
+                for (int j = i; j < 256; j++)
                 {
                     Assert.Equal((ushort)utf16Span[i], (ushort)asciiSpan[i]);
                 }
@@ -56,15 +56,15 @@ public static void AllAsciiInput()
         [Fact]
         public static void SomeNonAsciiInput()
         {
-            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(128);
-            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(128);
+            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(256);
+            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(256);
 
             // Fill source with 00 .. 7F.
 
             Span<char> utf16Span = utf16Mem.Span;
             for (int i = 0; i < utf16Span.Length; i++)
             {
-                utf16Span[i] = (char)i;
+                utf16Span[i] = (char)(i % 128);
             }
 
             // We'll write to the ASCII span.
diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs
index be9c71e14fbb..ec5c186973e3 100644
--- a/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs
+++ b/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs
@@ -20,15 +20,15 @@ public static void EmptyInputs()
         [Fact]
         public static void AllAsciiInput()
         {
-            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(128);
-            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(128);
+            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(256);
+            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(256);
 
             // Fill source with 00 .. 7F, then trap future writes.
 
             Span<byte> asciiSpan = asciiMem.Span;
             for (int i = 0; i < asciiSpan.Length; i++)
             {
-                asciiSpan[i] = (byte)i;
+                asciiSpan[i] = (byte)(i % 128);
             }
             asciiMem.MakeReadonly();
 
@@ -44,11 +44,11 @@ public static void AllAsciiInput()
                 // First, validate that the workhorse saw the incoming data as all-ASCII.
 
                 Assert.Equal(OperationStatus.Done, Ascii.ToUtf16(asciiSpan.Slice(i), utf16Span.Slice(i), out int charsWritten));
-                Assert.Equal(128 - i, charsWritten);
+                Assert.Equal(256 - i, charsWritten);
 
                 // Then, validate that the data was transcoded properly.
 
-                for (int j = i; j < 128; j++)
+                for (int j = i; j < 256; j++)
                 {
                     Assert.Equal((ushort)asciiSpan[i], (ushort)utf16Span[i]);
                 }
@@ -58,15 +58,15 @@ public static void AllAsciiInput()
         [Fact]
         public static void SomeNonAsciiInput()
         {
-            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(128);
-            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(128);
+            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(256);
+            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(256);
 
             // Fill source with 00 .. 7F, then trap future writes.
 
             Span<byte> asciiSpan = asciiMem.Span;
             for (int i = 0; i < asciiSpan.Length; i++)
             {
-                asciiSpan[i] = (byte)i;
+                asciiSpan[i] = (byte)(i % 128);
             }
 
             // We'll write to the UTF-16 span.

From 01a43e7e2b96d2ed6f9a78a9f8fa753516e010de Mon Sep 17 00:00:00 2001
From: "Canino, Anthony" <anthony.canino@intel.com>
Date: Fri, 7 Jul 2023 09:38:37 -0700
Subject: [PATCH 5/9] Adding optimization to Vector256
 VectorContainsNonAsciiChar method.

---
 .../src/System/Text/Ascii.Utility.cs             | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index 5a3be66531a9..5afd401d9c85 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -1907,10 +1907,18 @@ private static bool VectorContainsNonAsciiChar(Vector128<ushort> utf16Vector)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool VectorContainsNonAsciiChar(Vector256<ushort> utf16Vector)
         {
-            const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80
-            Vector256<ushort> zeroIsAscii = utf16Vector & Vector256.Create(asciiMask);
-            // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data.
-            return zeroIsAscii != Vector256<ushort>.Zero;
+            if (Avx.IsSupported)
+            {
+                Vector256<ushort> asciiMaskForTestZ = Vector256.Create((ushort)0xFF80);
+                return !Avx.TestZ(utf16Vector.AsInt16(), asciiMaskForTestZ.AsInt16()); // TestZ is true only when (vector & mask) == 0, i.e. every char is ASCII
+            }
+            else
+            {
+                const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80
+                Vector256<ushort> zeroIsAscii = utf16Vector & Vector256.Create(asciiMask);
+                // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data.
+                return zeroIsAscii != Vector256<ushort>.Zero;
+            }
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]

From ea913dbfcb2c626eabc104b55263842cc4850415 Mon Sep 17 00:00:00 2001
From: "Canino, Anthony" <anthony.canino@intel.com>
Date: Wed, 12 Jul 2023 06:52:18 -0700
Subject: [PATCH 6/9] Code path refactoring and cleanup.

---
 .../src/System/Text/Ascii.Utility.cs          | 72 ++++++++++++++-----
 1 file changed, 54 insertions(+), 18 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index 5afd401d9c85..e82ccb5755b3 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -130,9 +130,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
 
             if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512<byte>.Count)
             {
-                uint SizeOfVector512InBytes = (uint)Vector512<byte>.Count; // JIT will make this a const
+                uint SizeOfVector512InBytes = (uint)Vector512.Size; // JIT will make this a const
 
-                if (Unsafe.ReadUnaligned<Vector512<byte>>(pBuffer).ExtractMostSignificantBits() == 0)
+                if (!VectorContainsNonAsciiChar(Vector512.Load(pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
@@ -152,7 +152,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % SizeOfVector512InBytes == 0, "Vector read should be aligned.");
-                        if (Unsafe.Read<Vector512<byte>>(pBuffer).ExtractMostSignificantBits() != 0)
+                        if (VectorContainsNonAsciiChar(Vector512.LoadAligned(pBuffer)))
                         {
                             break; // found non-ASCII data
                         }
@@ -168,9 +168,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
             }
             else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256<byte>.Count)
             {
-                uint SizeOfVector256InBytes = (uint)Vector256<byte>.Count; // JIT will make this a const
+                uint SizeOfVector256InBytes = (uint)Vector256.Size; // JIT will make this a const
 
-                if (Unsafe.ReadUnaligned<Vector256<byte>>(pBuffer).ExtractMostSignificantBits() == 0)
+                if (!VectorContainsNonAsciiChar(Vector256.Load(pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
@@ -190,7 +190,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % SizeOfVector256InBytes == 0, "Vector read should be aligned.");
-                        if (Unsafe.Read<Vector256<byte>>(pBuffer).ExtractMostSignificantBits() != 0)
+                        if (VectorContainsNonAsciiChar(Vector256.LoadAligned(pBuffer)))
                         {
                             break; // found non-ASCII data
                         }
@@ -206,9 +206,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
             }
             else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128<byte>.Count)
             {
-                uint SizeOfVector128InBytes = (uint)Vector128<byte>.Count; // JIT will make this a const
+                uint SizeOfVector128InBytes = (uint)Vector128.Size; // JIT will make this a const
 
-                if (Unsafe.ReadUnaligned<Vector128<byte>>(pBuffer).ExtractMostSignificantBits() == 0)
+                if (!VectorContainsNonAsciiChar(Vector128.Load(pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
@@ -228,7 +228,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % SizeOfVector128InBytes == 0, "Vector read should be aligned.");
-                        if (Unsafe.Read<Vector128<byte>>(pBuffer).ExtractMostSignificantBits() != 0)
+                        if (VectorContainsNonAsciiChar(Vector128.LoadAligned(pBuffer)))
                         {
                             break; // found non-ASCII data
                         }
@@ -899,11 +899,11 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512<ushort>.Count)
             {
                 uint SizeOfVector512InChars = (uint)Vector512<ushort>.Count; // JIT will make this a const
-                uint SizeOfVector512InBytes = (uint)Vector512<byte>.Count; // JIT will make this a const
+                uint SizeOfVector512InBytes = (uint)Vector512.Size; // JIT will make this a const
 
                 Vector512<ushort> asciiMask = Vector512.Create((ushort) 0xFF80);
 
-                if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned<Vector512<ushort>>(pBuffer)))
+                if (!VectorContainsNonAsciiChar(Vector512.Load((ushort*)pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
@@ -923,7 +923,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % SizeOfVector512InChars == 0, "Vector read should be aligned.");
-                        if (VectorContainsNonAsciiChar(Unsafe.Read<Vector512<ushort>>(pBuffer)))
+                        if (VectorContainsNonAsciiChar(Vector512.LoadAligned((ushort*)pBuffer)))
                         {
                             break; // found non-ASCII data
                         }
@@ -938,11 +938,11 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256<ushort>.Count)
             {
                 uint SizeOfVector256InChars = (uint)Vector256<ushort>.Count; // JIT will make this a const
-                uint SizeOfVector256InBytes = (uint)Vector256<byte>.Count; // JIT will make this a const
+                uint SizeOfVector256InBytes = (uint)Vector256.Size; // JIT will make this a const
 
                 Vector256<ushort> asciiMask = Vector256.Create((ushort) 0xFF80);
 
-                if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned<Vector256<ushort>>(pBuffer)))
+                if (!VectorContainsNonAsciiChar(Vector256.Load((ushort*)pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
@@ -962,7 +962,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % SizeOfVector256InChars == 0, "Vector read should be aligned.");
-                        if (VectorContainsNonAsciiChar(Unsafe.Read<Vector256<ushort>>(pBuffer)))
+                        if (VectorContainsNonAsciiChar(Vector256.LoadAligned((ushort*)pBuffer)))
                         {
                             break; // found non-ASCII data
                         }
@@ -977,11 +977,11 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128<ushort>.Count)
             {
                 uint SizeOfVector128InChars = (uint)Vector128<ushort>.Count; // JIT will make this a const
-                uint SizeOfVector128InBytes = (uint)Vector128<byte>.Count; // JIT will make this a const
+                uint SizeOfVector128InBytes = (uint)Vector128.Size; // JIT will make this a const
 
                 Vector128<ushort> asciiMask = Vector128.Create((ushort) 0xFF80);
 
-                if (!VectorContainsNonAsciiChar(Unsafe.ReadUnaligned<Vector128<ushort>>(pBuffer)))
+                if (!VectorContainsNonAsciiChar(Vector128.Load((ushort*)pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
@@ -1000,7 +1000,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % SizeOfVector128InChars == 0, "Vector read should be aligned.");
-                        if (VectorContainsNonAsciiChar(Unsafe.Read<Vector128<ushort>>(pBuffer)))
+                        if (VectorContainsNonAsciiChar(Vector128.LoadAligned((ushort*)pBuffer)))
                         {
                             break; // found non-ASCII data
                         }
@@ -1867,6 +1867,20 @@ private static bool VectorContainsNonAsciiChar(Vector128<byte> asciiVector)
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool VectorContainsNonAsciiChar(Vector256<byte> asciiVector)
+        {
+            // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii
+            return asciiVector.ExtractMostSignificantBits() != 0;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool VectorContainsNonAsciiChar(Vector512<byte> asciiVector)
+        {
+            // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii
+            return asciiVector.ExtractMostSignificantBits() != 0;
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool VectorContainsNonAsciiChar(Vector128<ushort> utf16Vector)
         {
@@ -1941,6 +1955,28 @@ private static bool VectorContainsNonAsciiChar<T>(Vector128<T> vector)
                 : VectorContainsNonAsciiChar(vector.AsUInt16());
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool VectorContainsNonAsciiChar<T>(Vector256<T> vector)
+            where T : unmanaged
+        {
+            Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(ushort));
+
+            return typeof(T) == typeof(byte)
+                ? VectorContainsNonAsciiChar(vector.AsByte())
+                : VectorContainsNonAsciiChar(vector.AsUInt16());
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool VectorContainsNonAsciiChar<T>(Vector512<T> vector)
+            where T : unmanaged
+        {
+            Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(ushort));
+
+            return typeof(T) == typeof(byte)
+                ? VectorContainsNonAsciiChar(vector.AsByte())
+                : VectorContainsNonAsciiChar(vector.AsUInt16());
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool AllCharsInVectorAreAscii<T>(Vector128<T> vector)
             where T : unmanaged

From 5cb1efc99d8d533966aaf55b4a6346fecd03731d Mon Sep 17 00:00:00 2001
From: Ruihan-Yin <ruihan.yin@intel.com>
Date: Wed, 12 Jul 2023 15:25:49 -0700
Subject: [PATCH 7/9] Code changes based on the review: 1. turned some
 variables into explicitly specified consts. 2. removed some helper functions
 and inlined them.

---
 .../System/Runtime/Intrinsics/Vector256.cs    |  18 --
 .../System/Runtime/Intrinsics/Vector512.cs    |  17 --
 .../src/System/Text/Ascii.Utility.cs          | 207 ++++++------------
 3 files changed, 71 insertions(+), 171 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
index e36c41f50338..758d0291af49 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
@@ -2634,24 +2634,6 @@ public static void StoreUnsafe<T>(this Vector256<T> source, ref T destination, n
             Unsafe.WriteUnaligned(ref Unsafe.As<T, byte>(ref destination), source);
         }
 
-        /// <summary>
-        /// Stores to lower 128 bits of <paramref name="source"/> to memory destination of <paramref name="destination"/>[<paramref name="elementOffset"/>]
-        /// </summary>
-        /// <typeparam name="T">The type of the elements in the vector.</typeparam>
-        /// <param name="source">The vector that will be stored.</param>
-        /// <param name="destination">The destination to which <paramref name="elementOffset" /> will be added before the vector will be stored.</param>
-        /// <param name="elementOffset">The element offset from <paramref name="destination" /> from which the vector will be stored.</param>
-        /// <remarks>
-        /// Uses double instead of long to get a single instruction instead of storing temps on general porpose register (or stack)
-        /// </remarks>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static void StoreLowerUnsafe<T>(this Vector256<T> source, ref T destination, nuint elementOffset = 0)
-        {
-            ThrowHelper.ThrowForUnsupportedIntrinsicsVector256BaseType<T>();
-            ref byte address = ref Unsafe.As<T, byte>(ref Unsafe.Add(ref destination, elementOffset));
-            Unsafe.WriteUnaligned(ref address, source._lower);
-        }
-
         /// <summary>Subtracts two vectors to compute their difference.</summary>
         /// <param name="left">The vector from which <paramref name="right" /> will be subtracted.</param>
         /// <param name="right">The vector to subtract from <paramref name="left" />.</param>
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
index fad4f94bd660..5b740db43009 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
@@ -2670,23 +2670,6 @@ public static void StoreUnsafe<T>(this Vector512<T> source, ref T destination)
             ref byte address = ref Unsafe.As<T, byte>(ref destination);
             Unsafe.WriteUnaligned(ref address, source);
         }
-        /// <summary>
-        /// Stores to lower 256 bits of <paramref name="source"/> to memory destination of <paramref name="destination"/>[<paramref name="elementOffset"/>]
-        /// </summary>
-        /// <typeparam name="T">The type of the elements in the vector.</typeparam>
-        /// <param name="source">The vector that will be stored.</param>
-        /// <param name="destination">The destination to which <paramref name="elementOffset" /> will be added before the vector will be stored.</param>
-        /// <param name="elementOffset">The element offset from <paramref name="destination" /> from which the vector will be stored.</param>
-        /// <remarks>
-        /// Uses double instead of long to get a single instruction instead of storing temps on general porpose register (or stack)
-        /// </remarks>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static void StoreLowerUnsafe<T>(this Vector512<T> source, ref T destination, nuint elementOffset = 0)
-        {
-            ThrowHelper.ThrowForUnsupportedIntrinsicsVector512BaseType<T>();
-            ref byte address = ref Unsafe.As<T, byte>(ref Unsafe.Add(ref destination, elementOffset));
-            Unsafe.WriteUnaligned(ref address, source._lower);
-        }
 
         /// <summary>Stores a vector at the given destination.</summary>
         /// <typeparam name="T">The type of the elements in the vector.</typeparam>
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index e82ccb5755b3..7d74bfb6d8fa 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -130,20 +130,19 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
 
             if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512<byte>.Count)
             {
-                uint SizeOfVector512InBytes = (uint)Vector512.Size; // JIT will make this a const
 
-                if (!VectorContainsNonAsciiChar(Vector512.Load(pBuffer)))
+                if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0)
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
                     // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
 
-                    byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InBytes;
-                    pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector512InBytes) & ~(nuint)(SizeOfVector512InBytes - 1));
+                    byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector512.Size;
+                    pBuffer = (byte*)(((nuint)pBuffer + Vector512.Size) & ~(nuint)(Vector512.Size - 1));
 
 #if DEBUG
                     long numBytesRead = pBuffer - pOriginalBuffer;
-                    Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector512InBytes, "We should've made forward progress of at least one byte.");
+                    Debug.Assert(0 < numBytesRead && numBytesRead <= Vector512.Size, "We should've made forward progress of at least one byte.");
                     Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
 #endif
 
@@ -151,13 +150,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
 
                     do
                     {
-                        Debug.Assert((nuint)pBuffer % SizeOfVector512InBytes == 0, "Vector read should be aligned.");
-                        if (VectorContainsNonAsciiChar(Vector512.LoadAligned(pBuffer)))
+                        Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned.");
+                        if (Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0)
                         {
                             break; // found non-ASCII data
                         }
 
-                        pBuffer += SizeOfVector512InBytes;
+                        pBuffer += Vector512.Size;
                     } while (pBuffer <= pFinalVectorReadPos);
 
                     // Adjust the remaining buffer length for the number of elements we just consumed.
@@ -168,20 +167,19 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
             }
             else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256<byte>.Count)
             {
-                uint SizeOfVector256InBytes = (uint)Vector256.Size; // JIT will make this a const
 
-                if (!VectorContainsNonAsciiChar(Vector256.Load(pBuffer)))
+                if (Vector256.Load(pBuffer).ExtractMostSignificantBits() == 0)
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
                     // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
 
-                    byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InBytes;
-                    pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector256InBytes) & ~(nuint)(SizeOfVector256InBytes - 1));
+                    byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector256.Size;
+                    pBuffer = (byte*)(((nuint)pBuffer + Vector256.Size) & ~(nuint)(Vector256.Size - 1));
 
 #if DEBUG
                     long numBytesRead = pBuffer - pOriginalBuffer;
-                    Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector256InBytes, "We should've made forward progress of at least one byte.");
+                    Debug.Assert(0 < numBytesRead && numBytesRead <= Vector256.Size, "We should've made forward progress of at least one byte.");
                     Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
 #endif
 
@@ -189,13 +187,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
 
                     do
                     {
-                        Debug.Assert((nuint)pBuffer % SizeOfVector256InBytes == 0, "Vector read should be aligned.");
-                        if (VectorContainsNonAsciiChar(Vector256.LoadAligned(pBuffer)))
+                        Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned.");
+                        if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0)
                         {
                             break; // found non-ASCII data
                         }
 
-                        pBuffer += SizeOfVector256InBytes;
+                        pBuffer += Vector256.Size;
                     } while (pBuffer <= pFinalVectorReadPos);
 
                     // Adjust the remaining buffer length for the number of elements we just consumed.
@@ -206,7 +204,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
             }
             else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128<byte>.Count)
             {
-                uint SizeOfVector128InBytes = (uint)Vector128.Size; // JIT will make this a const
 
                 if (!VectorContainsNonAsciiChar(Vector128.Load(pBuffer)))
                 {
@@ -214,12 +211,12 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
                     // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
 
-                    byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InBytes;
-                    pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
+                    byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector128.Size;
+                    pBuffer = (byte*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1));
 
 #if DEBUG
                     long numBytesRead = pBuffer - pOriginalBuffer;
-                    Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128InBytes, "We should've made forward progress of at least one byte.");
+                    Debug.Assert(0 < numBytesRead && numBytesRead <= Vector128.Size, "We should've made forward progress of at least one byte.");
                     Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
 #endif
 
@@ -227,13 +224,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
 
                     do
                     {
-                        Debug.Assert((nuint)pBuffer % SizeOfVector128InBytes == 0, "Vector read should be aligned.");
+                        Debug.Assert((nuint)pBuffer % Vector128.Size == 0, "Vector read should be aligned.");
                         if (VectorContainsNonAsciiChar(Vector128.LoadAligned(pBuffer)))
                         {
                             break; // found non-ASCII data
                         }
 
-                        pBuffer += SizeOfVector128InBytes;
+                        pBuffer += Vector128.Size;
                     } while (pBuffer <= pFinalVectorReadPos);
 
                     // Adjust the remaining buffer length for the number of elements we just consumed.
@@ -898,8 +895,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             // Only run the loop if we have at least two vectors we can pull out.
             if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512<ushort>.Count)
             {
-                uint SizeOfVector512InChars = (uint)Vector512<ushort>.Count; // JIT will make this a const
-                uint SizeOfVector512InBytes = (uint)Vector512.Size; // JIT will make this a const
+                const uint SizeOfVector512InChars = Vector512.Size / sizeof(ushort);
 
                 Vector512<ushort> asciiMask = Vector512.Create((ushort) 0xFF80);
 
@@ -910,7 +906,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
                     // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
 
                     char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InChars;
-                    pBuffer = (char*)(((nuint)pBuffer + SizeOfVector512InBytes) & ~(nuint)(SizeOfVector512InBytes - 1));
+                    pBuffer = (char*)(((nuint)pBuffer + Vector512.Size) & ~(nuint)(Vector512.Size - 1));
 
 #if DEBUG
                     long numCharsRead = pBuffer - pOriginalBuffer;
@@ -937,8 +933,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             }
             else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256<ushort>.Count)
             {
-                uint SizeOfVector256InChars = (uint)Vector256<ushort>.Count; // JIT will make this a const
-                uint SizeOfVector256InBytes = (uint)Vector256.Size; // JIT will make this a const
+                const uint SizeOfVector256InChars = Vector256.Size / sizeof(ushort);
 
                 Vector256<ushort> asciiMask = Vector256.Create((ushort) 0xFF80);
 
@@ -949,7 +944,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
                     // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
 
                     char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InChars;
-                    pBuffer = (char*)(((nuint)pBuffer + SizeOfVector256InBytes) & ~(nuint)(SizeOfVector256InBytes - 1));
+                    pBuffer = (char*)(((nuint)pBuffer + Vector256.Size) & ~(nuint)(Vector256.Size - 1));
 
 #if DEBUG
                     long numCharsRead = pBuffer - pOriginalBuffer;
@@ -976,8 +971,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             }
             else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128<ushort>.Count)
             {
-                uint SizeOfVector128InChars = (uint)Vector128<ushort>.Count; // JIT will make this a const
-                uint SizeOfVector128InBytes = (uint)Vector128.Size; // JIT will make this a const
+                const uint SizeOfVector128InChars = Vector128.Size / sizeof(ushort); // JIT will make this a const
 
                 Vector128<ushort> asciiMask = Vector128.Create((ushort) 0xFF80);
 
@@ -987,7 +981,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
                     // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
                     // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
                     char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InChars;
-                    pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
+                    pBuffer = (char*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1));
 
 #if DEBUG
                     long numCharsRead = pBuffer - pOriginalBuffer;
@@ -1245,8 +1239,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff
 
             // JIT turns the below into constants
 
-            uint SizeOfVector128InBytes = (uint)sizeof(Vector128<byte>);
-            uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
+            uint SizeOfVector128InChars = Vector128.Size / sizeof(char);
 
             Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Should've been checked by caller.");
             Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 assumes little-endian.");
@@ -1284,14 +1277,14 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff
 
             bufferLength <<= 1; // chars to bytes
 
-            if (bufferLength < 2 * SizeOfVector128InBytes)
+            if (bufferLength < 2 * Vector128.Size)
             {
                 goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
             }
 
             // Now adjust the read pointer so that future reads are aligned.
 
-            pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
+            pBuffer = (char*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1));
 
 #if DEBUG
             long numCharsRead = pBuffer - pOriginalBuffer;
@@ -1306,9 +1299,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff
 
             // The buffer is now properly aligned.
             // Read 2 vectors at a time if possible.
-            if (bufferLength >= 2 * SizeOfVector128InBytes)
+            if (bufferLength >= 2 * Vector128.Size)
             {
-                char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
+                char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * Vector128.Size);
 
                 // After this point, we no longer need to update the bufferLength value.
                 do
@@ -1337,7 +1330,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff
             // If there is fewer than one vector length remaining, skip the next aligned read.
             // Remember, at this point bufferLength is measured in bytes, not chars.
 
-            if ((bufferLength & SizeOfVector128InBytes) == 0)
+            if ((bufferLength & Vector128.Size) == 0)
             {
                 goto DoFinalUnalignedVectorRead;
             }
@@ -1357,12 +1350,12 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff
 
         DoFinalUnalignedVectorRead:
 
-            if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
+            if (((byte)bufferLength & (Vector128.Size - 1)) != 0)
             {
                 // Perform an unaligned read of the last vector.
                 // We need to adjust the pointer because we're re-reading data.
 
-                pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
+                pBuffer = (char*)((byte*)pBuffer + (bufferLength & (Vector128.Size - 1)) - Vector128.Size);
                 firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
                 if (VectorContainsNonAsciiChar(firstVector))
                 {
@@ -1867,20 +1860,6 @@ private static bool VectorContainsNonAsciiChar(Vector128<byte> asciiVector)
             }
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool VectorContainsNonAsciiChar(Vector256<byte> asciiVector)
-        {
-            // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii
-            return asciiVector.ExtractMostSignificantBits() != 0;
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool VectorContainsNonAsciiChar(Vector512<byte> asciiVector)
-        {
-            // max ASCII character is 0b_0111_1111, so the most significant bit (0x80) tells whether it contains non ascii
-            return asciiVector.ExtractMostSignificantBits() != 0;
-        }
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool VectorContainsNonAsciiChar(Vector128<ushort> utf16Vector)
         {
@@ -1955,28 +1934,6 @@ private static bool VectorContainsNonAsciiChar<T>(Vector128<T> vector)
                 : VectorContainsNonAsciiChar(vector.AsUInt16());
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool VectorContainsNonAsciiChar<T>(Vector256<T> vector)
-            where T : unmanaged
-        {
-            Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(ushort));
-
-            return typeof(T) == typeof(byte)
-                ? VectorContainsNonAsciiChar(vector.AsByte())
-                : VectorContainsNonAsciiChar(vector.AsUInt16());
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool VectorContainsNonAsciiChar<T>(Vector512<T> vector)
-            where T : unmanaged
-        {
-            Debug.Assert(typeof(T) == typeof(byte) || typeof(T) == typeof(ushort));
-
-            return typeof(T) == typeof(byte)
-                ? VectorContainsNonAsciiChar(vector.AsByte())
-                : VectorContainsNonAsciiChar(vector.AsUInt16());
-        }
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool AllCharsInVectorAreAscii<T>(Vector128<T> vector)
             where T : unmanaged
@@ -2042,26 +1999,6 @@ private static Vector128<byte> ExtractAsciiVector(Vector128<ushort> vectorFirst,
             }
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector256<byte> ExtractAsciiVector(Vector256<ushort> vectorFirst, Vector256<ushort> vectorSecond)
-        {
-            // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ]
-            // to a vector of bytes [ b7 ... b0 b7' ... b0'].
-
-            // prefer architecture specific intrinsic as they don't perform additional AND like Vector512.Narrow does
-            return Vector256.Narrow(vectorFirst, vectorSecond);
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector512<byte> ExtractAsciiVector(Vector512<ushort> vectorFirst, Vector512<ushort> vectorSecond)
-        {
-            // Narrows two vectors of words [ w7 w6 w5 w4 w3 w2 w1 w0 ] and [ w7' w6' w5' w4' w3' w2' w1' w0' ]
-            // to a vector of bytes [ b7 ... b0 b7' ... b0'].
-
-            // prefer architecture specific intrinsic as they don't perform additional AND like Vector512.Narrow does
-            return Vector512.Narrow(vectorFirst, vectorSecond);
-        }
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
         {
@@ -2189,8 +2126,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
 
             // JIT turns the below into constants
 
-            uint SizeOfVector256 = (uint)Vector256<byte>.Count;
-            nuint MaskOfAllBitsInVector256 = (nuint)(SizeOfVector256 - 1);
+            const nuint MaskOfAllBitsInVector256 = (nuint)(Vector256.Size - 1);
 
             // This method is written such that control generally flows top-to-bottom, avoiding
             // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
@@ -2198,7 +2134,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
 
             Debug.Assert(Vector256.IsHardwareAccelerated, "Vector256 is required.");
             Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian.");
-            Debug.Assert(elementCount >= 2 * SizeOfVector256);
+            Debug.Assert(elementCount >= 2 * Vector256.Size);
 
             // First, perform an unaligned read of the first part of the input buffer.
             ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer;
@@ -2213,9 +2149,9 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
             // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination.
 
             ref byte asciiBuffer = ref *pAsciiBuffer;
-            Vector256<byte> asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-            asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0);
-            nuint currentOffsetInElements = SizeOfVector256 / 2; // we processed 16 elements so far
+            Vector256<byte> asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst);
+            asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, 0);
+            nuint currentOffsetInElements = Vector256.Size / 2; // we processed 16 elements so far
 
             // We're going to get the best performance when we have aligned writes, so we'll take the
             // hit of potentially unaligned reads in order to hit this sweet spot.
@@ -2227,7 +2163,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
             // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
             // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
             // just past the next aligned boundary address.
-            if (((uint)pAsciiBuffer & (SizeOfVector256 / 2)) == 0)
+            if (((uint)pAsciiBuffer & (Vector256.Size / 2)) == 0)
             {
                 // We need to perform one more partial vector write before we can get the alignment we want.
 
@@ -2239,26 +2175,26 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
                 }
 
                 // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination.
-                asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-                asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
+                asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst);
+                asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
             }
 
             // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
             // point, then use that as the base offset going forward.
 
-            currentOffsetInElements = SizeOfVector256 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector256);
+            currentOffsetInElements = Vector256.Size - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector256);
 
-            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector256, "We wrote at least 1 byte but no more than a whole vector.");
+            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= Vector256.Size, "We wrote at least 1 byte but no more than a whole vector.");
             Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
-            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector256, "We should be able to run at least one whole vector.");
+            Debug.Assert(elementCount - currentOffsetInElements >= Vector256.Size, "We should be able to run at least one whole vector.");
 
-            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector256;
+            nuint finalOffsetWhereCanRunLoop = elementCount - Vector256.Size;
             do
             {
                 // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
 
                 utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements);
-                Vector256<ushort> utf16VectorSecond = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + SizeOfVector256 / sizeof(short));
+                Vector256<ushort> utf16VectorSecond = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + Vector256.Size / sizeof(short));
                 Vector256<ushort> combinedVector = utf16VectorFirst | utf16VectorSecond;
 
                 if (VectorContainsNonAsciiChar(combinedVector))
@@ -2268,11 +2204,11 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
 
                 // Build up the ASCII vector and perform the store.
 
-                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector256 == 0, "Write should be aligned.");
-                asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorSecond);
+                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector256.Size == 0, "Write should be aligned.");
+                asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorSecond);
                 asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
 
-                currentOffsetInElements += SizeOfVector256;
+                currentOffsetInElements += Vector256.Size;
             } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
 
         Finish:
@@ -2292,9 +2228,9 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
             // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
 
             Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
-            asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-            asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
-            currentOffsetInElements += SizeOfVector256 / 2;
+            asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst);
+            asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
+            currentOffsetInElements += Vector256.Size / 2;
 
             goto Finish;
         }
@@ -2307,8 +2243,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
 
             // JIT turns the below into constants
 
-            uint SizeOfVector512 = (uint)Vector512<byte>.Count;
-            nuint MaskOfAllBitsInVector512 = (nuint)(SizeOfVector512 - 1);
+            const nuint MaskOfAllBitsInVector512 = (nuint)(Vector512.Size - 1);
 
             // This method is written such that control generally flows top-to-bottom, avoiding
             // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
@@ -2316,7 +2251,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
 
             Debug.Assert(Vector512.IsHardwareAccelerated, "Vector512 is required.");
             Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian.");
-            Debug.Assert(elementCount >= 2 * SizeOfVector512);
+            Debug.Assert(elementCount >= 2 * Vector512.Size);
 
             // First, perform an unaligned read of the first part of the input buffer.
             ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer;
@@ -2331,9 +2266,9 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
             // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination.
 
             ref byte asciiBuffer = ref *pAsciiBuffer;
-            Vector512<byte> asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-            asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0); // how to store the lower part of a avx512
-            nuint currentOffsetInElements = SizeOfVector512 / 2; // we processed 32 elements so far
+            Vector512<byte> asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst);
+            asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, 0); // store only the lower 256 bits (32 bytes) of the narrowed vector
+            nuint currentOffsetInElements = Vector512.Size / 2; // we processed 32 elements so far
 
             // We're going to get the best performance when we have aligned writes, so we'll take the
             // hit of potentially unaligned reads in order to hit this sweet spot.
@@ -2346,7 +2281,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
             // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
             // just past the next aligned boundary address.
 
-            if (((uint)pAsciiBuffer & (SizeOfVector512 / 2)) == 0)
+            if (((uint)pAsciiBuffer & (Vector512.Size / 2)) == 0)
             {
                 // We need to perform one more partial vector write before we can get the alignment we want.
 
@@ -2358,26 +2293,26 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
                 }
 
                 // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination.
-                asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-                asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
+                asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst);
+                asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
             }
 
             // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
             // point, then use that as the base offset going forward.
 
-            currentOffsetInElements = SizeOfVector512 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector512);
+            currentOffsetInElements = Vector512.Size - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector512);
 
-            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector512, "We wrote at least 1 byte but no more than a whole vector.");
+            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= Vector512.Size, "We wrote at least 1 byte but no more than a whole vector.");
             Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
-            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector512, "We should be able to run at least one whole vector.");
+            Debug.Assert(elementCount - currentOffsetInElements >= Vector512.Size, "We should be able to run at least one whole vector.");
 
-            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector512;
+            nuint finalOffsetWhereCanRunLoop = elementCount - Vector512.Size;
             do
             {
                 // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
 
                 utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements);
-                Vector512<ushort> utf16VectorSecond = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + SizeOfVector512 / sizeof(short));
+                Vector512<ushort> utf16VectorSecond = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + Vector512.Size / sizeof(short));
                 Vector512<ushort> combinedVector = utf16VectorFirst | utf16VectorSecond;
 
                 if (VectorContainsNonAsciiChar(combinedVector))
@@ -2387,11 +2322,11 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
 
                 // Build up the ASCII vector and perform the store.
 
-                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector512 == 0, "Write should be aligned.");
-                asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorSecond);
+                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector512.Size == 0, "Write should be aligned.");
+                asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorSecond);
                 asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
 
-                currentOffsetInElements += SizeOfVector512;
+                currentOffsetInElements += Vector512.Size;
             } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
 
         Finish:
@@ -2411,9 +2346,9 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
             // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
 
             Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
-            asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst);
-            asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements);
-            currentOffsetInElements += SizeOfVector512 / 2;
+            asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst);
+            asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
+            currentOffsetInElements += Vector512.Size / 2;
 
             goto Finish;
         }

From 5d06c67e42331736f114eb879aed3dfb7f54bebb Mon Sep 17 00:00:00 2001
From: Ruihan-Yin <ruihan.yin@intel.com>
Date: Thu, 13 Jul 2023 20:13:51 -0700
Subject: [PATCH 8/9] Resolve comments

---
 .../src/System/Text/Ascii.Utility.cs          | 52 +++++++++----------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index 7d74bfb6d8fa..9aa79da62211 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -151,9 +151,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned.");
-                        if (Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0)
+                        ulong msbMask = Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits();
+                        if (msbMask != 0)
                         {
-                            break; // found non-ASCII data
+                            return (nuint)pBuffer - (nuint)pOriginalBuffer + (nuint)BitOperations.TrailingZeroCount(msbMask); // found non-ASCII data
                         }
 
                         pBuffer += Vector512.Size;
@@ -188,9 +189,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned.");
-                        if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0)
+                        uint msbMask = Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits();
+                        if (msbMask != 0)
                         {
-                            break; // found non-ASCII data
+                            return (nuint)pBuffer - (nuint)pOriginalBuffer + (nuint)BitOperations.TrailingZeroCount(msbMask); // found non-ASCII data
                         }
 
                         pBuffer += Vector256.Size;
@@ -897,8 +899,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             {
                 const uint SizeOfVector512InChars = Vector512.Size / sizeof(ushort);
 
-                Vector512<ushort> asciiMask = Vector512.Create((ushort) 0xFF80);
-
                 if (!VectorContainsNonAsciiChar(Vector512.Load((ushort*)pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
@@ -918,7 +918,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
 
                     do
                     {
-                        Debug.Assert((nuint)pBuffer % SizeOfVector512InChars == 0, "Vector read should be aligned.");
+                        Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned.");
                         if (VectorContainsNonAsciiChar(Vector512.LoadAligned((ushort*)pBuffer)))
                         {
                             break; // found non-ASCII data
@@ -935,8 +935,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             {
                 const uint SizeOfVector256InChars = Vector256.Size / sizeof(ushort);
 
-                Vector256<ushort> asciiMask = Vector256.Create((ushort) 0xFF80);
-
                 if (!VectorContainsNonAsciiChar(Vector256.Load((ushort*)pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
@@ -956,7 +954,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
 
                     do
                     {
-                        Debug.Assert((nuint)pBuffer % SizeOfVector256InChars == 0, "Vector read should be aligned.");
+                        Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned.");
                         if (VectorContainsNonAsciiChar(Vector256.LoadAligned((ushort*)pBuffer)))
                         {
                             break; // found non-ASCII data
@@ -973,8 +971,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
             {
                 const uint SizeOfVector128InChars = Vector128.Size / sizeof(ushort); // JIT will make this a const
 
-                Vector128<ushort> asciiMask = Vector128.Create((ushort) 0xFF80);
-
                 if (!VectorContainsNonAsciiChar(Vector128.Load((ushort*)pBuffer)))
                 {
                     // The first several elements of the input buffer were ASCII. Bump up the pointer to the
@@ -993,7 +989,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
 
                     do
                     {
-                        Debug.Assert((nuint)pBuffer % SizeOfVector128InChars == 0, "Vector read should be aligned.");
+                        Debug.Assert((nuint)pBuffer % Vector128.Size == 0, "Vector read should be aligned.");
                         if (VectorContainsNonAsciiChar(Vector128.LoadAligned((ushort*)pBuffer)))
                         {
                             break; // found non-ASCII data
@@ -1130,7 +1126,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, n
 
                     do
                     {
-                        Debug.Assert((nuint)pBuffer % SizeOfVectorInChars == 0, "Vector read should be aligned.");
+                        Debug.Assert((nuint)pBuffer % SizeOfVectorInBytes == 0, "Vector read should be aligned.");
                         if (Vector.GreaterThanAny(Unsafe.Read<Vector<ushort>>(pBuffer), maxAscii))
                         {
                             break; // found non-ASCII data
@@ -2157,11 +2153,11 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
             // hit of potentially unaligned reads in order to hit this sweet spot.
 
             // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
-            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
-            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
+            // the 16 bytes previously. If the 0x10 bit is set at the pinned address, then the 16 bytes we wrote
+            // previously mean that the 0x10 bit is *not* set at address &pAsciiBuffer[SizeOfVector256 / 2]. In
             // that case we can immediately back up to the previous aligned boundary and start the main loop.
-            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
-            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
+            // If the 0x10 bit is *not* set at the pinned address, then it means the 0x10 bit *is* set at
+            // address &pAsciiBuffer[SizeOfVector256 / 2], and we should perform one more 16-byte write to bump
             // just past the next aligned boundary address.
             if (((uint)pAsciiBuffer & (Vector256.Size / 2)) == 0)
             {
@@ -2174,7 +2170,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
                     goto Finish;
                 }
 
-                // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination.
+                // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination.
                 asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst);
                 asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
             }
@@ -2227,7 +2223,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buff
 
             // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
 
-            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
+            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector128.Size == 0, "Destination should be 128-bit-aligned.");
             asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst);
             asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
             currentOffsetInElements += Vector256.Size / 2;
@@ -2274,11 +2270,11 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
             // hit of potentially unaligned reads in order to hit this sweet spot.
 
             // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
-            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
-            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
+            // the 32 bytes previously. If the 0x20 bit is set at the pinned address, then the 32 bytes we wrote
+            // previously mean that the 0x20 bit is *not* set at address &pAsciiBuffer[SizeOfVector512 / 2]. In
             // that case we can immediately back up to the previous aligned boundary and start the main loop.
-            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
-            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
+            // If the 0x20 bit is *not* set at the pinned address, then it means the 0x20 bit *is* set at
+            // address &pAsciiBuffer[SizeOfVector512 / 2], and we should perform one more 32-byte write to bump
             // just past the next aligned boundary address.
 
             if (((uint)pAsciiBuffer & (Vector512.Size / 2)) == 0)
@@ -2345,7 +2341,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff
 
             // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
 
-            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
+            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector256.Size == 0, "Destination should be 256-bit-aligned.");
             asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst);
             asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements);
             currentOffsetInElements += Vector512.Size / 2;
@@ -2384,9 +2380,9 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
                             break;
                         }
 
-                        (Vector512<ushort> low, Vector512<ushort> upper) = Vector512.Widen(asciiVector);
-                        low.Store(pCurrentWriteAddress);
-                        upper.Store(pCurrentWriteAddress + Vector512<ushort>.Count);
+                        (Vector512<ushort> utf16LowVector, Vector512<ushort> utf16HighVector) = Vector512.Widen(asciiVector);
+                        utf16LowVector.Store(pCurrentWriteAddress);
+                        utf16HighVector.Store(pCurrentWriteAddress + Vector512<ushort>.Count);
 
                         currentOffset += (nuint)Vector512<byte>.Count;
                         pCurrentWriteAddress += (nuint)Vector512<byte>.Count;

From 17d3b28368e9afc69efcc6da3d50fef448bf7c98 Mon Sep 17 00:00:00 2001
From: Ruihan-Yin <ruihan.yin@intel.com>
Date: Fri, 14 Jul 2023 10:00:59 -0700
Subject: [PATCH 9/9] revert the changes at GetIndexOfFirstNonAsciiByte

---
 .../src/System/Text/Ascii.Utility.cs                   | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
index 9aa79da62211..e30b3d06c781 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
@@ -151,10 +151,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned.");
-                        ulong msbMask = Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits();
-                        if (msbMask != 0)
+                        if (Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0)
                         {
-                            return (nuint)pBuffer - (nuint)pOriginalBuffer + (nuint)BitOperations.TrailingZeroCount(msbMask); // found non-ASCII data
+                            break; // found non-ASCII data
                         }
 
                         pBuffer += Vector512.Size;
@@ -189,10 +188,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
                     do
                     {
                         Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned.");
-                        uint msbMask = Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits();
-                        if (msbMask != 0)
+                        if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0)
                         {
-                            return (nuint)pBuffer - (nuint)pOriginalBuffer + (nuint)BitOperations.TrailingZeroCount(msbMask); // found non-ASCII data
+                            break; // found non-ASCII data
                         }
 
                         pBuffer += Vector256.Size;