Implement GetIndexOfFirstNonAsciiChar intrinsic on AArch64 (#71637)

* Implement GetIndexOfFirstNonAsciiChar intrinsic on AArch64 * Fix typo in a method name
dotnet · Jul 7, 2022 · 85d638b · 85d638b
1 parent 162cca2
commit 85d638b
Showing 1 changed file with 69 additions and 118 deletions.
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
@@ -616,13 +616,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuff
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */)
         {
-            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
+            // If SSE2/ASIMD is supported, use those specific intrinsics instead of the generic vectorized
             // code below. This has two benefits: (a) we can take advantage of specific instructions like
             // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
             // this method is running.
 
-            return (Sse2.IsSupported)
-                ? GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength)
+            return ((Sse2.IsSupported || AdvSimd.IsSupported) && BitConverter.IsLittleEndian)
+                ? GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength)
                 : GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
         }
 
@@ -762,10 +762,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, n
             goto Finish;
         }
 
-        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
+        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */)
         {
-            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
-            // will be elided by JIT once we determine which specific ISAs we support.
+            // This method contains logic optimized using vector instructions for both x64 and Arm64.
+            // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support.
 
             // Quick check for empty inputs.
 
@@ -779,8 +779,8 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
             uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
             uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
 
-            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
-            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
+            Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Should've been checked by caller.");
+            Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 assumes little-endian.");
 
             Vector128<ushort> firstVector, secondVector;
             uint currentMask;
@@ -795,27 +795,16 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
             // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
             // data, we jump out of the hot paths to targets at the end of the method.
 
-            Vector128<ushort> asciiMaskForTestZ = Vector128.Create((ushort)0xFF80); // used for PTEST on supported hardware
-            Vector128<ushort> asciiMaskForAddSaturate = Vector128.Create((ushort)0x7F80); // used for PADDUSW
-            const uint NonAsciiDataSeenMask = 0b_1010_1010_1010_1010; // used for determining whether 'currentMask' contains non-ASCII data
-
 #if SYSTEM_PRIVATE_CORELIB
             Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
 #endif
 
             // Read the first vector unaligned.
 
-            firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load
-
-            // The operation below forces the 0x8000 bit of each WORD to be set iff the WORD element
-            // has value >= 0x0800 (non-ASCII). Then we'll treat the vector as a BYTE vector in order
-            // to extract the mask. Reminder: the 0x0080 bit of each WORD should be ignored.
-
-            currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
-
-            if ((currentMask & NonAsciiDataSeenMask) != 0)
+            firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
+            if (VectorContainsNonAsciiChar(firstVector))
             {
-                goto FoundNonAsciiDataInCurrentMask;
+                goto FoundNonAsciiDataInFirstVector;
             }
 
             // If we have less than 32 bytes to process, just go straight to the final unaligned
@@ -843,41 +832,26 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
 
             // Adjust remaining buffer length.
 
-            bufferLength += (nuint)pOriginalBuffer;
-            bufferLength -= (nuint)pBuffer;
+            nuint numBytesRead = ((nuint)pBuffer - (nuint)pOriginalBuffer);
+            bufferLength -= numBytesRead;
 
             // The buffer is now properly aligned.
             // Read 2 vectors at a time if possible.
-
             if (bufferLength >= 2 * SizeOfVector128InBytes)
             {
                 char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
 
                 // After this point, we no longer need to update the bufferLength value.
-
                 do
                 {
-                    firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
-                    secondVector = Sse2.LoadAlignedVector128((ushort*)pBuffer + SizeOfVector128InChars);
-                    Vector128<ushort> combinedVector = Sse2.Or(firstVector, secondVector);
 
-                    if (Sse41.IsSupported)
-                    {
-                        // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                        // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                        if (!Sse41.TestZ(combinedVector, asciiMaskForTestZ))
-                        {
-                            goto FoundNonAsciiDataInFirstOrSecondVector;
-                        }
-                    }
-                    else
+                    firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
+                    secondVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer, SizeOfVector128InChars);
+                    Vector128<ushort> combinedVector = firstVector | secondVector;
+
+                    if (VectorContainsNonAsciiChar(combinedVector))
                     {
-                        // See comment earlier in the method for an explanation of how the below logic works.
-                        currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(combinedVector, asciiMaskForAddSaturate).AsByte());
-                        if ((currentMask & NonAsciiDataSeenMask) != 0)
-                        {
-                            goto FoundNonAsciiDataInFirstOrSecondVector;
-                        }
+                        goto FoundNonAsciiDataInFirstOrSecondVector;
                     }
 
                     pBuffer += 2 * SizeOfVector128InChars;
@@ -902,25 +876,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
             // At least one full vector's worth of data remains, so we can safely read it.
             // Remember, at this point pBuffer is still aligned.
 
-            firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
-
-            if (Sse41.IsSupported)
-            {
-                // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
-                {
-                    goto FoundNonAsciiDataInFirstVector;
-                }
-            }
-            else
+            firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
+            if (VectorContainsNonAsciiChar(firstVector))
             {
-                // See comment earlier in the method for an explanation of how the below logic works.
-                currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
-                if ((currentMask & NonAsciiDataSeenMask) != 0)
-                {
-                    goto FoundNonAsciiDataInCurrentMask;
-                }
+                goto FoundNonAsciiDataInFirstVector;
             }
 
         IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
@@ -935,25 +894,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
                 // We need to adjust the pointer because we're re-reading data.
 
                 pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
-                firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load
-
-                if (Sse41.IsSupported)
-                {
-                    // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                    // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                    if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
-                    {
-                        goto FoundNonAsciiDataInFirstVector;
-                    }
-                }
-                else
+                firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
+                if (VectorContainsNonAsciiChar(firstVector))
                 {
-                    // See comment earlier in the method for an explanation of how the below logic works.
-                    currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
-                    if ((currentMask & NonAsciiDataSeenMask) != 0)
-                    {
-                        goto FoundNonAsciiDataInCurrentMask;
-                    }
+                    goto FoundNonAsciiDataInFirstVector;
                 }
 
                 pBuffer += SizeOfVector128InChars;
@@ -970,21 +914,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
             // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
             // we'll make sure the first vector local is the one that contains the non-ASCII data.
 
-            // See comment earlier in the method for an explanation of how the below logic works.
-            if (Sse41.IsSupported)
-            {
-                if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
-                {
-                    goto FoundNonAsciiDataInFirstVector;
-                }
-            }
-            else
+            if (VectorContainsNonAsciiChar(firstVector))
             {
-                currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
-                if ((currentMask & NonAsciiDataSeenMask) != 0)
-                {
-                    goto FoundNonAsciiDataInCurrentMask;
-                }
+                goto FoundNonAsciiDataInFirstVector;
             }
 
             // Wasn't the first vector; must be the second.
@@ -994,29 +926,48 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
 
         FoundNonAsciiDataInFirstVector:
 
-            // See comment earlier in the method for an explanation of how the below logic works.
-            currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
-
-        FoundNonAsciiDataInCurrentMask:
-
-            // See comment earlier in the method accounting for the 0x8000 and 0x0080 bits set after the WORD-sized operations.
-
-            currentMask &= NonAsciiDataSeenMask;
-
-            // Now, the mask contains - from the LSB - a 0b00 pair for each ASCII char we saw, and a 0b10 pair for each non-ASCII char.
-            //
-            // (Keep endianness in mind in the below examples.)
-            // A non-ASCII char followed by two ASCII chars is 0b..._00_00_10. (tzcnt = 1)
-            // An ASCII char followed by two non-ASCII chars is 0b..._10_10_00. (tzcnt = 3)
-            // Two ASCII chars followed by a non-ASCII char is 0b..._10_00_00. (tzcnt = 5)
-            //
-            // This means tzcnt = 2 * numLeadingAsciiChars + 1. We can conveniently take advantage of the fact
-            // that the 2x multiplier already matches the char* stride length, then just subtract 1 at the end to
-            // compute the correct final ending pointer value.
+            if (Sse2.IsSupported)
+            {
+                // The operation below forces the 0x8000 bit of each WORD to be set iff the WORD element
+                // has value >= 0x0080 (non-ASCII). Then we'll treat the vector as a BYTE vector in order
+                // to extract the mask. Reminder: the 0x0080 bit of each WORD should be ignored.
+                Vector128<ushort> asciiMaskForAddSaturate = Vector128.Create((ushort)0x7F80);
+                const uint NonAsciiDataSeenMask = 0b_1010_1010_1010_1010; // used for determining whether 'currentMask' contains non-ASCII data
 
-            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask) - 1);
+                currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
+                currentMask &= NonAsciiDataSeenMask;
+
+                // Now, the mask contains - from the LSB - a 0b00 pair for each ASCII char we saw, and a 0b10 pair for each non-ASCII char.
+                //
+                // (Keep endianness in mind in the below examples.)
+                // A non-ASCII char followed by two ASCII chars is 0b..._00_00_10. (tzcnt = 1)
+                // An ASCII char followed by two non-ASCII chars is 0b..._10_10_00. (tzcnt = 3)
+                // Two ASCII chars followed by a non-ASCII char is 0b..._10_00_00. (tzcnt = 5)
+                //
+                // This means tzcnt = 2 * numLeadingAsciiChars + 1. We can conveniently take advantage of the fact
+                // that the 2x multiplier already matches the char* stride length, then just subtract 1 at the end to
+                // compute the correct final ending pointer value.
+
+                Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
+                pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask) - 1);
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                // The following operation sets all the bits in a WORD to 1 where a non-ASCII char is found (otherwise to 0)
+                // in the vector. Then narrow each char to a byte by taking its top byte. Now the bottom half (64 bits)
+                // of the vector contains 0xFF for each non-ASCII char and 0x00 for each ASCII char. We then find the index
+                // of the first non-ASCII char by counting the trailing zero bits contributed by the ASCII chars before it.
 
+                Vector128<ushort> largestAsciiValue = Vector128.Create((ushort)0x007F);
+                Vector128<byte> compareResult = AdvSimd.CompareGreaterThan(firstVector, largestAsciiValue).AsByte();
+                ulong asciiCompareMask = AdvSimd.Arm64.UnzipOdd(compareResult, compareResult).AsUInt64().ToScalar();
+                // The compare mask now contains 8 bits for each 16-bit char, so divide the trailing zero count by 8 to get the index of the first non-ASCII char.
+                pBuffer += BitOperations.TrailingZeroCount(asciiCompareMask) >> 3;
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
             goto Finish;
 
         FoundNonAsciiDataInCurrentDWord:
@@ -1491,7 +1442,7 @@ private static Vector128<byte> ExtractAsciiVector(Vector128<ushort> vectorFirst,
 
         private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
         {
-            // This method contains logic optimized using vector instructions for both x64 and AArch64.
+            // This method contains logic optimized using vector instructions for both x64 and Arm64.
             // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support.
 
             // JIT turns the below into constants
@@ -1591,7 +1542,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer,
         FoundNonAsciiDataInLoop:
 
             // Can we at least narrow the high vector?
-            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
+            // See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works.
             if (VectorContainsNonAsciiChar(utf16VectorFirst))
             {
                 goto Finish;