Get index of first non ascii byte #39506

Merged
merged 14 commits into from
Aug 13, 2020
198 changes: 165 additions & 33 deletions src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
@@ -42,6 +42,28 @@ private static bool AllCharsInUInt64AreAscii(ulong value)
return (value & ~0x007F007F_007F007Ful) == 0;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128<byte> value, Vector128<byte> bitmask)
{
if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
{
throw new PlatformNotSupportedException();
}

// extractedBits[i] = (value[i] >> 7) & (1 << (12 * (i % 2)));
Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitmask);

// collapse mask to lower bits
extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
ulong mask = extractedBits.AsUInt64().ToScalar();

// calculate the index
int index = BitOperations.TrailingZeroCount(mask) >> 2;
Debug.Assert((mask != 0) ? index < 16 : index >= 16);
Contributor

I found a way to find the index of the first non-ASCII byte in a Vector128<byte> value that doesn't require a bitmask.
Here is a sketch of how this can be accomplished.

           // value: Vector128<byte>
           // Vn.B[15] : [ b15 * * * * * * * ]
           // Vn.B[14] : [ b14 * * * * * * * ]

           Vector128<ushort> value2 = value.AsUInt16();
           // Vn.H[7] : [ b15 * * * * * * * b14 * * * * * * * ]
           // Vn.H[6] : [ b13 * * * * * * * b12 * * * * * * * ]

           Vector128<ushort> shiftToRightAndInsert = AdvSimd.ShiftRightAndInsert(value2, value2, 9);
           // Vn.H[7] : [ b15 * * * * * * * b14 b15 * * * * * * ]
           // Vn.H[6] : [ b13 * * * * * * * b12 b13 * * * * * * ]

           Vector64<ushort> shiftToRightAndInsert2 = AdvSimd.ExtractNarrowingLower(shiftToRightAndInsert).AsUInt16();
           // Vn.B[7] : [ b14 b15 * * * * * * ]
           // Vn.B[6] : [ b12 b13 * * * * * * ]
           
           // Vn.H[3] : [ b14 b15 * * * * * * b12 b13 * * * * * * ]
           // Vn.H[2] : [ b10 b11 * * * * * * b08 b09 * * * * * * ]

           Vector64<ushort> shiftToRightAndInsert3 = AdvSimd.ShiftRightAndInsert(shiftToRightAndInsert2, shiftToRightAndInsert2, 10);
           // Vn.H[3] : [ b14 b15 * * * * * * b12 b13 b14 b15 * * * * ]
           // Vn.H[2] : [ b10 b11 * * * * * * b08 b09 b10 b11 * * * * ]

           Vector64<ushort> shiftToRightAndInsert4 = AdvSimd.ExtractNarrowingLower(shiftToRightAndInsert3.ToVector128Unsafe()).AsUInt16();
           // Vn.B[3] : [ b12 b13 b14 b15 * * * * ]
           // Vn.B[2] : [ b08 b09 b10 b11 * * * * ]

           // Vn.H[1] : [ b12 b13 b14 b15 * * * * b08 b09 b10 b11 * * * * ]
           // Vn.H[0] : [ b04 b05 b06 b07 * * * * b00 b01 b02 b03 * * * * ]

           Vector64<uint> shiftToRightAndInsert5 = AdvSimd.ShiftRightAndInsert(shiftToRightAndInsert4, shiftToRightAndInsert4, 12).AsUInt32();
           // Vn.H[1] : [ b12 b13 b14 b15 * * * * b08 b09 b10 b11 b12 b13 b14 b15 ]
           // Vn.H[0] : [ b04 b05 b06 b07 * * * * b00 b01 b02 b03 b04 b05 b06 b07 ]

           // Vn.S[0] : [ b12 b13 b14 b15 * * * * b08 b09 b10 b11 b12 b13 b14 b15 b04 b05 b06 b07 * * * * b00 b01 b02 b03 b04 b05 b06 b07 ]

           Vector64<ushort> shiftToLeftAndInsert = AdvSimd.ShiftLeftAndInsert(shiftToRightAndInsert5, shiftToRightAndInsert5, 24).AsUInt16();
           // Vn.S[0] : [ b00 b01 b02 b03 b04 b05 b06 b07 b08 b09 b10 b11 b12 b13 b14 b15 b04 b05 b06 b07 * * * * b00 b01 b02 b03 b04 b05 b06 b07 ]

           // Vn.H[1] : [ b00 b01 b02 b03 b04 b05 b06 b07 b08 b09 b10 b11 b12 b13 b14 b15 ]

           ushort index = AdvSimd.LeadingZeroCount(shiftToLeftAndInsert).GetElement(1);

Here is a link to proof-of-concept code: https://gist.github.com/echesakovMSFT/b27ed28024091472db6d3ca007f34a6d#file-program-cs

The code that the JIT generates for the part computing the index is as follows:

        4EB01E11          mov     v17.16b, v16.16b
        6F174611          sri     v17.8h, v16.8h, #9
        0E212A30          xtn     v16.8b, v17.8h
        0EB01E11          mov     v17.8b, v16.8b
        2F164611          sri     v17.4h, v16.4h, #10
        0EB11E30          mov     v16.8b, v17.8b
        0E212A10          xtn     v16.8b, v16.8h
        0EB01E11          mov     v17.8b, v16.8b
        2F144611          sri     v17.4h, v16.4h, #12
        0EB11E30          mov     v16.8b, v17.8b
        2F385630          sli     v16.2s, v17.2s, #24
        2E604A10          clz     v16.4h, v16.4h
        0E063E00          umov    w0, v16.h[1]

The five movs are redundant and shouldn't be there - I am working on analysing why the JIT emits them (I suspect this is how we handle BuildDelayFreeUses). Theoretically, the code should contain only 3 x sri, 2 x xtn, 1 x sli, 1 x clz and 1 x umov instructions.

@TamarChristinaArm Do you see any issue with this approach? Aside from only supporting little-endian.

Contributor
@echesakov Aug 12, 2020

This code also allows computing a mask of non-ASCII bytes by replacing

ushort index = AdvSimd.LeadingZeroCount(shiftToLeftAndInsert).GetElement(1);

with

ushort mask = shiftToLeftAndInsert.GetElement(1);

Although the mask will be reversed in this case: mask.15 corresponds to lane[0].7 and mask.0 corresponds to lane[15].7.
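(A hedged sketch, not code from this PR: recovering the index from that reversed mask with a leading-zero count; the -16 adjustment accounts for widening the ushort to 32 bits.)

           ushort mask = shiftToLeftAndInsert.GetElement(1);

           // mask.15 corresponds to byte 0 and mask.0 to byte 15, so the index of the first
           // non-ASCII byte is the number of leading zeros in the 16-bit mask.
           // BitOperations.LeadingZeroCount takes a uint, hence the -16; a result of 16
           // means "no non-ASCII byte in this lane", matching the existing helper's convention.
           int index = BitOperations.LeadingZeroCount(mask) - 16;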

Contributor Author
@pgovind Aug 12, 2020

I'd actually like to get this PR in soon, considering our deadline is Friday. I'm thinking I'll file a follow-up issue to investigate this improvement and the improvements suggested in #39507. How does that sound?

Contributor

How does that sound?

@pgovind Sure; to clarify, I was not suggesting incorporating the algorithm right now. I understand that this would be an unreasonable ask.

But I can explain the motivation and how I discovered this approach. I was trying to find a way to avoid doing this in your code:

            ulong mask = extractedBits.AsUInt64().ToScalar();

            // calculate the index
            int index = BitOperations.TrailingZeroCount(mask) >> 2;

since a call to BitOperations.TrailingZeroCount(mask) is not free, and to use AdvSimd.LeadingZeroCount instead.

It turns out that I ended up with a completely different approach.
We can follow up on this in .NET 6.

Contributor
@TamarChristinaArm Aug 13, 2020

@echesakovMSFT In and of itself the TrailingZeroCount isn't that bad, because the integer version of clz is twice as fast as the SIMD version of clz in your example. So the rbit+clz on the general-register side is about as efficient as the clz on the SIMD side from your example (looking at a Neoverse N1).

The example you have, while it avoids having to load the mask, ends up with 7 instructions that are 2 cycles each, and the sri and sli go down the same pipe so they can't be executed in parallel.

So I think if you amortize the cost of the mask, that sequence will end up being slower.

That said, you can improve this sequence further. The issue here isn't the use of a mask, it's the use of a mask that can't be created in a register. Instead take a look at the sequence I pasted in #39507 (comment), which also avoids the TrailingZeroCount (on big endian) by using a different mask (which is also created without requiring a load). That is a sequence of 6 cycles and only blocks V1 once.

But looking at the overall algorithm, the majority of the cases don't require the actual index yet (by which I mean you only check whether it contains one or not...). The helper function should really end at the .ToScalar(), and ContainsNonAsciiByte_AdvSimd should really just be != 0.
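(A minimal sketch of that restructuring, assuming the bitmask extraction from this PR; the helper names here are illustrative only, not code from this change.)

           [MethodImpl(MethodImplOptions.AggressiveInlining)]
           private static ulong GetNonAsciiSyndrome_AdvSimd(Vector128<byte> value, Vector128<byte> bitmask)
           {
               // Same extraction as GetIndexOfFirstNonAsciiByteInLane_AdvSimd, but stop at the scalar.
               Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
               Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitmask);
               extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
               return extractedBits.AsUInt64().ToScalar();
           }

           private static bool ContainsNonAsciiByte_AdvSimd(ulong syndrome) => syndrome != 0;

           // Only on the slow path, once the syndrome is known to be non-zero:
           // int index = BitOperations.TrailingZeroCount(syndrome) >> 2;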

Also, what is the expected number of loop iterations? If you expect the majority of the characters to be ASCII then the implementation is not really taking advantage of that.

	currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(firstVector, bitmask);
	secondAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(secondVector, bitmask);
	if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex) || ContainsNonAsciiByte_AdvSimd(secondAdvSimdIndex))

You can test both vectors together without needing a mask. Essentially what you want is a slightly modified version of strlen https://github.com/ARM-software/optimized-routines/blob/224cb5f67b71757b99fe1e10b5a437c17a1d733c/string/aarch64/strlen.S#L144

	ldp	currentAdvSimdIndex, secondAdvSimdIndex, [src, 32]!
	sminp	maskv.16b, currentAdvSimdIndex.16b, secondAdvSimdIndex.16b
	sminp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	tst     synd, 0x8080808080808080

Now when the synd test succeeds you know you have an element there, and you have to find which vector the match is in.

You can easily do this by testing the low part of synd, so if synd is x1 you test w1 using tst w1, 0x80808080.
If that succeeds you know it's in currentAdvSimdIndex, otherwise in secondAdvSimdIndex.
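(A hedged C# transliteration of the combined test above; the intrinsic choices are my reading of the assembly, not code from this PR.)

           // Treat the bytes as signed so non-ASCII bytes (MSB set) are negative, then fold
           // both vectors down to one 64-bit syndrome with pairwise signed mins (sminp).
           Vector128<sbyte> combined = AdvSimd.Arm64.MinPairwise(firstVector.AsSByte(), secondVector.AsSByte());
           combined = AdvSimd.Arm64.MinPairwise(combined, combined);
           ulong synd = combined.AsUInt64().ToScalar();

           // Any byte with its most significant bit set means one of the 32 input bytes was non-ASCII.
           bool containsNonAscii = (synd & 0x8080_8080_8080_8080ul) != 0;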

After this you implement the index-finding part:

	sshr	v1.16b, v1.16b, 7
	bic	v1.8h, 0x0f, lsl 8
	umaxp	v1.16b, v1.16b, v1.16b
	fmov	x1, d1
	rbit	x1, x1
	clz	x0, x1

Now you still need the >> 2, but at the point you do this you're about to add to the offset:

pBuffer += currentAdvSimdIndex;

Which should allow the JIT to lower the >> 2 into the add as

add	len, len, x0, lsr 2

This avoids you needing the mask (it does need some minor tweaking for big-endian) but should be significantly smaller than what is currently being generated, and should be a couple of orders of magnitude faster.
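(And a hedged C# sketch of that index-finding tail, where vector stands for whichever of the two vectors the syndrome test pointed at; the bic immediate is expressed here as an And with a constant, so this is illustrative rather than what the JIT would emit.)

           // Every non-ASCII byte becomes 0xFF (sshr #7; cmlt #0 works too, see below).
           Vector128<byte> detected = AdvSimd.ShiftRightArithmetic(vector.AsSByte(), 7).AsByte();

           // Clear bits 8..11 of each 16-bit lane (the bic in the assembly), so even bytes
           // stay 0xFF/0x00 and odd bytes become 0xF0/0x00.
           Vector128<ushort> cleared = AdvSimd.And(detected.AsUInt16(), Vector128.Create((ushort)0xF0FF));

           // umaxp folds each byte pair into one byte; the lower 64 bits now carry one
           // nibble of information per original byte.
           Vector128<byte> folded = AdvSimd.Arm64.MaxPairwise(cleared.AsByte(), cleared.AsByte());
           ulong syndrome = folded.AsUInt64().ToScalar();

           // TrailingZeroCount lowers to rbit + clz on AArch64; >> 2 converts the nibble
           // position into the byte index and can fold into the following add.
           pBuffer += (uint)BitOperations.TrailingZeroCount(syndrome) >> 2;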

Contributor

@TamarChristinaArm Thank you a lot for your detailed explanation! Going forward we should definitely capture your analysis here and in the other PR.

Contributor

No problem. Also, FYI, for maximum throughput, instead of sshr v1.16b, v1.16b, 7 you can interpret it as a signed number and instead do

cmlt    v0.16b, v0.16b, #0

Which has the same latency as sshr but isn't restricted to one pipe. That's a slightly better sequence than what I pasted in #39507 (comment) and above.

Contributor

And you do need an rbit above for little endian; I'll update the comments. For this case an LD1 with two regs may be faster than an LDP, but that needs some benchmarking; it does make it easier to handle the endian differences.

Contributor

No problem. Also, FYI, for maximum throughput, instead of sshr v1.16b, v1.16b, 7 you can interpret it as a signed number and instead do

cmlt    v0.16b, v0.16b, #0

Which has the same latency as sshr but isn't restricted to one pipe. That's a slightly better sequence than what I pasted in #39507 (comment) and above.

Actually, we haven't had a chance to finish #33972 in .NET 5 - so, in this case, using Vector64/128<T>.Zero would require zeroing a register and using the 3-reg form of cmlt - this is why I suggested using sshr in the first place.

Contributor

Actually, we haven't had a chance to finish #33972 in .NET 5 - so, in this case, using Vector64/128<T>.Zero would require zeroing a register and using the 3-reg form of cmlt - this is why I suggested using sshr in the first place.

Hmm, well, you can hoist the use of Vector64/128<T>.Zero outside the loop so you pay for it only once, and assuming you don't run out of registers it should still be better, as sshr is restricted to V1 while cmlt can go in any pipe. So in a non-micro benchmark the cmlt sequence would likely finish earlier.
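(A hedged sketch of that hoisting; names are illustrative and the actual codegen depends on how #33972 progresses.)

           // Materialized once, before the hot loop, so the zero register is paid for a single time.
           Vector128<sbyte> zero = Vector128<sbyte>.Zero;

           // Inside the loop body: cmlt has the same latency as sshr #7 but can issue on any
           // SIMD pipe, so in a non-micro benchmark it should retire earlier.
           Vector128<sbyte> detected = AdvSimd.CompareLessThan(AdvSimd.LoadVector128(pBuffer).AsSByte(), zero);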

return index;
}

/// <summary>
/// Given a DWORD which represents two packed chars in machine-endian order,
/// returns <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
@@ -67,8 +89,8 @@ public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
// pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
// this method is running.

return (Sse2.IsSupported)
? GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength)
return (Sse2.IsSupported || AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)
? GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength)
: GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
}

@@ -215,17 +237,38 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength)
goto Finish;
}

private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ContainsNonAsciiByte_Sse2(uint sseMask)
{
Debug.Assert(sseMask != uint.MaxValue);
Debug.Assert(Sse2.IsSupported);
return sseMask != 0;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ContainsNonAsciiByte_AdvSimd(uint advSimdIndex)
{
Debug.Assert(advSimdIndex != uint.MaxValue);
Debug.Assert(AdvSimd.IsSupported);
return advSimdIndex < 16;
}

private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuffer, nuint bufferLength)
{
// JIT turns the below into constants

uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Sse2 or AdvSimd64 required.");
Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 implementation assumes little-endian.");

uint currentMask, secondMask;
Vector128<byte> bitmask = BitConverter.IsLittleEndian ?
Member

I'm assuming the bitmask is intentionally defined as a local?

Member

You throw PlatformNotSupportedException() inside GetIndexOfFirstNonAsciiByteInLane() if !BitConverter.IsLittleEndian. Why are you creating the bitMask for that case to begin with?

Contributor Author
@pgovind Aug 10, 2020

I'm assuming the bitmask is intentionally defined as a local?

Yup. Declaring it static won't work because of initialization issues in either .NET Standard or .NET Framework (I don't remember which; I just remember @carlossanlop's conclusion).

Why are you creating the bitMask for that case to begin with?

Yup, the idea is that this part will just work if we start supporting big-endian environments.

Vector128.Create((ushort)0x1001).AsByte() :
Vector128.Create((ushort)0x0110).AsByte();

uint currentSseMask = uint.MaxValue, secondSseMask = uint.MaxValue;
uint currentAdvSimdIndex = uint.MaxValue, secondAdvSimdIndex = uint.MaxValue;
byte* pOriginalBuffer = pBuffer;

// This method is written such that control generally flows top-to-bottom, avoiding
@@ -240,11 +283,25 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)

// Read the first vector unaligned.

currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load

if (currentMask != 0)
if (Sse2.IsSupported)
{
goto FoundNonAsciiDataInCurrentMask;
currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
if (ContainsNonAsciiByte_Sse2(currentSseMask))
{
goto FoundNonAsciiDataInCurrentChunk;
}
}
else if (AdvSimd.Arm64.IsSupported)
{
currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load
if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
{
goto FoundNonAsciiDataInCurrentChunk;
}
}
else
{
throw new PlatformNotSupportedException();
}

// If we have less than 32 bytes to process, just go straight to the final unaligned
@@ -281,15 +338,33 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)

do
{
Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
if (Sse2.IsSupported)
{
Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);

currentMask = (uint)Sse2.MoveMask(firstVector);
secondMask = (uint)Sse2.MoveMask(secondVector);
currentSseMask = (uint)Sse2.MoveMask(firstVector);
secondSseMask = (uint)Sse2.MoveMask(secondVector);
if (ContainsNonAsciiByte_Sse2(currentSseMask | secondSseMask))
{
goto FoundNonAsciiDataInInnerLoop;
}
}
else if (AdvSimd.Arm64.IsSupported)
{
Vector128<byte> firstVector = AdvSimd.LoadVector128(pBuffer);
Vector128<byte> secondVector = AdvSimd.LoadVector128(pBuffer + SizeOfVector128);

if ((currentMask | secondMask) != 0)
currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(firstVector, bitmask);
secondAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(secondVector, bitmask);
if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex) || ContainsNonAsciiByte_AdvSimd(secondAdvSimdIndex))
{
goto FoundNonAsciiDataInInnerLoop;
}
}
else
{
goto FoundNonAsciiDataInInnerLoop;
throw new PlatformNotSupportedException();
}

pBuffer += 2 * SizeOfVector128;
@@ -313,10 +388,25 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
// At least one full vector's worth of data remains, so we can safely read it.
// Remember, at this point pBuffer is still aligned.

currentMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
if (currentMask != 0)
if (Sse2.IsSupported)
{
goto FoundNonAsciiDataInCurrentMask;
currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
if (ContainsNonAsciiByte_Sse2(currentSseMask))
{
goto FoundNonAsciiDataInCurrentChunk;
}
}
else if (AdvSimd.Arm64.IsSupported)
{
currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask);
if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
{
goto FoundNonAsciiDataInCurrentChunk;
}
}
else
{
throw new PlatformNotSupportedException();
}

IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
@@ -332,17 +422,33 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)

pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128;

currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
if (currentMask != 0)
if (Sse2.IsSupported)
{
goto FoundNonAsciiDataInCurrentMask;
currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
if (ContainsNonAsciiByte_Sse2(currentSseMask))
{
goto FoundNonAsciiDataInCurrentChunk;
}

}
else if (AdvSimd.Arm64.IsSupported)
{
currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load
if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
{
goto FoundNonAsciiDataInCurrentChunk;
}

}
else
{
throw new PlatformNotSupportedException();
}

pBuffer += SizeOfVector128;
}

Finish:

return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done!

FoundNonAsciiDataInInnerLoop:
@@ -351,20 +457,46 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
// instead be the second mask. If so, skip the entire first mask and drain ASCII bytes
// from the second mask.

if (currentMask == 0)
if (Sse2.IsSupported)
{
pBuffer += SizeOfVector128;
currentMask = secondMask;
if (!ContainsNonAsciiByte_Sse2(currentSseMask))
{
pBuffer += SizeOfVector128;
currentSseMask = secondSseMask;
}
}
else if (AdvSimd.IsSupported)
{
if (!ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
{
pBuffer += SizeOfVector128;
currentAdvSimdIndex = secondAdvSimdIndex;
}
}
else
{
throw new PlatformNotSupportedException();
}
FoundNonAsciiDataInCurrentChunk:

Member

Same for this label; it seems like duplicate computations are being made.

Contributor Author

Hmmm, not really. Both the SSE and AdvSimd paths can come here, so we need to special-case this label for both paths.

FoundNonAsciiDataInCurrentMask:

// The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
// Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
// available, we'll fall back to a normal loop.

Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
pBuffer += (uint)BitOperations.TrailingZeroCount(currentMask);
if (Sse2.IsSupported)
{
// The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
// Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
// available, we'll fall back to a normal loop.
Debug.Assert(ContainsNonAsciiByte_Sse2(currentSseMask), "Shouldn't be here unless we see non-ASCII data.");
pBuffer += (uint)BitOperations.TrailingZeroCount(currentSseMask);
}
else if (AdvSimd.Arm64.IsSupported)
{
Debug.Assert(ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex), "Shouldn't be here unless we see non-ASCII data.");
pBuffer += currentAdvSimdIndex;
}
else
{
throw new PlatformNotSupportedException();
}

goto Finish;
