diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs index 742e2239f27f3..5cba2be3143e0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs @@ -616,13 +616,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuff [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */) { - // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized + // If SSE2/ASIMD is supported, use those specific intrinsics instead of the generic vectorized // code below. This has two benefits: (a) we can take advantage of specific instructions like // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. - return (Sse2.IsSupported) - ? GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength) + return ((Sse2.IsSupported || AdvSimd.IsSupported) && BitConverter.IsLittleEndian) + ? GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength) : GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength); } @@ -762,10 +762,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, n goto Finish; } - private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */) + private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */) { - // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method - // will be elided by JIT once we determine which specific ISAs we support. + // This method contains logic optimized using vector instructions for both x64 and Arm64. 
+ // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support. // Quick check for empty inputs. @@ -779,8 +779,8 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf>(); uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char); - Debug.Assert(Sse2.IsSupported, "Should've been checked by caller."); - Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian."); + Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Should've been checked by caller."); + Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 assumes little-endian."); Vector128 firstVector, secondVector; uint currentMask; @@ -795,27 +795,16 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII // data, we jump out of the hot paths to targets at the end of the method. - Vector128 asciiMaskForTestZ = Vector128.Create((ushort)0xFF80); // used for PTEST on supported hardware - Vector128 asciiMaskForAddSaturate = Vector128.Create((ushort)0x7F80); // used for PADDUSW - const uint NonAsciiDataSeenMask = 0b_1010_1010_1010_1010; // used for determining whether 'currentMask' contains non-ASCII data - #if SYSTEM_PRIVATE_CORELIB Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char)); #endif // Read the first vector unaligned. - firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load - - // The operation below forces the 0x8000 bit of each WORD to be set iff the WORD element - // has value >= 0x0800 (non-ASCII). Then we'll treat the vector as a BYTE vector in order - // to extract the mask. Reminder: the 0x0080 bit of each WORD should be ignored. 
- - currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte()); - - if ((currentMask & NonAsciiDataSeenMask) != 0) + firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer); + if (VectorContainsNonAsciiChar(firstVector)) { - goto FoundNonAsciiDataInCurrentMask; + goto FoundNonAsciiDataInFirstVector; } // If we have less than 32 bytes to process, just go straight to the final unaligned @@ -843,41 +832,26 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin // Adjust remaining buffer length. - bufferLength += (nuint)pOriginalBuffer; - bufferLength -= (nuint)pBuffer; + nuint numBytesRead = ((nuint)pBuffer - (nuint)pOriginalBuffer); + bufferLength -= numBytesRead; // The buffer is now properly aligned. // Read 2 vectors at a time if possible. - if (bufferLength >= 2 * SizeOfVector128InBytes) { char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes); // After this point, we no longer need to update the bufferLength value. - do { - firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer); - secondVector = Sse2.LoadAlignedVector128((ushort*)pBuffer + SizeOfVector128InChars); - Vector128 combinedVector = Sse2.Or(firstVector, secondVector); - if (Sse41.IsSupported) - { - // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data. - // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data. - if (!Sse41.TestZ(combinedVector, asciiMaskForTestZ)) - { - goto FoundNonAsciiDataInFirstOrSecondVector; - } - } - else + firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer); + secondVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer, SizeOfVector128InChars); + Vector128 combinedVector = firstVector | secondVector; + + if (VectorContainsNonAsciiChar(combinedVector)) { - // See comment earlier in the method for an explanation of how the below logic works. 
- currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(combinedVector, asciiMaskForAddSaturate).AsByte()); - if ((currentMask & NonAsciiDataSeenMask) != 0) - { - goto FoundNonAsciiDataInFirstOrSecondVector; - } + goto FoundNonAsciiDataInFirstOrSecondVector; } pBuffer += 2 * SizeOfVector128InChars; @@ -902,25 +876,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin // At least one full vector's worth of data remains, so we can safely read it. // Remember, at this point pBuffer is still aligned. - firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer); - - if (Sse41.IsSupported) - { - // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data. - // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data. - if (!Sse41.TestZ(firstVector, asciiMaskForTestZ)) - { - goto FoundNonAsciiDataInFirstVector; - } - } - else + firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer); + if (VectorContainsNonAsciiChar(firstVector)) { - // See comment earlier in the method for an explanation of how the below logic works. - currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte()); - if ((currentMask & NonAsciiDataSeenMask) != 0) - { - goto FoundNonAsciiDataInCurrentMask; - } + goto FoundNonAsciiDataInFirstVector; } IncrementCurrentOffsetBeforeFinalUnalignedVectorRead: @@ -935,25 +894,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin // We need to adjust the pointer because we're re-reading data. pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes); - firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load - - if (Sse41.IsSupported) - { - // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data. - // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data. 
- if (!Sse41.TestZ(firstVector, asciiMaskForTestZ)) - { - goto FoundNonAsciiDataInFirstVector; - } - } - else + firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer); + if (VectorContainsNonAsciiChar(firstVector)) { - // See comment earlier in the method for an explanation of how the below logic works. - currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte()); - if ((currentMask & NonAsciiDataSeenMask) != 0) - { - goto FoundNonAsciiDataInCurrentMask; - } + goto FoundNonAsciiDataInFirstVector; } pBuffer += SizeOfVector128InChars; @@ -970,21 +914,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin // vector, and if that's all-ASCII then the second vector must be the culprit. Either way // we'll make sure the first vector local is the one that contains the non-ASCII data. - // See comment earlier in the method for an explanation of how the below logic works. - if (Sse41.IsSupported) - { - if (!Sse41.TestZ(firstVector, asciiMaskForTestZ)) - { - goto FoundNonAsciiDataInFirstVector; - } - } - else + if (VectorContainsNonAsciiChar(firstVector)) { - currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte()); - if ((currentMask & NonAsciiDataSeenMask) != 0) - { - goto FoundNonAsciiDataInCurrentMask; - } + goto FoundNonAsciiDataInFirstVector; } // Wasn't the first vector; must be the second. @@ -994,29 +926,48 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin FoundNonAsciiDataInFirstVector: - // See comment earlier in the method for an explanation of how the below logic works. - currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte()); - - FoundNonAsciiDataInCurrentMask: - - // See comment earlier in the method accounting for the 0x8000 and 0x0080 bits set after the WORD-sized operations. 
- - currentMask &= NonAsciiDataSeenMask; - - // Now, the mask contains - from the LSB - a 0b00 pair for each ASCII char we saw, and a 0b10 pair for each non-ASCII char. - // - // (Keep endianness in mind in the below examples.) - // A non-ASCII char followed by two ASCII chars is 0b..._00_00_10. (tzcnt = 1) - // An ASCII char followed by two non-ASCII chars is 0b..._10_10_00. (tzcnt = 3) - // Two ASCII chars followed by a non-ASCII char is 0b..._10_00_00. (tzcnt = 5) - // - // This means tzcnt = 2 * numLeadingAsciiChars + 1. We can conveniently take advantage of the fact - // that the 2x multiplier already matches the char* stride length, then just subtract 1 at the end to - // compute the correct final ending pointer value. + if (Sse2.IsSupported) + { + // The operation below forces the 0x8000 bit of each WORD to be set iff the WORD element + // has value >= 0x0080 (non-ASCII). Then we'll treat the vector as a BYTE vector in order + // to extract the mask. Reminder: the 0x0080 bit of each WORD should be ignored. + Vector128 asciiMaskForAddSaturate = Vector128.Create((ushort)0x7F80); + const uint NonAsciiDataSeenMask = 0b_1010_1010_1010_1010; // used for determining whether 'currentMask' contains non-ASCII data - Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data."); - pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask) - 1); + currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte()); + currentMask &= NonAsciiDataSeenMask; + + // Now, the mask contains - from the LSB - a 0b00 pair for each ASCII char we saw, and a 0b10 pair for each non-ASCII char. + // + // (Keep endianness in mind in the below examples.) + // A non-ASCII char followed by two ASCII chars is 0b..._00_00_10. (tzcnt = 1) + // An ASCII char followed by two non-ASCII chars is 0b..._10_10_00. (tzcnt = 3) + // Two ASCII chars followed by a non-ASCII char is 0b..._10_00_00. 
(tzcnt = 5) + // + // This means tzcnt = 2 * numLeadingAsciiChars + 1. We can conveniently take advantage of the fact + // that the 2x multiplier already matches the char* stride length, then just subtract 1 at the end to + // compute the correct final ending pointer value. + + Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data."); + pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask) - 1); + } + else if (AdvSimd.Arm64.IsSupported) + { + // The following operation sets all the bits in a WORD to 1 where a non-ASCII char is found (otherwise to 0) + // in the vector. Then narrow each char to a byte by taking its top byte. Now the bottom-half (64-bits) + // of the vector contains 0xFF for each non-ASCII char and 0x00 for each ASCII char. We then find the index of the + // first non-ASCII char by counting the number of trailing zero bits representing ASCII chars before it. + Vector128 largestAsciiValue = Vector128.Create((ushort)0x007F); + Vector128 compareResult = AdvSimd.CompareGreaterThan(firstVector, largestAsciiValue).AsByte(); + ulong asciiCompareMask = AdvSimd.Arm64.UnzipOdd(compareResult, compareResult).AsUInt64().ToScalar(); + // Compare mask now contains 8 bits for each 16-bit char. Divide the trailing zero count by 8 to get the index of the first non-ASCII char. + pBuffer += BitOperations.TrailingZeroCount(asciiCompareMask) >> 3; + } + else + { + throw new PlatformNotSupportedException(); + } goto Finish; FoundNonAsciiDataInCurrentDWord: @@ -1491,7 +1442,7 @@ private static Vector128 ExtractAsciiVector(Vector128 vectorFirst, private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) { - // This method contains logic optimized using vector instructions for both x64 and AArch64. + // This method contains logic optimized using vector instructions for both x64 and Arm64. // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support. 
// JIT turns the below into constants @@ -1591,7 +1542,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, FoundNonAsciiDataInLoop: // Can we at least narrow the high vector? - // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works. + // See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works. if (VectorContainsNonAsciiChar(utf16VectorFirst)) { goto Finish;