Skip to content

Commit

Permalink
Implement GetIndexOfFirstNonAsciiChar intrinsic on AArch64 (#71637)
Browse files Browse the repository at this point in the history
* Implement GetIndexOfFirstNonAsciiChar intrinsic on AArch64

* Fix typo in a method name
  • Loading branch information
SwapnilGaikwad committed Jul 7, 2022
1 parent 162cca2 commit 85d638b
Showing 1 changed file with 69 additions and 118 deletions.
187 changes: 69 additions & 118 deletions src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
Original file line number Diff line number Diff line change
Expand Up @@ -616,13 +616,13 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuff
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */)
{
// If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
// If SSE2/ASIMD is supported, use those specific intrinsics instead of the generic vectorized
// code below. This has two benefits: (a) we can take advantage of specific instructions like
// pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
// this method is running.

return (Sse2.IsSupported)
? GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength)
return ((Sse2.IsSupported || AdvSimd.IsSupported) && BitConverter.IsLittleEndian)
? GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength)
: GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
}

Expand Down Expand Up @@ -762,10 +762,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, n
goto Finish;
}

private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */)
{
// This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
// will be elided by JIT once we determine which specific ISAs we support.
// This method contains logic optimized using vector instructions for both x64 and Arm64.
// Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support.

// Quick check for empty inputs.

Expand All @@ -779,8 +779,8 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);

Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Should've been checked by caller.");
Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 assumes little-endian.");

Vector128<ushort> firstVector, secondVector;
uint currentMask;
Expand All @@ -795,27 +795,16 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
// jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
// data, we jump out of the hot paths to targets at the end of the method.

Vector128<ushort> asciiMaskForTestZ = Vector128.Create((ushort)0xFF80); // used for PTEST on supported hardware
Vector128<ushort> asciiMaskForAddSaturate = Vector128.Create((ushort)0x7F80); // used for PADDUSW
const uint NonAsciiDataSeenMask = 0b_1010_1010_1010_1010; // used for determining whether 'currentMask' contains non-ASCII data

#if SYSTEM_PRIVATE_CORELIB
Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
#endif

// Read the first vector unaligned.

firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load

// The operation below forces the 0x8000 bit of each WORD to be set iff the WORD element
// has value >= 0x0080 (non-ASCII). Then we'll treat the vector as a BYTE vector in order
// to extract the mask. Reminder: the 0x0080 bit of each WORD should be ignored.

currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());

if ((currentMask & NonAsciiDataSeenMask) != 0)
firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
if (VectorContainsNonAsciiChar(firstVector))
{
goto FoundNonAsciiDataInCurrentMask;
goto FoundNonAsciiDataInFirstVector;
}

// If we have less than 32 bytes to process, just go straight to the final unaligned
Expand Down Expand Up @@ -843,41 +832,26 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin

// Adjust remaining buffer length.

bufferLength += (nuint)pOriginalBuffer;
bufferLength -= (nuint)pBuffer;
nuint numBytesRead = ((nuint)pBuffer - (nuint)pOriginalBuffer);
bufferLength -= numBytesRead;

// The buffer is now properly aligned.
// Read 2 vectors at a time if possible.

if (bufferLength >= 2 * SizeOfVector128InBytes)
{
char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);

// After this point, we no longer need to update the bufferLength value.

do
{
firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
secondVector = Sse2.LoadAlignedVector128((ushort*)pBuffer + SizeOfVector128InChars);
Vector128<ushort> combinedVector = Sse2.Or(firstVector, secondVector);

if (Sse41.IsSupported)
{
// If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
// Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
if (!Sse41.TestZ(combinedVector, asciiMaskForTestZ))
{
goto FoundNonAsciiDataInFirstOrSecondVector;
}
}
else
firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
secondVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer, SizeOfVector128InChars);
Vector128<ushort> combinedVector = firstVector | secondVector;

if (VectorContainsNonAsciiChar(combinedVector))
{
// See comment earlier in the method for an explanation of how the below logic works.
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(combinedVector, asciiMaskForAddSaturate).AsByte());
if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInFirstOrSecondVector;
}
goto FoundNonAsciiDataInFirstOrSecondVector;
}

pBuffer += 2 * SizeOfVector128InChars;
Expand All @@ -902,25 +876,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
// At least one full vector's worth of data remains, so we can safely read it.
// Remember, at this point pBuffer is still aligned.

firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);

if (Sse41.IsSupported)
{
// If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
// Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
{
goto FoundNonAsciiDataInFirstVector;
}
}
else
firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
if (VectorContainsNonAsciiChar(firstVector))
{
// See comment earlier in the method for an explanation of how the below logic works.
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInCurrentMask;
}
goto FoundNonAsciiDataInFirstVector;
}

IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
Expand All @@ -935,25 +894,10 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
// We need to adjust the pointer because we're re-reading data.

pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load

if (Sse41.IsSupported)
{
// If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
// Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
{
goto FoundNonAsciiDataInFirstVector;
}
}
else
firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer);
if (VectorContainsNonAsciiChar(firstVector))
{
// See comment earlier in the method for an explanation of how the below logic works.
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInCurrentMask;
}
goto FoundNonAsciiDataInFirstVector;
}

pBuffer += SizeOfVector128InChars;
Expand All @@ -970,21 +914,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
// vector, and if that's all-ASCII then the second vector must be the culprit. Either way
// we'll make sure the first vector local is the one that contains the non-ASCII data.

// See comment earlier in the method for an explanation of how the below logic works.
if (Sse41.IsSupported)
{
if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
{
goto FoundNonAsciiDataInFirstVector;
}
}
else
if (VectorContainsNonAsciiChar(firstVector))
{
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInCurrentMask;
}
goto FoundNonAsciiDataInFirstVector;
}

// Wasn't the first vector; must be the second.
Expand All @@ -994,29 +926,48 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin

FoundNonAsciiDataInFirstVector:

// See comment earlier in the method for an explanation of how the below logic works.
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());

FoundNonAsciiDataInCurrentMask:

// See comment earlier in the method accounting for the 0x8000 and 0x0080 bits set after the WORD-sized operations.

currentMask &= NonAsciiDataSeenMask;

// Now, the mask contains - from the LSB - a 0b00 pair for each ASCII char we saw, and a 0b10 pair for each non-ASCII char.
//
// (Keep endianness in mind in the below examples.)
// A non-ASCII char followed by two ASCII chars is 0b..._00_00_10. (tzcnt = 1)
// An ASCII char followed by two non-ASCII chars is 0b..._10_10_00. (tzcnt = 3)
// Two ASCII chars followed by a non-ASCII char is 0b..._10_00_00. (tzcnt = 5)
//
// This means tzcnt = 2 * numLeadingAsciiChars + 1. We can conveniently take advantage of the fact
// that the 2x multiplier already matches the char* stride length, then just subtract 1 at the end to
// compute the correct final ending pointer value.
if (Sse2.IsSupported)
{
// The operation below forces the 0x8000 bit of each WORD to be set iff the WORD element
// has value >= 0x0080 (non-ASCII). Then we'll treat the vector as a BYTE vector in order
// to extract the mask. Reminder: the 0x0080 bit of each WORD should be ignored.
Vector128<ushort> asciiMaskForAddSaturate = Vector128.Create((ushort)0x7F80);
const uint NonAsciiDataSeenMask = 0b_1010_1010_1010_1010; // used for determining whether 'currentMask' contains non-ASCII data

Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask) - 1);
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
currentMask &= NonAsciiDataSeenMask;

// Now, the mask contains - from the LSB - a 0b00 pair for each ASCII char we saw, and a 0b10 pair for each non-ASCII char.
//
// (Keep endianness in mind in the below examples.)
// A non-ASCII char followed by two ASCII chars is 0b..._00_00_10. (tzcnt = 1)
// An ASCII char followed by two non-ASCII chars is 0b..._10_10_00. (tzcnt = 3)
// Two ASCII chars followed by a non-ASCII char is 0b..._10_00_00. (tzcnt = 5)
//
// This means tzcnt = 2 * numLeadingAsciiChars + 1. We can conveniently take advantage of the fact
// that the 2x multiplier already matches the char* stride length, then just subtract 1 at the end to
// compute the correct final ending pointer value.

Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask) - 1);
}
else if (AdvSimd.Arm64.IsSupported)
{
// The following operation sets all the bits in a WORD to 1 where a non-ASCII char is found (otherwise to 0)
// in the vector. Then narrow each char to a byte by taking its top byte. Now the bottom-half (64-bits)
// of the vector contains 0xFF for each non-ASCII char and 0x00 for each ASCII char. We then find the index
// of the first non-ASCII char by counting the trailing zero bits, which represent the ASCII chars before it.

Vector128<ushort> largestAsciiValue = Vector128.Create((ushort)0x007F);
Vector128<byte> compareResult = AdvSimd.CompareGreaterThan(firstVector, largestAsciiValue).AsByte();
ulong asciiCompareMask = AdvSimd.Arm64.UnzipOdd(compareResult, compareResult).AsUInt64().ToScalar();
// The compare mask now contains 8 bits for each 16-bit char. Divide the trailing-zero count by 8 to get the index of the first non-ASCII char.
pBuffer += BitOperations.TrailingZeroCount(asciiCompareMask) >> 3;
}
else
{
throw new PlatformNotSupportedException();
}
goto Finish;

FoundNonAsciiDataInCurrentDWord:
Expand Down Expand Up @@ -1491,7 +1442,7 @@ private static Vector128<byte> ExtractAsciiVector(Vector128<ushort> vectorFirst,

private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
// This method contains logic optimized using vector instructions for both x64 and AArch64.
// This method contains logic optimized using vector instructions for both x64 and Arm64.
// Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support.

// JIT turns the below into constants
Expand Down Expand Up @@ -1591,7 +1542,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer,
FoundNonAsciiDataInLoop:

// Can we at least narrow the high vector?
// See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
// See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works.
if (VectorContainsNonAsciiChar(utf16VectorFirst))
{
goto Finish;
Expand Down

0 comments on commit 85d638b

Please sign in to comment.