-
Notifications
You must be signed in to change notification settings - Fork 4.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Get index of first non ascii byte #39506
Changes from 12 commits
5d33b49
9d4182c
d22d29c
3dca000
49d87f0
6d77419
8f259e6
9dd727c
b8014d9
0152f82
0370ec9
c3008eb
6c12885
576af36
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,6 +42,28 @@ private static bool AllCharsInUInt64AreAscii(ulong value) | |
return (value & ~0x007F007F_007F007Ful) == 0; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128<byte> value, Vector128<byte> bitmask) | ||
{ | ||
if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian) | ||
{ | ||
throw new PlatformNotSupportedException(); | ||
} | ||
|
||
// extractedBits[i] = (value[i] >> 7) & (1 << (12 * (i % 2))); | ||
Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(); | ||
Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitmask); | ||
|
||
// collapse mask to lower bits | ||
extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); | ||
ulong mask = extractedBits.AsUInt64().ToScalar(); | ||
|
||
// calculate the index | ||
int index = BitOperations.TrailingZeroCount(mask) >> 2; | ||
Debug.Assert((mask != 0) ? index < 16 : index >= 16); | ||
return index; | ||
} | ||
|
||
/// <summary> | ||
/// Given a DWORD which represents two packed chars in machine-endian order, | ||
/// <see langword="true"/> iff the first char (in machine-endian order) is ASCII. | ||
|
@@ -67,8 +89,8 @@ public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint buff | |
// pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while | ||
// this method is running. | ||
|
||
return (Sse2.IsSupported) | ||
? GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength) | ||
return (Sse2.IsSupported || AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) | ||
? GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength) | ||
: GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength); | ||
} | ||
|
||
|
@@ -215,17 +237,44 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, n | |
goto Finish; | ||
} | ||
|
||
private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength) | ||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static bool ContainsNonAsciiByte_Sse2(uint sseMask) | ||
{ | ||
Debug.Assert(sseMask != uint.MaxValue); | ||
if (!Sse2.IsSupported) | ||
pgovind marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
throw new PlatformNotSupportedException(); | ||
} | ||
return sseMask != 0; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static bool ContainsNonAsciiByte_AdvSimd(uint advSimdIndex) | ||
{ | ||
Debug.Assert(advSimdIndex != uint.MaxValue); | ||
if (!AdvSimd.IsSupported) | ||
pgovind marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
throw new PlatformNotSupportedException(); | ||
} | ||
return advSimdIndex < 16; | ||
} | ||
|
||
private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuffer, nuint bufferLength) | ||
{ | ||
// JIT turns the below into constants | ||
|
||
uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>(); | ||
nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1); | ||
|
||
Debug.Assert(Sse2.IsSupported, "Should've been checked by caller."); | ||
Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian."); | ||
Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Sse2 or AdvSimd64 required."); | ||
Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 implementation assumes little-endian."); | ||
|
||
Vector128<byte> bitmask = BitConverter.IsLittleEndian ? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm assuming the bitmask is intentionally defined as a local? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You throw There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yup. Declaring it static won't work because of initialization issues in either .NET Standard or .NET Framework (I don't remember which. I just remember @carlossanlop's conclusion)
Yup, the idea is that this part will just work if we start supporting BigEndian environments |
||
Vector128.Create((ushort)0x1001).AsByte() : | ||
Vector128.Create((ushort)0x0110).AsByte(); | ||
|
||
uint currentMask, secondMask; | ||
uint currentSseMask = uint.MaxValue, secondSseMask = uint.MaxValue; | ||
uint currentAdvSimdIndex = uint.MaxValue, secondAdvSimdIndex = uint.MaxValue; | ||
byte* pOriginalBuffer = pBuffer; | ||
|
||
// This method is written such that control generally flows top-to-bottom, avoiding | ||
|
@@ -240,11 +289,25 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin | |
|
||
// Read the first vector unaligned. | ||
|
||
currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load | ||
|
||
if (currentMask != 0) | ||
if (Sse2.IsSupported) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load | ||
if (ContainsNonAsciiByte_Sse2(currentSseMask)) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
} | ||
} | ||
else if (AdvSimd.Arm64.IsSupported) | ||
{ | ||
currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load | ||
if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
} | ||
} | ||
pgovind marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else | ||
{ | ||
throw new PlatformNotSupportedException(); | ||
} | ||
|
||
// If we have less than 32 bytes to process, just go straight to the final unaligned | ||
|
@@ -281,15 +344,33 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin | |
|
||
do | ||
{ | ||
Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer); | ||
Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128); | ||
if (Sse2.IsSupported) | ||
{ | ||
Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer); | ||
Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128); | ||
|
||
currentMask = (uint)Sse2.MoveMask(firstVector); | ||
secondMask = (uint)Sse2.MoveMask(secondVector); | ||
currentSseMask = (uint)Sse2.MoveMask(firstVector); | ||
secondSseMask = (uint)Sse2.MoveMask(secondVector); | ||
if (ContainsNonAsciiByte_Sse2(currentSseMask | secondSseMask)) | ||
{ | ||
goto FoundNonAsciiDataInInnerLoop; | ||
} | ||
} | ||
else if (AdvSimd.Arm64.IsSupported) | ||
{ | ||
Vector128<byte> firstVector = AdvSimd.LoadVector128(pBuffer); | ||
Vector128<byte> secondVector = AdvSimd.LoadVector128(pBuffer + SizeOfVector128); | ||
|
||
if ((currentMask | secondMask) != 0) | ||
currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(firstVector, bitmask); | ||
secondAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(secondVector, bitmask); | ||
if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex) || ContainsNonAsciiByte_AdvSimd(secondAdvSimdIndex)) | ||
{ | ||
goto FoundNonAsciiDataInInnerLoop; | ||
} | ||
} | ||
else | ||
{ | ||
goto FoundNonAsciiDataInInnerLoop; | ||
throw new PlatformNotSupportedException(); | ||
} | ||
|
||
pBuffer += 2 * SizeOfVector128; | ||
|
@@ -313,10 +394,25 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin | |
// At least one full vector's worth of data remains, so we can safely read it. | ||
// Remember, at this point pBuffer is still aligned. | ||
|
||
currentMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer)); | ||
if (currentMask != 0) | ||
if (Sse2.IsSupported) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer)); | ||
if (ContainsNonAsciiByte_Sse2(currentSseMask)) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
} | ||
} | ||
else if (AdvSimd.Arm64.IsSupported) | ||
{ | ||
currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); | ||
if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
} | ||
} | ||
else | ||
{ | ||
throw new PlatformNotSupportedException(); | ||
} | ||
|
||
IncrementCurrentOffsetBeforeFinalUnalignedVectorRead: | ||
|
@@ -332,17 +428,33 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin | |
|
||
pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128; | ||
|
||
currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load | ||
if (currentMask != 0) | ||
if (Sse2.IsSupported) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load | ||
if (ContainsNonAsciiByte_Sse2(currentSseMask)) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
} | ||
|
||
} | ||
else if (AdvSimd.Arm64.IsSupported) | ||
{ | ||
currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load | ||
if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) | ||
{ | ||
goto FoundNonAsciiDataInCurrentMask; | ||
} | ||
|
||
} | ||
else | ||
{ | ||
throw new PlatformNotSupportedException(); | ||
} | ||
|
||
pBuffer += SizeOfVector128; | ||
} | ||
|
||
Finish: | ||
|
||
return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done! | ||
|
||
FoundNonAsciiDataInInnerLoop: | ||
|
@@ -351,20 +463,46 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin | |
// instead be the second mask. If so, skip the entire first mask and drain ASCII bytes | ||
// from the second mask. | ||
|
||
if (currentMask == 0) | ||
if (Sse2.IsSupported) | ||
{ | ||
pBuffer += SizeOfVector128; | ||
currentMask = secondMask; | ||
if (!ContainsNonAsciiByte_Sse2(currentSseMask)) | ||
{ | ||
pBuffer += SizeOfVector128; | ||
currentSseMask = secondSseMask; | ||
} | ||
} | ||
else if (AdvSimd.IsSupported) | ||
{ | ||
if (!ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex)) | ||
{ | ||
pBuffer += SizeOfVector128; | ||
currentAdvSimdIndex = secondSseMask; | ||
pgovind marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
else | ||
{ | ||
throw new PlatformNotSupportedException(); | ||
} | ||
|
||
FoundNonAsciiDataInCurrentMask: | ||
pgovind marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same for this label, it seems like duplicate computations are being made. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmmm, not really. Both the SSE and AdvSimd paths can come here, so we need to special case this label for both paths |
||
// The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte. | ||
pgovind marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't | ||
// available, we'll fall back to a normal loop. | ||
|
||
Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data."); | ||
pBuffer += (uint)BitOperations.TrailingZeroCount(currentMask); | ||
if (Sse2.IsSupported) | ||
{ | ||
Debug.Assert(ContainsNonAsciiByte_Sse2(currentSseMask), "Shouldn't be here unless we see non-ASCII data."); | ||
pBuffer += (uint)BitOperations.TrailingZeroCount(currentSseMask); | ||
} | ||
else if (AdvSimd.Arm64.IsSupported) | ||
{ | ||
Debug.Assert(ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex), "Shouldn't be here unless we see non-ASCII data."); | ||
pBuffer += currentAdvSimdIndex; | ||
} | ||
else | ||
{ | ||
throw new PlatformNotSupportedException(); | ||
} | ||
|
||
goto Finish; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I found a way how to find an index of a first non-ascii byte in
Vector128<byte> value
that doesn't requirebitmask
.Here is a sketch how this can be accomplished.
Here is a link to proof-of-concept code: https://gist.github.com/echesakovMSFT/b27ed28024091472db6d3ca007f34a6d#file-program-cs
The code that the JIT generates for the part computing the index is as follows:
The five `mov`s are redundant and shouldn't be there - I am working on analysing why the JIT emits them (I suspect this is how we handle `BuildDelayFreeUses`). Theoretically, the code should contain only 3 x `sri`, 2 x `xtn`, 1 x `sli`, 1 x `clz` and 1 x `umov` instructions.
@TamarChristinaArm Do you see any issue with this approach? Aside from only supporting little-endian.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This code also allows to compute mask non-ascii bytes by replacing
with
Although, the mask will be reversed in this case
mask.15
correspond tolane[0].7
andmask.0
corresponds tolane[15].7
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd actually like to get this PR in soon, considering our deadline is Friday. I'm thinking I'll file a follow-up issue to investigate this improvement and the improvements suggested in #39507. How does that sound?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@pgovind Sure, to clarify I was not suggesting to incorporate the algorithm right now. I understand that this would be an unreasonable ask.
But I can explain the motivation and how I discovered this approach. I was trying to find a way to avoid doing this in your code,
since a call to `BitOperations.TrailingZeroCount(mask)` is not free, and to use `AdvSimd.LeadingZeroCount` instead. It turns out that I ended up with a completely different approach.
We can follow up on this in .NET 6.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@echesakovMSFT In and of itself the `TrailingZeroCount` isn't that bad, because the integer version of `clz` is twice as fast as the SIMD version of `clz` in your example. So the `rbit+clz` on the general-register side is about as efficient as the `clz` on the SIMD side from your example (looking at a Neoverse-N1).
The example you have, while it avoids having to load the mask, ends up with 7 instructions that are 2 cycles each, and the `sri` and `sli` go down the same pipe so they can't be executed in parallel. So I think if you amortize the cost of the mask, that sequence will end up being slower.
That said, you can improve this sequence further. The issue here isn't the use of a mask, it's the use of a mask that can't be created in a register. Instead take a look at the sequence I pasted in #39507 (comment), which also avoids the `TrailingZeroCount` (on big endian) by using a different mask (which is also created without requiring a load). That is a sequence of 6 cycles and only blocks `V1` once.
But looking at the overall algorithm, the majority of the cases don't require the actual index yet (by which I mean, you only check whether it contains one or not...). The helper function should really end at the `.ToScalar();`, and `ContainsNonAsciiByte_AdvSimd` should really just be `!= 0`.
Also, what is the expected number of loop iterations? If you expect the majority of the characters to be ASCII, then the implementation is not really taking advantage of that.
You can test both vector together without needing a mask. Essentially what you want is a slightly modified version of strlen https://github.com/ARM-software/optimized-routines/blob/224cb5f67b71757b99fe1e10b5a437c17a1d733c/string/aarch64/strlen.S#L144
Now, when the `synd` test succeeds, you know you have an element there, and you have to find which vector the match is in. You can easily do this by testing the low part of `synd`: if `synd` is `x1`, you test `w1` using `tst w1, 0x80808080`; if that succeeds you know it's in `currentAdvSimdIndex`, else in `secondAdvSimdIndex`. After this you implement the index-finding part.
Now you still need the `>> 2`, but at the point you do this you're about to add to the offset, which should allow the JIT to lower the `>> 2` into the `add`.
This avoids needing the mask (it does need some minor tweaking for big-endian), but should be significantly smaller than what is currently being generated and should be a couple of orders of magnitude faster.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@TamarChristinaArm Thank you a lot for your detailed explanation! Going forward we should definitely capture your analysis here and in the other PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
np. Also, FYI, for maximum throughput, instead of `sshr v1.16b, v1.16b, 7` you can interpret it as a signed number and instead do a `cmlt` compare, which has the same latency as `sshr` but isn't restricted to one pipe. That's a slightly better sequence than what I pasted in #39507 (comment) and above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And you do need an `rbit` above for little endian; I'll update the comments. For this case an `LD1` with two regs may be faster than an `LDP`, but that needs some benchmarking; it does make it easier to handle the endian differences.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, we haven't had a chance to finish #33972 in .NET 5 - so, in this case, using `Vector64/128<T>.Zero` would require zeroing a register and using the 3-reg form of `cmlt` - this is why I suggested using `sshr` in the first place.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, well, you can hoist the use of `Vector64/128<T>.Zero` outside the loop so you pay for it only once, and assuming you don't run out of registers it should still be better, as `sshr` is restricted to `V1` while `cmlt` can go in any pipe. So in a non-micro benchmark the `cmlt` sequence would likely finish earlier.