Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement ShuffleUnsafe methods #99596

Closed
wants to merge 54 commits into from
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
4ed9fd8
Initial implementation & tests
hamarb123 Mar 12, 2024
f5dcfed
Update Vector256.cs
hamarb123 Mar 12, 2024
37063ff
Update Vector64.cs
hamarb123 Mar 12, 2024
1c456b0
Fix `values` to `vector`
hamarb123 Mar 12, 2024
32347d3
Update Vector256.cs
hamarb123 Mar 12, 2024
b65b8f3
Apply feedback & fix tests
hamarb123 Mar 12, 2024
3cf87ff
Update Vector512Tests.cs
hamarb123 Mar 12, 2024
fef9edd
Fix new tests
hamarb123 Mar 12, 2024
31136cb
Fix new tests
hamarb123 Mar 13, 2024
bc86f69
Update Vector128Tests.cs
hamarb123 Mar 13, 2024
3c20a8b
Update Vector128Tests.cs
hamarb123 Mar 13, 2024
64ab7c1
Optimise standard shuffle using ShuffleUnsafe
hamarb123 Mar 18, 2024
b2e6885
Compilation fixes
hamarb123 Mar 18, 2024
bd809c1
Fix
hamarb123 Mar 18, 2024
c533e98
Re-implement in JIT
hamarb123 Mar 25, 2024
9dd6cf6
Merge branch 'main' into main12
hamarb123 Mar 25, 2024
d9888bf
Fix mono compile
hamarb123 Mar 25, 2024
f57d52d
Update simd-methods.h
hamarb123 Mar 25, 2024
cde0362
Update gentree.cpp
hamarb123 Mar 25, 2024
510927b
Fix test failures
hamarb123 Mar 25, 2024
8ec52e9
Fix formatting of JIT code
hamarb123 Mar 25, 2024
1e23887
Fix further formatting issues
hamarb123 Mar 25, 2024
aaac9a8
Formatting fixes
hamarb123 Mar 25, 2024
055a1ba
Fix bug
hamarb123 Mar 26, 2024
5a62b6a
Use byte shuffle implementation for short with constants
hamarb123 Mar 26, 2024
090023b
Fix formatting
hamarb123 Mar 26, 2024
465b0a5
Fix formatting
hamarb123 Mar 26, 2024
b028da5
Update gentree.cpp
hamarb123 Mar 26, 2024
08589bd
Fix formatting and compile error
hamarb123 Mar 26, 2024
9868e73
Comment change and implement `ShuffleUnsafe` for `sbyte`
hamarb123 Mar 26, 2024
434ff31
Non-constant indices handling on arm64
hamarb123 Mar 26, 2024
d52deed
Fix missing CLSCompliant attributes
hamarb123 Mar 26, 2024
3737e3e
Fix compile issues
hamarb123 Mar 26, 2024
9a0fec3
Fix compile
hamarb123 Mar 26, 2024
24bfa4f
Copy and paste the methods so I can't get it wrong
hamarb123 Mar 26, 2024
78b6ccf
Fix ref cs for sbyte ShuffleUnsafe overloads
hamarb123 Mar 26, 2024
71804b7
Non-constant indices for xarch
hamarb123 Mar 26, 2024
f67169d
Fix compile errors
hamarb123 Mar 26, 2024
62888b0
Fix formatting and unit tests compile
hamarb123 Mar 26, 2024
40cfed7
Update gentree.cpp
hamarb123 Mar 26, 2024
85db8bb
Fix build issues for tests
hamarb123 Mar 26, 2024
0bf77ff
Fix tests again
hamarb123 Mar 26, 2024
142f7ee
Update Vector128Tests.cs
hamarb123 Mar 26, 2024
64f1ec9
Fix bugs & simplify part of the code
hamarb123 Mar 26, 2024
243d6a0
Fix some bugs & simplify some code
hamarb123 Mar 26, 2024
9f169fa
Fix some bugs
hamarb123 Mar 28, 2024
e038767
Fix formatting
hamarb123 Mar 28, 2024
e5b501b
Merge branch 'main' of https://github.com/dotnet/runtime into main12
hamarb123 Mar 29, 2024
ab94fb3
Fix bugs
hamarb123 Mar 29, 2024
08f4a51
Fix formatting
hamarb123 Mar 29, 2024
cdb928f
Update gentree.cpp
hamarb123 Mar 29, 2024
fb790a3
Fix bug for ShuffleUnsafe when using avx-512
hamarb123 Mar 29, 2024
5658b0f
Update gentree.cpp
hamarb123 Mar 29, 2024
a8490ed
Fix shift usage
hamarb123 Mar 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3243,7 +3243,15 @@ class Compiler
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize);
unsigned simdSize,
bool isUnsafe);

GenTree* gtNewSimdShuffleNodeVariable(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isUnsafe);

GenTree* gtNewSimdSqrtNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize);
Expand Down
273 changes: 246 additions & 27 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24298,8 +24298,148 @@ GenTree* Compiler::gtNewSimdNarrowNode(
#endif // !TARGET_XARCH && !TARGET_ARM64
}

//------------------------------------------------------------------------
// gtNewSimdShuffleNodeVariable: Creates a new simd shuffle node for the case
//    where the indices vector is not a constant
//
// Arguments:
//    type            - The return type of SIMD node being created
//    op1             - The vector whose elements are shuffled
//    op2             - The (non-constant) vector of byte indices
//    simdBaseJitType - The base JIT type of SIMD type of the intrinsic
//    simdSize        - The size of the SIMD type of the intrinsic
//    isUnsafe        - true for ShuffleUnsafe semantics (out-of-range indices give
//                      unspecified per-platform results); false for Shuffle semantics
//                      (out-of-range indices produce zero elements)
//
// Returns:
//    The created shuffle node
//
GenTree* Compiler::gtNewSimdShuffleNodeVariable(
    var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isUnsafe)
{
    assert(IsBaselineSimdIsaSupportedDebugOnly());

    assert(varTypeIsSIMD(type));
    assert(getSIMDTypeForSize(simdSize) == type);

    assert(op1 != nullptr);
    assert(op1->TypeIs(type));

    var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
    assert(op2 != nullptr);
    assert(op2->TypeIs(type));
    assert(!op2->IsVectorConst());

    // Variable-index shuffles are only supported for byte/sbyte elements.
    assert(varTypeIsByte(simdBaseType));

    GenTree* retNode = nullptr;
    GenTree* cnsNode = nullptr;

    size_t elementSize  = genTypeSize(simdBaseType);
    size_t elementCount = simdSize / elementSize;

    assert(elementSize == 1);

#if defined(TARGET_XARCH)
    // Duplicate op2 up front: the !isUnsafe path needs the unmodified indices again
    // later to build the out-of-range zeroing mask.
    GenTree* op2DupSafe = nullptr;
    if (!isUnsafe)
    {
        op2DupSafe = fgMakeMultiUse(&op2);
    }

    if (simdSize == 32)
    {
        assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));

        if (compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL))
        {
            // swap the operands to match the encoding requirements
            retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512VBMI_VL_PermuteVar32x8, simdBaseJitType,
                                               simdSize);
        }
        else
        {
            // TODO-XARCH-CQ: If we have known min/max or set/unset bits for the indices, then we
            // could avoid an Avx2.Shuffle and/or Avx2.Permute2x128 in some cases.

            // Get the indices, but only keep the high bit and bits used for indexing, and xor the
            // cross-lane bit on the high 128-bit lane part of the indices. We begin computing this
            // early as it seems to take the longest to calculate (it can be done in parallel to
            // other operations ideally):
            //   Vector256<byte> indicesXord =
            //       (indices ^ Vector256.Create(Vector128.Create((byte)0), Vector128.Create((byte)0x10)))
            //       & Vector256.Create((byte)0x9F);
            simd_t xorCns = {};
            for (size_t index = 0; index < simdSize; index++)
            {
                xorCns.u8[index] = static_cast<uint8_t>(index & 0x10);
            }
            simd_t xorMaskCns = {};
            for (size_t index = 0; index < simdSize; index++)
            {
                xorMaskCns.u8[index] = 0x9F;
            }
            cnsNode                     = gtNewVconNode(type);
            cnsNode->AsVecCon()->gtSimdVal = xorCns;
            GenTree* indicesXord =
                gtNewSimdBinOpNode(GT_XOR, type, fgMakeMultiUse(&op2), cnsNode, simdBaseJitType, simdSize);
            cnsNode                     = gtNewVconNode(type);
            cnsNode->AsVecCon()->gtSimdVal = xorMaskCns;
            indicesXord = gtNewSimdBinOpNode(GT_AND, type, indicesXord, cnsNode, simdBaseJitType, simdSize);

            // Swap the low and high 128-bit lanes; calculate swap before shuf1 so they can be
            // computed in parallel:
            //   Vector256<byte> swap = Avx2.Permute2x128(vector, vector, 0b00000001);
            uint8_t control = 1;
            cnsNode         = gtNewIconNode(control);
            GenTree* swap   = gtNewSimdHWIntrinsicNode(type, fgMakeMultiUse(&op1), fgMakeMultiUse(&op1), cnsNode,
                                                       NI_AVX2_Permute2x128, simdBaseJitType, simdSize);

            // Shuffle with both the normal and swapped values:
            //   Vector256<byte> shuf1 = Avx2.Shuffle(vector, indices);
            //   Vector256<byte> shuf2 = Avx2.Shuffle(swap, indices);
            GenTree* shuf1 =
                gtNewSimdHWIntrinsicNode(type, op1, fgMakeMultiUse(&op2), NI_AVX2_Shuffle, simdBaseJitType, simdSize);
            GenTree* shuf2 = gtNewSimdHWIntrinsicNode(type, swap, op2, NI_AVX2_Shuffle, simdBaseJitType, simdSize);

            // Compare our modified indices to 0x0F (highest value not swapping lane); we get 0xFF
            // when we are swapping lane and 0x00 otherwise:
            //   Vector256<byte> selection =
            //       Avx2.CompareGreaterThan(indicesXord.AsSByte(), Vector256.Create((sbyte)0x0F)).AsByte();
            simd_t selCns = {};
            for (size_t index = 0; index < simdSize; index++)
            {
                selCns.u8[index] = 0x0F;
            }
            cnsNode                     = gtNewVconNode(type);
            cnsNode->AsVecCon()->gtSimdVal = selCns;
            GenTree* selection = gtNewSimdCmpOpNode(GT_GT, type, indicesXord, cnsNode, CORINFO_TYPE_BYTE, simdSize);

            // Blend our two shuffles based on whether each element swaps lanes or not:
            //   return Avx2.BlendVariable(shuf1, shuf2, selection);
            retNode =
                gtNewSimdHWIntrinsicNode(type, shuf1, shuf2, selection, NI_AVX2_BlendVariable, simdBaseJitType,
                                         simdSize);
        }
    }
    else if (simdSize == 64)
    {
        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512VBMI));

        // swap the operands to match the encoding requirements
        retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512VBMI_PermuteVar64x8, simdBaseJitType, simdSize);
    }
    else
    {
        assert(simdSize == 16);
        assert(compOpportunisticallyDependsOn(InstructionSet_SSSE3));

        retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSSE3_Shuffle, simdBaseJitType, simdSize);
    }

    assert(retNode != nullptr);

    if (!isUnsafe)
    {
        // Create the comparand - which has the element count as every element.
        simd_t cmpCns = {};
        assert((elementCount & 0xFF) == elementCount); // assert we don't lose info with the cast below
        for (size_t index = 0; index < simdSize; index++)
        {
            cmpCns.u8[index] = static_cast<uint8_t>(elementCount);
        }

        // Create the comparand node, the mask node (op2 < comparand), and the
        // result node (mask & unsafeResult), zeroing any out-of-range elements.
        GenTree* comparand             = gtNewVconNode(type);
        comparand->AsVecCon()->gtSimdVal = cmpCns;
        assert(op2DupSafe != nullptr);
        GenTree* mask = gtNewSimdCmpOpNode(GT_LT, type, op2DupSafe, comparand, CORINFO_TYPE_UBYTE, simdSize);
        retNode       = gtNewSimdBinOpNode(GT_AND, type, mask, retNode, simdBaseJitType, simdSize);
    }
    else
    {
        assert(op2DupSafe == nullptr);
    }

    return retNode;
#elif defined(TARGET_ARM64)
    NamedIntrinsic lookupIntrinsic = NI_AdvSimd_VectorTableLookup;

    if (simdSize == 16)
    {
        lookupIntrinsic = NI_AdvSimd_Arm64_VectorTableLookup;

        // NOTE(review): op1 is widened via NI_Vector64_ToVector128 here even though
        // simdSize == 16 implies op1 is already TYP_SIMD16 — confirm this matches the
        // table-lookup operand shape the intrinsic expects.
        op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128, simdBaseJitType, simdSize);
    }

    // VectorTableLookup is only valid on byte/sbyte
    simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

    return gtNewSimdHWIntrinsicNode(type, op1, op2, lookupIntrinsic, simdBaseJitType, simdSize);
#else
#error Unsupported platform
#endif // !TARGET_XARCH && !TARGET_ARM64
}

GenTree* Compiler::gtNewSimdShuffleNode(
var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize)
var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isUnsafe)
{
assert(IsBaselineSimdIsaSupportedDebugOnly());

Expand Down Expand Up @@ -24343,12 +24483,30 @@ GenTree* Compiler::gtNewSimdShuffleNode(
simd_t vecCns = {};
simd_t mskCns = {};

// we want to match ShuffleUnsafe behaviour with constants to with variable,
// therefore we need to make sure we treat where we should get 0s the same as what ShuffleUnsafe uses
size_t maxNonZeroIndex = elementCount; // exclusive
size_t valueMask = static_cast<uint64_t>(elementCount - 1);
if (isUnsafe)
{
if (simdSize == 64 || (simdSize == 32 && compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)))
{
maxNonZeroIndex = 256;
}
else
{
maxNonZeroIndex = 128;
}
}

for (size_t index = 0; index < elementCount; index++)
{
value = op2->GetIntegralVectorConstElement(index, simdBaseType);

if (value < elementCount)
if (value < maxNonZeroIndex)
{
value &= valueMask;

if (simdSize == 32)
{
// Most of the 256-bit shuffle/permute instructions operate as if
Expand Down Expand Up @@ -24407,49 +24565,110 @@ GenTree* Compiler::gtNewSimdShuffleNode(
}
}

//set needsZero to false if this is for ShuffleUnsafe, since we never need to mask
if (isUnsafe) needsZero = false;

if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));

if ((varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)) ||
(varTypeIsShort(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512BW_VL)))
{
if (crossLane)
if (varTypeIsShort(simdBaseType))
{
// TODO-XARCH-CQ: We should emulate cross-lane shuffling for byte/sbyte and short/ushort
unreached();
// TODO-XARCH-CQ: We should emulate cross-lane shuffling for short/ushort
assert(!crossLane);

op2 = gtNewVconNode(type);
op2->AsVecCon()->gtSimdVal = vecCns;

retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX2_Shuffle, simdBaseJitType, simdSize);
}
else
{
assert(varTypeIsByte(simdBaseType));

// If we aren't crossing lanes, then we can decompose the byte/sbyte
// and short/ushort operations into 2x 128-bit operations
uint8_t leftWants = 0; // result left lane wants which lanes bitfield (1 - left, 2 - right)
uint8_t rightWants = 0; // result right lane wants which lanes bitfield (1 - left, 2 - right)
bool nonDefaultShuffleMask = false; // tracks whether any element in vecCns is not the default value: 0->15, 0->15

// We want to build what is essentially the following managed code:
// var op1Lower = op1.GetLower();
// op1Lower = Ssse3.Shuffle(op1Lower, Vector128.Create(...));
//
// var op1Upper = op1.GetUpper();
// op1Upper = Ssse3.Shuffle(op1Upper, Vector128.Create(...));
//
// return Vector256.Create(op1Lower, op1Upper);
simd_t selCns = {};
for (size_t index = 0; index < simdSize; index++)
{
// get pointer to our leftWants/rightWants
uint8_t* wants = nullptr;
if (index < 16) wants = &leftWants;
else wants = &rightWants;

simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
// update our wants based on which values we use
value = op2->GetIntegralVectorConstElement(index, simdBaseType);
if (value < 16) *wants |= 1;
else if (value < 32) *wants |= 2;

GenTree* op1Dup = fgMakeMultiUse(&op1);
GenTree* op1Lower = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseJitType, simdSize);
// update our conditional select mask for if we need 2 shuffles
value ^= static_cast<uint64_t>(index & 0x10);
if (value < 32 && value >= 16) selCns.u8[index] = 0xFF;
else selCns.u8[index] = 0;

op2 = gtNewVconNode(TYP_SIMD16);
op2->AsVecCon()->gtSimd16Val = vecCns.v128[0];
// normalise our shuffle mask, and check if it's default
if (vecCns.u8[index] < 32) vecCns.u8[index] &= 0x0F;
if (vecCns.u8[index] != (index & 0x0F)) nonDefaultShuffleMask = true;
}

// we might be able to get away with only 1 shuffle, this is the case if neither leftWants nor rightWants are 3 (indicating only 0/1 side used)
if (leftWants != 3 && rightWants != 3)
{
// set result to its initial value
retNode = op1;

// create the permutation
uint8_t control = 0;
if (leftWants == 2) control |= 1; // if left wants right lane, then set that bit
if (rightWants != 1) control |= 16; // if right wants right lane (or neither), then set the bit for right lane
if (control != 16) // if we have 16, then we don't need to actually permute, since that's what we start with
{
cnsNode = gtNewIconNode(control);
retNode = gtNewSimdHWIntrinsicNode(type, fgMakeMultiUse(&retNode), retNode, cnsNode, NI_AVX2_Permute2x128, simdBaseJitType, simdSize);
}

op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lower, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16);
// if we have a non-default shuffle mask, we need to do Avx2.Shuffle
if (nonDefaultShuffleMask)
{
op2 = gtNewVconNode(type);
op2->AsVecCon()->gtSimdVal = vecCns;

retNode = gtNewSimdHWIntrinsicNode(type, op1, fgMakeMultiUse(&op2), NI_AVX2_Shuffle, simdBaseJitType, simdSize);
}
}
else
{
// create the control for swapping
uint8_t control = 1; // 0b00000001
cnsNode = gtNewIconNode(control);
GenTree* swap = gtNewSimdHWIntrinsicNode(type, fgMakeMultiUse(&op1), fgMakeMultiUse(&op1), cnsNode, NI_AVX2_Permute2x128, simdBaseJitType, simdSize);

GenTree* op1Upper = gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseJitType, simdSize);
// if we have non-default shuffle mask
if (nonDefaultShuffleMask)
{
// create the shuffle indices node
op2 = gtNewVconNode(type);
op2->AsVecCon()->gtSimdVal = vecCns;

op2 = gtNewVconNode(TYP_SIMD16);
op2->AsVecCon()->gtSimd16Val = vecCns.v128[1];
// shuffle both op1 and swap(op1)
op1 = gtNewSimdHWIntrinsicNode(type, op1, fgMakeMultiUse(&op2), NI_AVX2_Shuffle, simdBaseJitType, simdSize);
swap = gtNewSimdHWIntrinsicNode(type, swap, op2, NI_AVX2_Shuffle, simdBaseJitType, simdSize);
}

op1Upper = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Upper, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16);
// select the appropriate values
GenTree* selNode = gtNewVconNode(type);
selNode->AsVecCon()->gtSimdVal = selCns;
retNode = gtNewConditionalNode(GT_SELECT, selNode, swap, op1, simdBaseType);
}

return gtNewSimdWithUpperNode(type, op1Lower, op1Upper, simdBaseJitType, simdSize);
assert(retNode != nullptr);
return retNode;
}
}

if (elementSize == 4)
Expand Down Expand Up @@ -24565,7 +24784,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
}
else
{
if (needsZero && compOpportunisticallyDependsOn(InstructionSet_SSSE3))
if ((needsZero || (isUnsafe && varTypeIsByte(simdBaseType))) && compOpportunisticallyDependsOn(InstructionSet_SSSE3))
{
simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

Expand Down
15 changes: 13 additions & 2 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1615,13 +1615,15 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

case NI_Vector64_Shuffle:
case NI_Vector128_Shuffle:
case NI_Vector64_ShuffleUnsafe:
case NI_Vector128_ShuffleUnsafe:
{
assert((sig->numArgs == 2) || (sig->numArgs == 3));
assert((simdSize == 8) || (simdSize == 16));

GenTree* indices = impStackTop(0).val;

if (!indices->IsVectorConst())
if (!varTypeIsByte(simdBaseType) && !indices->IsVectorConst())
{
// TODO-ARM64-CQ: Handling non-constant indices is a bit more complex
break;
Expand All @@ -1632,7 +1634,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType, simdSize);
if (indices->IsVectorConst())
{
retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType, simdSize,
intrinsic == NI_Vector64_ShuffleUnsafe || intrinsic == NI_Vector128_ShuffleUnsafe);
}
else
{
retNode = gtNewSimdShuffleNodeVariable(retType, op1, op2, simdBaseJitType, simdSize,
intrinsic == NI_Vector64_ShuffleUnsafe || intrinsic == NI_Vector128_ShuffleUnsafe);
}
}
break;
}
Expand Down