Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement ShuffleUnsafe methods #99596

Closed
wants to merge 54 commits into from
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
4ed9fd8
Initial implementation & tests
hamarb123 Mar 12, 2024
f5dcfed
Update Vector256.cs
hamarb123 Mar 12, 2024
37063ff
Update Vector64.cs
hamarb123 Mar 12, 2024
1c456b0
Fix `values` to `vector`
hamarb123 Mar 12, 2024
32347d3
Update Vector256.cs
hamarb123 Mar 12, 2024
b65b8f3
Apply feedback & fix tests
hamarb123 Mar 12, 2024
3cf87ff
Update Vector512Tests.cs
hamarb123 Mar 12, 2024
fef9edd
Fix new tests
hamarb123 Mar 12, 2024
31136cb
Fix new tests
hamarb123 Mar 13, 2024
bc86f69
Update Vector128Tests.cs
hamarb123 Mar 13, 2024
3c20a8b
Update Vector128Tests.cs
hamarb123 Mar 13, 2024
64ab7c1
Optimise standard shuffle using ShuffleUnsafe
hamarb123 Mar 18, 2024
b2e6885
Compilation fixes
hamarb123 Mar 18, 2024
bd809c1
Fix
hamarb123 Mar 18, 2024
c533e98
Re-implement in JIT
hamarb123 Mar 25, 2024
9dd6cf6
Merge branch 'main' into main12
hamarb123 Mar 25, 2024
d9888bf
Fix mono compile
hamarb123 Mar 25, 2024
f57d52d
Update simd-methods.h
hamarb123 Mar 25, 2024
cde0362
Update gentree.cpp
hamarb123 Mar 25, 2024
510927b
Fix test failures
hamarb123 Mar 25, 2024
8ec52e9
Fix formatting of JIT code
hamarb123 Mar 25, 2024
1e23887
Fix further formatting issues
hamarb123 Mar 25, 2024
aaac9a8
Formatting fixes
hamarb123 Mar 25, 2024
055a1ba
Fix bug
hamarb123 Mar 26, 2024
5a62b6a
Use byte shuffle implementation for short with constants
hamarb123 Mar 26, 2024
090023b
Fix formatting
hamarb123 Mar 26, 2024
465b0a5
Fix formatting
hamarb123 Mar 26, 2024
b028da5
Update gentree.cpp
hamarb123 Mar 26, 2024
08589bd
Fix formatting and compile error
hamarb123 Mar 26, 2024
9868e73
Comment change and implement `ShuffleUnsafe` for `sbyte`
hamarb123 Mar 26, 2024
434ff31
Non-constant indices handling on arm64
hamarb123 Mar 26, 2024
d52deed
Fix missing CLSCompliant attributes
hamarb123 Mar 26, 2024
3737e3e
Fix compile issues
hamarb123 Mar 26, 2024
9a0fec3
Fix compile
hamarb123 Mar 26, 2024
24bfa4f
Copy and paste the methods so I can't get it wrong
hamarb123 Mar 26, 2024
78b6ccf
Fix ref cs for sbyte ShuffleUnsafe overloads
hamarb123 Mar 26, 2024
71804b7
Non-constant indices for xarch
hamarb123 Mar 26, 2024
f67169d
Fix compile errors
hamarb123 Mar 26, 2024
62888b0
Fix formatting and unit tests compile
hamarb123 Mar 26, 2024
40cfed7
Update gentree.cpp
hamarb123 Mar 26, 2024
85db8bb
Fix build issues for tests
hamarb123 Mar 26, 2024
0bf77ff
Fix tests again
hamarb123 Mar 26, 2024
142f7ee
Update Vector128Tests.cs
hamarb123 Mar 26, 2024
64f1ec9
Fix bugs & simplify part of the code
hamarb123 Mar 26, 2024
243d6a0
Fix some bugs & simplify some code
hamarb123 Mar 26, 2024
9f169fa
Fix some bugs
hamarb123 Mar 28, 2024
e038767
Fix formatting
hamarb123 Mar 28, 2024
e5b501b
Merge branch 'main' of https://github.com/dotnet/runtime into main12
hamarb123 Mar 29, 2024
ab94fb3
Fix bugs
hamarb123 Mar 29, 2024
08f4a51
Fix formatting
hamarb123 Mar 29, 2024
cdb928f
Update gentree.cpp
hamarb123 Mar 29, 2024
fb790a3
Fix bug for ShuffleUnsafe when using avx-512
hamarb123 Mar 29, 2024
5658b0f
Update gentree.cpp
hamarb123 Mar 29, 2024
a8490ed
Fix shift usage
hamarb123 Mar 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3243,7 +3243,15 @@ class Compiler
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize);
unsigned simdSize,
bool isUnsafe);

GenTree* gtNewSimdShuffleNodeVariable(var_types type,
GenTree* op1,
GenTree* op2,
CorInfoType simdBaseJitType,
unsigned simdSize,
bool isUnsafe);

GenTree* gtNewSimdSqrtNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize);
Expand Down
273 changes: 246 additions & 27 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24298,8 +24298,148 @@ GenTree* Compiler::gtNewSimdNarrowNode(
#endif // !TARGET_XARCH && !TARGET_ARM64
}

//------------------------------------------------------------------------
// gtNewSimdShuffleNodeVariable: Creates a new simd shuffle node for the case
//    where the indices vector is not a constant
//
// Arguments:
//    type            - The return type of SIMD node being created
//    op1             - The vector whose elements are shuffled
//    op2             - The (non-constant) vector of byte indices
//    simdBaseJitType - The base JIT type of SIMD type of the intrinsic
//    simdSize        - The size of the SIMD type of the intrinsic
//    isUnsafe        - true for ShuffleUnsafe semantics (out-of-range indices give
//                      unspecified per-platform results); false for Shuffle semantics
//                      (out-of-range indices produce zero elements)
//
// Returns:
//    The created shuffle node
//
GenTree* Compiler::gtNewSimdShuffleNodeVariable(
    var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isUnsafe)
{
    assert(IsBaselineSimdIsaSupportedDebugOnly());

    assert(varTypeIsSIMD(type));
    assert(getSIMDTypeForSize(simdSize) == type);

    assert(op1 != nullptr);
    assert(op1->TypeIs(type));

    var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
    assert(op2 != nullptr);
    assert(op2->TypeIs(type));
    assert(!op2->IsVectorConst());

    // Variable-index shuffles are only supported for byte/sbyte elements.
    assert(varTypeIsByte(simdBaseType));

    GenTree* retNode = nullptr;
    GenTree* cnsNode = nullptr;

    size_t elementSize  = genTypeSize(simdBaseType);
    size_t elementCount = simdSize / elementSize;

    assert(elementSize == 1);

#if defined(TARGET_XARCH)
    // Duplicate op2 up front: the !isUnsafe path needs the unmodified indices again
    // later to build the out-of-range zeroing mask.
    GenTree* op2DupSafe = nullptr;
    if (!isUnsafe)
    {
        op2DupSafe = fgMakeMultiUse(&op2);
    }

    if (simdSize == 32)
    {
        assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));

        if (compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL))
        {
            // swap the operands to match the encoding requirements
            retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512VBMI_VL_PermuteVar32x8, simdBaseJitType,
                                               simdSize);
        }
        else
        {
            // TODO-XARCH-CQ: If we have known min/max or set/unset bits for the indices, then we
            // could avoid an Avx2.Shuffle and/or Avx2.Permute2x128 in some cases.

            // Get the indices, but only keep the high bit and bits used for indexing, and xor the
            // cross-lane bit on the high 128-bit lane part of the indices. We begin computing this
            // early as it seems to take the longest to calculate (it can be done in parallel to
            // other operations ideally):
            //   Vector256<byte> indicesXord =
            //       (indices ^ Vector256.Create(Vector128.Create((byte)0), Vector128.Create((byte)0x10)))
            //       & Vector256.Create((byte)0x9F);
            simd_t xorCns = {};
            for (size_t index = 0; index < simdSize; index++)
            {
                xorCns.u8[index] = static_cast<uint8_t>(index & 0x10);
            }
            simd_t xorMaskCns = {};
            for (size_t index = 0; index < simdSize; index++)
            {
                xorMaskCns.u8[index] = 0x9F;
            }
            cnsNode                     = gtNewVconNode(type);
            cnsNode->AsVecCon()->gtSimdVal = xorCns;
            GenTree* indicesXord =
                gtNewSimdBinOpNode(GT_XOR, type, fgMakeMultiUse(&op2), cnsNode, simdBaseJitType, simdSize);
            cnsNode                     = gtNewVconNode(type);
            cnsNode->AsVecCon()->gtSimdVal = xorMaskCns;
            indicesXord = gtNewSimdBinOpNode(GT_AND, type, indicesXord, cnsNode, simdBaseJitType, simdSize);

            // Swap the low and high 128-bit lanes; calculate swap before shuf1 so they can be
            // computed in parallel:
            //   Vector256<byte> swap = Avx2.Permute2x128(vector, vector, 0b00000001);
            uint8_t control = 1;
            cnsNode         = gtNewIconNode(control);
            GenTree* swap   = gtNewSimdHWIntrinsicNode(type, fgMakeMultiUse(&op1), fgMakeMultiUse(&op1), cnsNode,
                                                       NI_AVX2_Permute2x128, simdBaseJitType, simdSize);

            // Shuffle with both the normal and swapped values:
            //   Vector256<byte> shuf1 = Avx2.Shuffle(vector, indices);
            //   Vector256<byte> shuf2 = Avx2.Shuffle(swap, indices);
            GenTree* shuf1 =
                gtNewSimdHWIntrinsicNode(type, op1, fgMakeMultiUse(&op2), NI_AVX2_Shuffle, simdBaseJitType, simdSize);
            GenTree* shuf2 = gtNewSimdHWIntrinsicNode(type, swap, op2, NI_AVX2_Shuffle, simdBaseJitType, simdSize);

            // Compare our modified indices to 0x0F (highest value not swapping lane); we get 0xFF
            // when we are swapping lane and 0x00 otherwise:
            //   Vector256<byte> selection =
            //       Avx2.CompareGreaterThan(indicesXord.AsSByte(), Vector256.Create((sbyte)0x0F)).AsByte();
            simd_t selCns = {};
            for (size_t index = 0; index < simdSize; index++)
            {
                selCns.u8[index] = 0x0F;
            }
            cnsNode                     = gtNewVconNode(type);
            cnsNode->AsVecCon()->gtSimdVal = selCns;
            GenTree* selection = gtNewSimdCmpOpNode(GT_GT, type, indicesXord, cnsNode, CORINFO_TYPE_BYTE, simdSize);

            // Blend our two shuffles based on whether each element swaps lanes or not:
            //   return Avx2.BlendVariable(shuf1, shuf2, selection);
            retNode =
                gtNewSimdHWIntrinsicNode(type, shuf1, shuf2, selection, NI_AVX2_BlendVariable, simdBaseJitType,
                                         simdSize);
        }
    }
    else if (simdSize == 64)
    {
        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512VBMI));

        // swap the operands to match the encoding requirements
        retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512VBMI_PermuteVar64x8, simdBaseJitType, simdSize);
    }
    else
    {
        assert(simdSize == 16);
        assert(compOpportunisticallyDependsOn(InstructionSet_SSSE3));

        retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSSE3_Shuffle, simdBaseJitType, simdSize);
    }

    assert(retNode != nullptr);

    if (!isUnsafe)
    {
        // Create the comparand - which has the element count as every element.
        simd_t cmpCns = {};
        assert((elementCount & 0xFF) == elementCount); // assert we don't lose info with the cast below
        for (size_t index = 0; index < simdSize; index++)
        {
            cmpCns.u8[index] = static_cast<uint8_t>(elementCount);
        }

        // Create the comparand node, the mask node (op2 < comparand), and the
        // result node (mask & unsafeResult), zeroing any out-of-range elements.
        GenTree* comparand             = gtNewVconNode(type);
        comparand->AsVecCon()->gtSimdVal = cmpCns;
        assert(op2DupSafe != nullptr);
        GenTree* mask = gtNewSimdCmpOpNode(GT_LT, type, op2DupSafe, comparand, CORINFO_TYPE_UBYTE, simdSize);
        retNode       = gtNewSimdBinOpNode(GT_AND, type, mask, retNode, simdBaseJitType, simdSize);
    }
    else
    {
        assert(op2DupSafe == nullptr);
    }

    return retNode;
#elif defined(TARGET_ARM64)
    NamedIntrinsic lookupIntrinsic = NI_AdvSimd_VectorTableLookup;

    if (simdSize == 16)
    {
        lookupIntrinsic = NI_AdvSimd_Arm64_VectorTableLookup;

        // NOTE(review): op1 is widened via NI_Vector64_ToVector128 here even though
        // simdSize == 16 implies op1 is already TYP_SIMD16 — confirm this matches the
        // table-lookup operand shape the intrinsic expects.
        op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128, simdBaseJitType, simdSize);
    }

    // VectorTableLookup is only valid on byte/sbyte
    simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

    return gtNewSimdHWIntrinsicNode(type, op1, op2, lookupIntrinsic, simdBaseJitType, simdSize);
#else
#error Unsupported platform
#endif // !TARGET_XARCH && !TARGET_ARM64
}

GenTree* Compiler::gtNewSimdShuffleNode(
var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize)
var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isUnsafe)
{
assert(IsBaselineSimdIsaSupportedDebugOnly());

Expand Down Expand Up @@ -24343,12 +24483,30 @@ GenTree* Compiler::gtNewSimdShuffleNode(
simd_t vecCns = {};
simd_t mskCns = {};

// we want to match ShuffleUnsafe behaviour with constants to with variable,
// therefore we need to make sure we treat where we should get 0s the same as what ShuffleUnsafe uses
size_t maxNonZeroIndex = elementCount; // exclusive
size_t valueMask = static_cast<uint64_t>(elementCount - 1);
if (isUnsafe)
{
if (simdSize == 64 || (simdSize == 32 && compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)))
{
maxNonZeroIndex = 256;
}
else
{
maxNonZeroIndex = 128;
}
}

for (size_t index = 0; index < elementCount; index++)
{
value = op2->GetIntegralVectorConstElement(index, simdBaseType);

if (value < elementCount)
if (value < maxNonZeroIndex)
{
value &= valueMask;

if (simdSize == 32)
{
// Most of the 256-bit shuffle/permute instructions operate as if
Expand Down Expand Up @@ -24407,49 +24565,110 @@ GenTree* Compiler::gtNewSimdShuffleNode(
}
}

//set needsZero to false if this is for ShuffleUnsafe, since we never need to mask
if (isUnsafe) needsZero = false;

if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));

if ((varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)) ||
(varTypeIsShort(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512BW_VL)))
{
if (crossLane)
if (varTypeIsShort(simdBaseType))
{
// TODO-XARCH-CQ: We should emulate cross-lane shuffling for byte/sbyte and short/ushort
unreached();
// TODO-XARCH-CQ: We should emulate cross-lane shuffling for short/ushort
assert(!crossLane);

op2 = gtNewVconNode(type);
op2->AsVecCon()->gtSimdVal = vecCns;

retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX2_Shuffle, simdBaseJitType, simdSize);
}
else
{
assert(varTypeIsByte(simdBaseType));

// If we aren't crossing lanes, then we can decompose the byte/sbyte
// and short/ushort operations into 2x 128-bit operations
uint8_t leftWants = 0; // result left lane wants which lanes bitfield (1 - left, 2 - right)
uint8_t rightWants = 0; // result right lane wants which lanes bitfield (1 - left, 2 - right)
bool nonDefaultShuffleMask = false; // tracks whether any element in vecCns is not the default value: 0->15, 0->15

// We want to build what is essentially the following managed code:
// var op1Lower = op1.GetLower();
// op1Lower = Ssse3.Shuffle(op1Lower, Vector128.Create(...));
//
// var op1Upper = op1.GetUpper();
// op1Upper = Ssse3.Shuffle(op1Upper, Vector128.Create(...));
//
// return Vector256.Create(op1Lower, op1Upper);
simd_t selCns = {};
for (size_t index = 0; index < simdSize; index++)
{
// get pointer to our leftWants/rightWants
uint8_t* wants = nullptr;
if (index < 16) wants = &leftWants;
else wants = &rightWants;

simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
// update our wants based on which values we use
value = op2->GetIntegralVectorConstElement(index, simdBaseType);
if (value < 16) *wants |= 1;
else if (value < 32) *wants |= 2;

GenTree* op1Dup = fgMakeMultiUse(&op1);
GenTree* op1Lower = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseJitType, simdSize);
// update our conditional select mask for if we need 2 shuffles
value ^= static_cast<uint64_t>(index & 0x10);
if (value < 32 && value >= 16) selCns.u8[index] = 0xFF;
else selCns.u8[index] = 0;

op2 = gtNewVconNode(TYP_SIMD16);
op2->AsVecCon()->gtSimd16Val = vecCns.v128[0];
// normalise our shuffle mask, and check if it's default
if (vecCns.u8[index] < 32) vecCns.u8[index] &= 0x0F;
if (vecCns.u8[index] != (index & 0x0F)) nonDefaultShuffleMask = true;
}

// we might be able to get away with only 1 shuffle, this is the case if neither leftWants nor rightWants are 3 (indicating only 0/1 side used)
if (leftWants != 3 && rightWants != 3)
{
// set result to its initial value
retNode = op1;

// create the permutation
uint8_t control = 0;
if (leftWants == 2) control |= 1; // if left wants right lane, then set that bit
if (rightWants != 1) control |= 16; // if right wants right lane (or neither), then set the bit for right lane
if (control != 16) // if we have 16, then we don't need to actually permute, since that's what we start with
{
cnsNode = gtNewIconNode(control);
retNode = gtNewSimdHWIntrinsicNode(type, fgMakeMultiUse(&retNode), retNode, cnsNode, NI_AVX2_Permute2x128, simdBaseJitType, simdSize);
}

op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lower, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16);
// if we have a non-default shuffle mask, we need to do Avx2.Shuffle
if (nonDefaultShuffleMask)
{
op2 = gtNewVconNode(type);
op2->AsVecCon()->gtSimdVal = vecCns;

retNode = gtNewSimdHWIntrinsicNode(type, op1, fgMakeMultiUse(&op2), NI_AVX2_Shuffle, simdBaseJitType, simdSize);
}
}
else
{
// create the control for swapping
uint8_t control = 1; // 0b00000001
cnsNode = gtNewIconNode(control);
GenTree* swap = gtNewSimdHWIntrinsicNode(type, fgMakeMultiUse(&op1), fgMakeMultiUse(&op1), cnsNode, NI_AVX2_Permute2x128, simdBaseJitType, simdSize);

GenTree* op1Upper = gtNewSimdGetUpperNode(TYP_SIMD16, op1Dup, simdBaseJitType, simdSize);
// if we have non-default shuffle mask
if (nonDefaultShuffleMask)
{
// create the shuffle indices node
op2 = gtNewVconNode(type);
op2->AsVecCon()->gtSimdVal = vecCns;

op2 = gtNewVconNode(TYP_SIMD16);
op2->AsVecCon()->gtSimd16Val = vecCns.v128[1];
// shuffle both op1 and swap(op1)
op1 = gtNewSimdHWIntrinsicNode(type, op1, fgMakeMultiUse(&op2), NI_AVX2_Shuffle, simdBaseJitType, simdSize);
swap = gtNewSimdHWIntrinsicNode(type, swap, op2, NI_AVX2_Shuffle, simdBaseJitType, simdSize);
}

op1Upper = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Upper, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16);
// select the appropriate values
GenTree* selNode = gtNewVconNode(type);
selNode->AsVecCon()->gtSimdVal = selCns;
retNode = gtNewConditionalNode(GT_SELECT, selNode, swap, op1, simdBaseType);
}

return gtNewSimdWithUpperNode(type, op1Lower, op1Upper, simdBaseJitType, simdSize);
assert(retNode != nullptr);
return retNode;
}
}

if (elementSize == 4)
Expand Down Expand Up @@ -24565,7 +24784,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
}
else
{
if (needsZero && compOpportunisticallyDependsOn(InstructionSet_SSSE3))
if ((needsZero || (isUnsafe && varTypeIsByte(simdBaseType))) && compOpportunisticallyDependsOn(InstructionSet_SSSE3))
{
simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;

Expand Down
15 changes: 13 additions & 2 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1615,13 +1615,15 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

case NI_Vector64_Shuffle:
case NI_Vector128_Shuffle:
case NI_Vector64_ShuffleUnsafe:
case NI_Vector128_ShuffleUnsafe:
{
assert((sig->numArgs == 2) || (sig->numArgs == 3));
assert((simdSize == 8) || (simdSize == 16));

GenTree* indices = impStackTop(0).val;

if (!indices->IsVectorConst())
if (!varTypeIsByte(simdBaseType) && !indices->IsVectorConst())
{
// TODO-ARM64-CQ: Handling non-constant indices is a bit more complex
break;
Expand All @@ -1632,7 +1634,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType, simdSize);
if (indices->IsVectorConst())
{
retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType, simdSize,
intrinsic == NI_Vector64_ShuffleUnsafe || intrinsic == NI_Vector128_ShuffleUnsafe);
}
else
{
retNode = gtNewSimdShuffleNodeVariable(retType, op1, op2, simdBaseJitType, simdSize,
intrinsic == NI_Vector64_ShuffleUnsafe || intrinsic == NI_Vector128_ShuffleUnsafe);
}
}
break;
}
Expand Down