dotnet · TIHan · Nov 5, 2022 · Oct 13, 2022 · Oct 13, 2022 · Oct 13, 2022
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
@@ -1085,7 +1085,6 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode)
 
     if (immOp != nullptr)
     {
-        // CQ: When possible use LEA for mul by imm 3, 5 or 9
         ssize_t imm = immOp->AsIntConCommon()->IconValue();
 
         if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9)))
@@ -1095,17 +1094,6 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode)
             unsigned int scale = (unsigned int)(imm - 1);
             GetEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->GetRegNum(), rmOp->GetRegNum(), scale, 0);
         }
-        else if (!requiresOverflowCheck && rmOp->isUsedFromReg() && (imm == genFindLowestBit(imm)) && (imm != 0))
-        {
-            // Use shift for constant multiply when legal
-            uint64_t     zextImm     = static_cast<uint64_t>(static_cast<size_t>(imm));
-            unsigned int shiftAmount = genLog2(zextImm);
-
-            // Copy reg src to dest register
-            inst_Mov(targetType, targetReg, rmOp->GetRegNum(), /* canSkip */ true);
-
-            inst_RV_SH(INS_shl, size, targetReg, shiftAmount);
-        }
         else
         {
             // use the 3-op form with immediate
@@ -4456,6 +4444,20 @@ void CodeGen::genCodeForShift(GenTree* tree)
                 GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0);
             }
         }
+        // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will
+        // remove a 'mov'.
+        else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(2) &&
+                 tree->GetRegNum() != operandReg)
+        {
+            GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0);
+        }
+        // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will
+        // remove a 'mov'.
+        else if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(3) &&
+                 tree->GetRegNum() != operandReg)
+        {
+            GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 8, 0);
+        }
         else
         {
             int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();

diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
@@ -319,7 +319,7 @@ class Lowering final : public Phase
     void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode);
 #ifdef TARGET_XARCH
     void LowerPutArgStk(GenTreePutArgStk* putArgStk);
-    GenTree* TryLowerMulToLshSubOrLshAdd(GenTreeOp* node);
+    GenTree* TryLowerMulWithConstant(GenTreeOp* node);
 #endif // TARGET_XARCH
 
     bool TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* parent);

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
@@ -106,7 +106,9 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
 }
 
 //----------------------------------------------------------------------------------------------
-// Lowering::TryLowerMulToLshSubOrLshAdd:
+// Lowering::TryLowerMulWithConstant:
+//    Lowers a tree MUL(X, CNS) to LSH(X, CNS_SHIFT)
+//    or
 //    Lowers a tree MUL(X, CNS) to SUB(LSH(X, CNS_SHIFT), X)
 //    or
 //    Lowers a tree MUL(X, CNS) to ADD(LSH(X, CNS_SHIFT), X)
@@ -119,14 +121,16 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node)
 //
 // Notes:
 //    Performs containment checks on the replacement node if one is created
-GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node)
+GenTree* Lowering::TryLowerMulWithConstant(GenTreeOp* node)
 {
     assert(node->OperIs(GT_MUL));
 
-// We do not do this optimization in X86 as it is not recommended.
-#if TARGET_X86
-    return nullptr;
-#endif // TARGET_X86
+    // Do not do this optimization when MinOpts is enabled, as
+    // it could create more temporary locals that would then need
+    // to be optimized in LSRA.
+    if (comp->opts.MinOpts())
+        return nullptr;
+
     if (!varTypeIsIntegral(node))
         return nullptr;
 
@@ -139,25 +143,36 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node)
     if (op1->isContained() || op2->isContained())
         return nullptr;
 
-    if (!op1->OperIs(GT_LCL_VAR))
-        return nullptr;
-
     if (!op2->IsCnsIntOrI())
         return nullptr;
 
     GenTreeIntConCommon* cns    = op2->AsIntConCommon();
     ssize_t              cnsVal = cns->IconValue();
 
-    // Use GT_LSH if cnsVal is a power of two.
-    // This is handled in codegen.
-    if (isPow2(cnsVal))
-        return nullptr;
-
     // Use GT_LEA if cnsVal is 3, 5, or 9.
-    // This is handled in codegen.
+    // These are handled in codegen.
     if (cnsVal == 3 || cnsVal == 5 || cnsVal == 9)
         return nullptr;
 
+    // Use GT_LSH if cnsVal is a power of two.
+    if (isPow2(cnsVal))
+    {
+        // Use shift for constant multiply when legal
+        unsigned int shiftAmount = genLog2(static_cast<uint64_t>(static_cast<size_t>(cnsVal)));
+
+        cns->SetIconValue(shiftAmount);
+        node->ChangeOper(GT_LSH);
+
+        ContainCheckShiftRotate(node);
+
+        return node;
+    }
+
+// We do not do this optimization in X86 as it is not recommended.
+#if TARGET_X86
+    return nullptr;
+#endif // TARGET_X86
+
     ssize_t cnsValPlusOne  = cnsVal + 1;
     ssize_t cnsValMinusOne = cnsVal - 1;
 
@@ -166,6 +181,9 @@ GenTree* Lowering::TryLowerMulToLshSubOrLshAdd(GenTreeOp* node)
     if (!useSub && !isPow2(cnsValMinusOne))
         return nullptr;
 
+    LIR::Use op1Use(BlockRange(), &node->gtOp1, node);
+    op1 = ReplaceWithLclVar(op1Use);
+
     if (useSub)
     {
         cnsVal = cnsValPlusOne;
@@ -213,7 +231,7 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul)
 
     if (mul->OperIs(GT_MUL))
     {
-        GenTree* replacementNode = TryLowerMulToLshSubOrLshAdd(mul);
+        GenTree* replacementNode = TryLowerMulWithConstant(mul);
         if (replacementNode != nullptr)
         {
             return replacementNode->gtNext;

diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.cs b/src/tests/JIT/opt/Multiply/IntMultiply.cs
@@ -59,8 +59,7 @@ static ulong UInt64_MultiplyWith3(ulong value)
         [MethodImpl(MethodImplOptions.NoInlining)]
         static ulong UInt64_MultiplyWith4(ulong value)
         {
-            // X64:      mov [[REG0:[a-z]+]], [[REG1:[a-z]+]]
-            // X64-NEXT: shl [[REG0]], 2
+            // X64: lea [[REG0:[a-z]+]], {{\[}}4*[[REG1:[a-z]+]]{{\]}}
             return value * 4;
         }
 
@@ -82,17 +81,15 @@ static ulong UInt64_MultiplyWith6(ulong value)
         [MethodImpl(MethodImplOptions.NoInlining)]
         static ulong UInt64_MultiplyWith7(ulong value)
         {
-            // X64:      mov [[REG0:[a-z]+]], [[REG1:[a-z]+]]
-            // X64-NEXT: shl [[REG0]], 3
+            // X64:      lea [[REG0:[a-z]+]], {{\[}}8*[[REG1]]{{\]}}
             // X64-NEXT: sub [[REG0]], [[REG1]]
             return value * 7;
         }
 
         [MethodImpl(MethodImplOptions.NoInlining)]
         static ulong UInt64_MultiplyWith8(ulong value)
         {
-            // X64:      mov [[REG0:[a-z]+]], [[REG1:[a-z]+]]
-            // X64-NEXT: shl [[REG0]], 3
+            // X64: lea [[REG0:[a-z]+]], {{\[}}8*[[REG1]]{{\]}}
             return value * 8;
         }
 
@@ -103,9 +100,54 @@ static ulong UInt64_MultiplyWith9(ulong value)
             return value * 9;
         }
 
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith10(ulong value)
+        {
+            // X64:      lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}}
+            // X64-NEXT: add [[REG0]], [[REG0]]
+            return value * 10;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith11(ulong value)
+        {
+            // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.
+
+            // X64: imul
+            return value * 11;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith12(ulong value)
+        {
+            // X64:      lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+2*[[REG1]]{{\]}}
+            // X64-NEXT: shl [[REG0]], 2
+            return value * 12;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith13(ulong value)
+        {
+            // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.
+
+            // X64: imul
+            return value * 13;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith14(ulong value)
+        {
+            // We expect 'imul' since the alternative replacement sequence would require 4 instructions which is too slow.
+
+            // X64: imul
+            return value * 14;
+        }
+
         [MethodImpl(MethodImplOptions.NoInlining)]
         static ulong UInt64_MultiplyWith15(ulong value)
         {
+            // We expect these instructions since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.
+
             // X64:      mov [[REG0:[a-z]+]], [[REG1:[a-z]+]]
             // X64-NEXT: shl [[REG0]], 4
             // X64-NEXT: sub [[REG0]], [[REG1]]
@@ -129,6 +171,92 @@ static ulong UInt64_MultiplyWith17(ulong value)
             return value * 17;
         }
 
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith18(ulong value)
+        {
+            // X64:      lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+8*[[REG1]]{{\]}}
+            // X64-NEXT: add [[REG0]], [[REG0]]
+            return value * 18;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith19(ulong value)
+        {
+            // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.
+
+            // X64: imul
+            return value * 19;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith20(ulong value)
+        {
+            // X64:      lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}}
+            // X64-NEXT: shl [[REG0]], 2
+            return value * 20;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith21(ulong value)
+        {
+            // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.
+
+            // X64: imul
+            return value * 21;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith22(ulong value)
+        {
+            // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions and 1 ADD instruction which is slower.
+
+            // X64: imul
+            return value * 22;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith23(ulong value)
+        {
+            // We expect 'imul' since the alternative replacement sequence would require 1 three-component LEA instruction, 1 SHL instruction, and 1 ADD instruction, which is slower.
+
+            // X64: imul
+            return value * 23;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith24(ulong value)
+        {
+            // X64:      lea [[REG0:[a-z]+]], {{\[}}[[REG1:[a-z]+]]+4*[[REG1]]{{\]}}
+            // X64-NEXT: shl [[REG0]], 3
+            return value * 24;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith25(ulong value)
+        {
+            // We expect 'imul' since the alternative replacement sequence would require 2 three-component LEA instructions which is slower.
+
+            // X64: imul
+            return value * 25;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static ulong UInt64_MultiplyWith5_AddressExposed(ulong value)
+        {
+            // X64:      mov [[REG0:[a-z]+]], qword ptr
+            // X64-NOT:  mov
+            // X64-NEXT: lea [[REG1:[a-z]+]], {{\[}}[[REG0:[a-z]+]]+4*[[REG0]]{{\]}}
+            var value2 = value * 5;
+            UInt64_AddressExposed(ref value);
+            return value2;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        static void UInt64_AddressExposed(ref ulong value)
+        {
+
+        }
+
         static int Main()
         {
             if (UInt32_MultiplyWithUInt32MaxValue(1) != UInt32.MaxValue)
@@ -167,6 +295,21 @@ static int Main()
             if (UInt64_MultiplyWith9(1) != 9)
                 return 0;
 
+            if (UInt64_MultiplyWith10(1) != 10)
+                return 0;
+
+            if (UInt64_MultiplyWith11(1) != 11)
+                return 0;
+
+            if (UInt64_MultiplyWith12(1) != 12)
+                return 0;
+
+            if (UInt64_MultiplyWith13(1) != 13)
+                return 0;
+
+            if (UInt64_MultiplyWith14(1) != 14)
+                return 0;
+
             if (UInt64_MultiplyWith15(1) != 15)
                 return 0;
 
@@ -176,6 +319,33 @@ static int Main()
             if (UInt64_MultiplyWith17(1) != 17)
                 return 0;
 
+            if (UInt64_MultiplyWith18(1) != 18)
+                return 0;
+
+            if (UInt64_MultiplyWith19(1) != 19)
+                return 0;
+
+            if (UInt64_MultiplyWith20(1) != 20)
+                return 0;
+
+            if (UInt64_MultiplyWith21(1) != 21)
+                return 0;
+
+            if (UInt64_MultiplyWith22(1) != 22)
+                return 0;
+
+            if (UInt64_MultiplyWith23(1) != 23)
+                return 0;
+
+            if (UInt64_MultiplyWith24(1) != 24)
+                return 0;
+
+            if (UInt64_MultiplyWith25(1) != 25)
+                return 0;
+
+            if (UInt64_MultiplyWith5_AddressExposed(1) != 5)
+                return 0;
+
             return 100;
         }
     }

diff --git a/src/tests/JIT/opt/Multiply/IntMultiply.csproj b/src/tests/JIT/opt/Multiply/IntMultiply.csproj
@@ -11,7 +11,7 @@
       <HasDisasmCheck>true</HasDisasmCheck>
     </Compile>
 
-    <CLRTestEnvironmentVariable Include="COMPlus_TieredCompilation" Value="0" />
-    <CLRTestEnvironmentVariable Include="COMPlus_JITMinOpts" Value="0" />
+    <CLRTestEnvironmentVariable Include="DOTNET_TieredCompilation" Value="0" />
+    <CLRTestEnvironmentVariable Include="DOTNET_JITMinOpts" Value="0" />
   </ItemGroup>
 </Project>