Merge pull request #12661 from Sintendo/arm64divwux

JitArm64: Optimize divwux
dolphin-emu · Mar 29, 2024 · commit 5f6a054
2 parents: 5d57a82 + 2580837
Show file tree

Hide file tree

Showing 5 changed files with 163 additions and 40 deletions.
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -1451,12 +1451,10 @@ void Jit64::divwux(UGeckoInstruction inst)
     }
     else
     {
-      u32 shift = 31;
-      while (!(divisor & (1 << shift)))
-        shift--;
-
-      if (divisor == (u32)(1 << shift))
+      if (MathUtil::IsPow2(divisor))
       {
+        u32 shift = MathUtil::IntLog2(divisor);
+
         RCOpArg Ra = gpr.Use(a, RCMode::Read);
         RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
         RegCache::Realize(Ra, Rd);
@@ -1468,24 +1466,22 @@ void Jit64::divwux(UGeckoInstruction inst)
       }
       else
       {
-        u64 magic_dividend = 0x100000000ULL << shift;
-        u32 magic = (u32)(magic_dividend / divisor);
-        u32 max_quotient = magic >> shift;
+        UnsignedMagic m = UnsignedDivisionConstants(divisor);
 
         // Test for failure in round-up method
-        if (((u64)(magic + 1) * (max_quotient * divisor - 1)) >> (shift + 32) != max_quotient - 1)
+        if (!m.fast)
         {
           // If failed, use slower round-down method
           RCOpArg Ra = gpr.Use(a, RCMode::Read);
           RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
           RegCache::Realize(Ra, Rd);
 
-          MOV(32, R(RSCRATCH), Imm32(magic));
+          MOV(32, R(RSCRATCH), Imm32(m.multiplier));
           if (d != a)
             MOV(32, Rd, Ra);
           IMUL(64, Rd, R(RSCRATCH));
           ADD(64, Rd, R(RSCRATCH));
-          SHR(64, Rd, Imm8(shift + 32));
+          SHR(64, Rd, Imm8(m.shift + 32));
         }
         else
         {
@@ -1494,32 +1490,23 @@ void Jit64::divwux(UGeckoInstruction inst)
           RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
           RegCache::Realize(Ra, Rd);
 
-          magic++;
-
-          // Use smallest magic number and shift amount possible
-          while ((magic & 1) == 0 && shift > 0)
-          {
-            magic >>= 1;
-            shift--;
-          }
-
           // Three-operand IMUL sign extends the immediate to 64 bits, so we may only
           // use it when the magic number has its most significant bit set to 0
-          if ((magic & 0x80000000) == 0)
+          if ((m.multiplier & 0x80000000) == 0)
           {
-            IMUL(64, Rd, Ra, Imm32(magic));
+            IMUL(64, Rd, Ra, Imm32(m.multiplier));
           }
           else if (d == a)
           {
-            MOV(32, R(RSCRATCH), Imm32(magic));
+            MOV(32, R(RSCRATCH), Imm32(m.multiplier));
             IMUL(64, Rd, R(RSCRATCH));
           }
           else
           {
-            MOV(32, Rd, Imm32(magic));
+            MOV(32, Rd, Imm32(m.multiplier));
             IMUL(64, Rd, Ra);
           }
-          SHR(64, Rd, Imm8(shift + 32));
+          SHR(64, Rd, Imm8(m.shift + 32));
         }
       }
       if (inst.OE)
@@ -1792,7 +1779,7 @@ void Jit64::divwx(UGeckoInstruction inst)
     else
     {
       // Optimize signed 32-bit integer division by a constant
-      Magic m = SignedDivisionConstants(divisor);
+      SignedMagic m = SignedDivisionConstants(divisor);
 
       MOVSX(64, 32, RSCRATCH, Ra);
 

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -1538,6 +1538,60 @@ void JitArm64::divwux(UGeckoInstruction inst)
     if (inst.Rc)
       ComputeRC0(gpr.GetImm(d));
   }
+  else if (gpr.IsImm(b))
+  {
+    const u32 divisor = gpr.GetImm(b);
+
+    if (divisor == 0)
+    {
+      gpr.SetImmediate(d, 0);
+      if (inst.Rc)
+        ComputeRC0(0);
+    }
+    else
+    {
+      const bool allocate_reg = d == a;
+      gpr.BindToRegister(d, allocate_reg);
+
+      ARM64Reg RD = gpr.R(d);
+      ARM64Reg RA = gpr.R(a);
+
+      if (MathUtil::IsPow2(divisor))
+      {
+        int shift = MathUtil::IntLog2(divisor);
+        if (shift)
+          LSR(RD, RA, shift);
+        else if (d != a)
+          MOV(RD, RA);
+      }
+      else
+      {
+        UnsignedMagic m = UnsignedDivisionConstants(divisor);
+
+        ARM64Reg WI = allocate_reg ? gpr.GetReg() : RD;
+        ARM64Reg XD = EncodeRegTo64(RD);
+
+        MOVI2R(WI, m.multiplier);
+
+        if (m.fast)
+        {
+          UMULL(XD, RA, WI);
+        }
+        else
+        {
+          UMADDL(XD, RA, WI, EncodeRegTo64(WI));
+        }
+
+        LSR(XD, XD, 32 + m.shift);
+
+        if (allocate_reg)
+          gpr.Unlock(WI);
+      }
+
+      if (inst.Rc)
+        ComputeRC0(gpr.R(d));
+    }
+  }
   else
   {
     gpr.BindToRegister(d, d == a || d == b);
@@ -1675,7 +1729,7 @@ void JitArm64::divwx(UGeckoInstruction inst)
     else
     {
       // Optimize signed 32-bit integer division by a constant
-      Magic m = SignedDivisionConstants(divisor);
+      SignedMagic m = SignedDivisionConstants(divisor);
 
       ARM64Reg WA = gpr.GetReg();
       ARM64Reg WB = gpr.GetReg();

diff --git a/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp b/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp
@@ -3,16 +3,18 @@
 
 #include "Core/PowerPC/JitCommon/DivUtils.h"
 
+#include <algorithm>
+#include <bit>
 #include <cstdlib>
 
 namespace JitCommon
 {
-Magic SignedDivisionConstants(s32 d)
+SignedMagic SignedDivisionConstants(s32 divisor)
 {
   const u32 two31 = 2147483648;
 
-  const u32 ad = std::abs(d);
-  const u32 t = two31 - (d < 0);
+  const u32 ad = std::abs(divisor);
+  const u32 t = two31 - (divisor < 0);
   const u32 anc = t - 1 - t % ad;
   u32 q1 = two31 / anc;
   u32 r1 = two31 - q1 * anc;
@@ -44,13 +46,43 @@ Magic SignedDivisionConstants(s32 d)
     delta = ad - r2;
   } while (q1 < delta || (q1 == delta && r1 == 0));
 
-  Magic mag;
+  SignedMagic mag;
   mag.multiplier = q2 + 1;
-  if (d < 0)
+  if (divisor < 0)
     mag.multiplier = -mag.multiplier;
   mag.shift = p - 32;
 
   return mag;
 }
 
+UnsignedMagic UnsignedDivisionConstants(u32 divisor)  // Computes the multiplier/shift/fast triple for dividing a u32 by `divisor`
+{
+  u32 shift = 31 - std::countl_zero(divisor);  // Index of the highest set bit (divisor must be non-zero)
+
+  u64 magic_dividend = 0x100000000ULL << shift;  // 2^(32 + shift)
+  u32 multiplier = magic_dividend / divisor;  // Quotient rounded down, narrowed to 32 bits
+  u32 max_quotient = multiplier >> shift;  // Multiplier with the extra 2^shift scaling removed
+
+  // Test for failure in round-up method: apply (multiplier + 1) to the dividend max_quotient * divisor - 1
+  u32 round_up = (u64(multiplier + 1) * (max_quotient * divisor - 1)) >> (shift + 32);
+  bool fast = round_up == max_quotient - 1;  // The round-up method is usable only if that dividend yields max_quotient - 1
+
+  if (fast)
+  {
+    multiplier++;  // Switch to the rounded-up multiplier validated above
+
+    // Use smallest magic number and shift amount possible: drop trailing zero bits, but never more than shift
+    u32 trailing_zeroes = std::min(shift, u32(std::countr_zero(multiplier)));
+    multiplier >>= trailing_zeroes;
+    shift -= trailing_zeroes;
+  }
+
+  UnsignedMagic mag;
+  mag.multiplier = multiplier;
+  mag.shift = shift;  // Callers shift right by 32 + shift
+  mag.fast = fast;  // When false, the JIT emitters add the multiplier to the product before shifting
+
+  return mag;
+}
+
 }  // namespace JitCommon
diff --git a/Source/Core/Core/PowerPC/JitCommon/DivUtils.h b/Source/Core/Core/PowerPC/JitCommon/DivUtils.h
@@ -7,7 +7,7 @@
 
 namespace JitCommon
 {
-struct Magic
+struct SignedMagic
 {
   s32 multiplier;
   u8 shift;
@@ -16,6 +16,27 @@ struct Magic
 // Calculate the constants required to optimize a signed 32-bit integer division.
 // Taken from The PowerPC Compiler Writer's Guide and LLVM.
 // Divisor must not be -1, 0, 1 or INT_MIN.
-Magic SignedDivisionConstants(s32 divisor);
+SignedMagic SignedDivisionConstants(s32 divisor);
+
+struct UnsignedMagic  // Result of UnsignedDivisionConstants
+{
+  u32 multiplier;  // 32-bit magic constant the dividend is multiplied by
+  u8 shift;  // Extra right shift on top of the fixed 32 bits (callers shift by 32 + shift)
+  bool fast;  // True: multiply + shift suffices. False: the multiplier must also be added to the product
+};
+
+/// Calculate the constants required to optimize an unsigned 32-bit integer
+/// division.
+/// Divisor must not be 0, 1, or a power of two.
+///
+/// Original implementation by calc84maniac.
+/// Results are the same as the approach laid out in Hacker's Delight, with an
+/// improvement for so-called uncooperative divisors (e.g. 7), as discovered by
+/// ridiculousfish.
+///
+/// See also:
+/// https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
+/// https://rubenvannieuwpoort.nl/posts/division-by-constant-unsigned-integers
+UnsignedMagic UnsignedDivisionConstants(u32 divisor);
 
 }  // namespace JitCommon
diff --git a/Source/UnitTests/Core/PowerPC/DivUtilsTest.cpp b/Source/UnitTests/Core/PowerPC/DivUtilsTest.cpp
@@ -9,12 +9,12 @@ using namespace JitCommon;
 
 TEST(DivUtils, Signed)
 {
-  Magic m3 = SignedDivisionConstants(3);
-  Magic m5 = SignedDivisionConstants(5);
-  Magic m7 = SignedDivisionConstants(7);
-  Magic minus3 = SignedDivisionConstants(-3);
-  Magic minus5 = SignedDivisionConstants(-5);
-  Magic minus7 = SignedDivisionConstants(-7);
+  SignedMagic m3 = SignedDivisionConstants(3);
+  SignedMagic m5 = SignedDivisionConstants(5);
+  SignedMagic m7 = SignedDivisionConstants(7);
+  SignedMagic minus3 = SignedDivisionConstants(-3);
+  SignedMagic minus5 = SignedDivisionConstants(-5);
+  SignedMagic minus7 = SignedDivisionConstants(-7);
 
   EXPECT_EQ(0x55555556, m3.multiplier);
   EXPECT_EQ(0, m3.shift);
@@ -30,3 +30,32 @@ TEST(DivUtils, Signed)
   EXPECT_EQ(0x6DB6DB6D, minus7.multiplier);
   EXPECT_EQ(2, minus7.shift);
 }
+
+TEST(DivUtils, Unsigned)  // Pins the constants produced for a few small non-power-of-two divisors
+{
+  UnsignedMagic m3 = UnsignedDivisionConstants(3);
+  UnsignedMagic m5 = UnsignedDivisionConstants(5);
+  UnsignedMagic m7 = UnsignedDivisionConstants(7);
+  UnsignedMagic m9 = UnsignedDivisionConstants(9);
+  UnsignedMagic m19 = UnsignedDivisionConstants(19);
+
+  EXPECT_EQ(0xAAAAAAABU, m3.multiplier);  // floor(2^33 / 3) + 1, already odd so no reduction
+  EXPECT_EQ(1, m3.shift);
+  EXPECT_TRUE(m3.fast);
+
+  EXPECT_EQ(0xCCCCCCCDU, m5.multiplier);  // floor(2^34 / 5) + 1, already odd so no reduction
+  EXPECT_EQ(2, m5.shift);
+  EXPECT_TRUE(m5.fast);
+
+  EXPECT_EQ(0x92492492U, m7.multiplier);  // floor(2^34 / 7), left unincremented on the slow path
+  EXPECT_EQ(2, m7.shift);
+  EXPECT_FALSE(m7.fast);  // 7 fails the round-up check
+
+  EXPECT_EQ(0x38E38E39U, m9.multiplier);  // (floor(2^35 / 9) + 1) >> 2 after trailing-zero reduction
+  EXPECT_EQ(1, m9.shift);  // Reduced from 3 to 1
+  EXPECT_TRUE(m9.fast);
+
+  EXPECT_EQ(0xD79435E5U, m19.multiplier);  // floor(2^36 / 19), left unincremented on the slow path
+  EXPECT_EQ(4, m19.shift);
+  EXPECT_FALSE(m19.fast);  // 19 fails the round-up check as well
+}