Skip to content

Commit

Permalink
Merge pull request #12661 from Sintendo/arm64divwux
Browse files Browse the repository at this point in the history
JitArm64: Optimize divwux
  • Loading branch information
JosJuice committed Mar 29, 2024
2 parents 5d57a82 + 2580837 commit 5f6a054
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 40 deletions.
39 changes: 13 additions & 26 deletions Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
Expand Up @@ -1451,12 +1451,10 @@ void Jit64::divwux(UGeckoInstruction inst)
}
else
{
u32 shift = 31;
while (!(divisor & (1 << shift)))
shift--;

if (divisor == (u32)(1 << shift))
if (MathUtil::IsPow2(divisor))
{
u32 shift = MathUtil::IntLog2(divisor);

RCOpArg Ra = gpr.Use(a, RCMode::Read);
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd);
Expand All @@ -1468,24 +1466,22 @@ void Jit64::divwux(UGeckoInstruction inst)
}
else
{
u64 magic_dividend = 0x100000000ULL << shift;
u32 magic = (u32)(magic_dividend / divisor);
u32 max_quotient = magic >> shift;
UnsignedMagic m = UnsignedDivisionConstants(divisor);

// Test for failure in round-up method
if (((u64)(magic + 1) * (max_quotient * divisor - 1)) >> (shift + 32) != max_quotient - 1)
if (!m.fast)
{
// If failed, use slower round-down method
RCOpArg Ra = gpr.Use(a, RCMode::Read);
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd);

MOV(32, R(RSCRATCH), Imm32(magic));
MOV(32, R(RSCRATCH), Imm32(m.multiplier));
if (d != a)
MOV(32, Rd, Ra);
IMUL(64, Rd, R(RSCRATCH));
ADD(64, Rd, R(RSCRATCH));
SHR(64, Rd, Imm8(shift + 32));
SHR(64, Rd, Imm8(m.shift + 32));
}
else
{
Expand All @@ -1494,32 +1490,23 @@ void Jit64::divwux(UGeckoInstruction inst)
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd);

magic++;

// Use smallest magic number and shift amount possible
while ((magic & 1) == 0 && shift > 0)
{
magic >>= 1;
shift--;
}

// Three-operand IMUL sign extends the immediate to 64 bits, so we may only
// use it when the magic number has its most significant bit set to 0
if ((magic & 0x80000000) == 0)
if ((m.multiplier & 0x80000000) == 0)
{
IMUL(64, Rd, Ra, Imm32(magic));
IMUL(64, Rd, Ra, Imm32(m.multiplier));
}
else if (d == a)
{
MOV(32, R(RSCRATCH), Imm32(magic));
MOV(32, R(RSCRATCH), Imm32(m.multiplier));
IMUL(64, Rd, R(RSCRATCH));
}
else
{
MOV(32, Rd, Imm32(magic));
MOV(32, Rd, Imm32(m.multiplier));
IMUL(64, Rd, Ra);
}
SHR(64, Rd, Imm8(shift + 32));
SHR(64, Rd, Imm8(m.shift + 32));
}
}
if (inst.OE)
Expand Down Expand Up @@ -1792,7 +1779,7 @@ void Jit64::divwx(UGeckoInstruction inst)
else
{
// Optimize signed 32-bit integer division by a constant
Magic m = SignedDivisionConstants(divisor);
SignedMagic m = SignedDivisionConstants(divisor);

MOVSX(64, 32, RSCRATCH, Ra);

Expand Down
56 changes: 55 additions & 1 deletion Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
Expand Up @@ -1538,6 +1538,60 @@ void JitArm64::divwux(UGeckoInstruction inst)
if (inst.Rc)
ComputeRC0(gpr.GetImm(d));
}
else if (gpr.IsImm(b))
{
const u32 divisor = gpr.GetImm(b);

if (divisor == 0)
{
gpr.SetImmediate(d, 0);
if (inst.Rc)
ComputeRC0(0);
}
else
{
const bool allocate_reg = d == a;
gpr.BindToRegister(d, allocate_reg);

ARM64Reg RD = gpr.R(d);
ARM64Reg RA = gpr.R(a);

if (MathUtil::IsPow2(divisor))
{
int shift = MathUtil::IntLog2(divisor);
if (shift)
LSR(RD, RA, shift);
else if (d != a)
MOV(RD, RA);
}
else
{
UnsignedMagic m = UnsignedDivisionConstants(divisor);

ARM64Reg WI = allocate_reg ? gpr.GetReg() : RD;
ARM64Reg XD = EncodeRegTo64(RD);

MOVI2R(WI, m.multiplier);

if (m.fast)
{
UMULL(XD, RA, WI);
}
else
{
UMADDL(XD, RA, WI, EncodeRegTo64(WI));
}

LSR(XD, XD, 32 + m.shift);

if (allocate_reg)
gpr.Unlock(WI);
}

if (inst.Rc)
ComputeRC0(gpr.R(d));
}
}
else
{
gpr.BindToRegister(d, d == a || d == b);
Expand Down Expand Up @@ -1675,7 +1729,7 @@ void JitArm64::divwx(UGeckoInstruction inst)
else
{
// Optimize signed 32-bit integer division by a constant
Magic m = SignedDivisionConstants(divisor);
SignedMagic m = SignedDivisionConstants(divisor);

ARM64Reg WA = gpr.GetReg();
ARM64Reg WB = gpr.GetReg();
Expand Down
42 changes: 37 additions & 5 deletions Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp
Expand Up @@ -3,16 +3,18 @@

#include "Core/PowerPC/JitCommon/DivUtils.h"

#include <algorithm>
#include <bit>
#include <cstdlib>

namespace JitCommon
{
Magic SignedDivisionConstants(s32 d)
SignedMagic SignedDivisionConstants(s32 divisor)
{
const u32 two31 = 2147483648;

const u32 ad = std::abs(d);
const u32 t = two31 - (d < 0);
const u32 ad = std::abs(divisor);
const u32 t = two31 - (divisor < 0);
const u32 anc = t - 1 - t % ad;
u32 q1 = two31 / anc;
u32 r1 = two31 - q1 * anc;
Expand Down Expand Up @@ -44,13 +46,43 @@ Magic SignedDivisionConstants(s32 d)
delta = ad - r2;
} while (q1 < delta || (q1 == delta && r1 == 0));

Magic mag;
SignedMagic mag;
mag.multiplier = q2 + 1;
if (d < 0)
if (divisor < 0)
mag.multiplier = -mag.multiplier;
mag.shift = p - 32;

return mag;
}

// Computes the multiplier/shift pair used to replace an unsigned 32-bit
// division by the constant `divisor` with a multiplication and a right shift.
//
// The divisor is expected to be neither 0, 1, nor a power of two (see the
// declaration in DivUtils.h); those values are not handled here.
//
// The returned `fast` flag reports whether the round-up multiplier could be
// used. When it is false, the round-down multiplier is returned unreduced.
UnsignedMagic UnsignedDivisionConstants(u32 divisor)
{
  // Index of the divisor's most significant set bit.
  const u32 msb_index = 31 - std::countl_zero(divisor);

  // Round-down candidate: (2^32 << msb_index) / divisor, kept in 32 bits.
  const u64 scaled_one = 0x100000000ULL << msb_index;
  const u32 floor_multiplier = scaled_one / divisor;
  const u32 max_quotient = floor_multiplier >> msb_index;

  // Probe the round-up method with the dividend just below
  // max_quotient * divisor: it works only if that dividend still yields
  // max_quotient - 1.
  const u32 probe_dividend = max_quotient * divisor - 1;
  const u32 probe_quotient = (u64(floor_multiplier + 1) * probe_dividend) >> (msb_index + 32);

  UnsignedMagic result;
  result.fast = probe_quotient == max_quotient - 1;

  if (result.fast)
  {
    const u32 ceil_multiplier = floor_multiplier + 1;

    // Strip common factors of two so that the smallest possible multiplier
    // and shift amount are returned.
    const u32 reduction = std::min(msb_index, u32(std::countr_zero(ceil_multiplier)));
    result.multiplier = ceil_multiplier >> reduction;
    result.shift = msb_index - reduction;
  }
  else
  {
    result.multiplier = floor_multiplier;
    result.shift = msb_index;
  }

  return result;
}

} // namespace JitCommon
25 changes: 23 additions & 2 deletions Source/Core/Core/PowerPC/JitCommon/DivUtils.h
Expand Up @@ -7,7 +7,7 @@

namespace JitCommon
{
struct Magic
struct SignedMagic
{
s32 multiplier;
u8 shift;
Expand All @@ -16,6 +16,27 @@ struct Magic
// Calculate the constants required to optimize a signed 32-bit integer division.
// Taken from The PowerPC Compiler Writer's Guide and LLVM.
// Divisor must not be -1, 0, 1 or INT_MIN.
Magic SignedDivisionConstants(s32 divisor);
SignedMagic SignedDivisionConstants(s32 divisor);

// Constants produced by UnsignedDivisionConstants for replacing an unsigned
// 32-bit division by a constant with a multiply and a shift.
struct UnsignedMagic
{
// 32-bit factor the dividend is multiplied by (as a 64-bit product).
u32 multiplier;
// Extra right shift applied to the 64-bit product, on top of the base shift of 32.
u8 shift;
// True when the round-up method succeeded and a plain multiply + shift is enough.
// False when the round-down method is required, in which case the multiplier
// must additionally be added to the product before shifting.
bool fast;
};

/// Calculate the constants required to optimize an unsigned 32-bit integer
/// division.
/// Divisor must not be 0, 1, or a power of two.
///
/// For a 32-bit dividend n, the JIT backends use the result as follows
/// (all arithmetic in 64 bits):
///   fast:     quotient = (n * multiplier) >> (32 + shift)
///   not fast: quotient = (n * multiplier + multiplier) >> (32 + shift)
///
/// Original implementation by calc84maniac.
/// Results are the same as the approach laid out in Hacker's Delight, with an
/// improvement for so-called uncooperative divisors (e.g. 7), as discovered by
/// ridiculousfish.
///
/// See also:
/// https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
/// https://rubenvannieuwpoort.nl/posts/division-by-constant-unsigned-integers
UnsignedMagic UnsignedDivisionConstants(u32 divisor);

} // namespace JitCommon
41 changes: 35 additions & 6 deletions Source/UnitTests/Core/PowerPC/DivUtilsTest.cpp
Expand Up @@ -9,12 +9,12 @@ using namespace JitCommon;

TEST(DivUtils, Signed)
{
Magic m3 = SignedDivisionConstants(3);
Magic m5 = SignedDivisionConstants(5);
Magic m7 = SignedDivisionConstants(7);
Magic minus3 = SignedDivisionConstants(-3);
Magic minus5 = SignedDivisionConstants(-5);
Magic minus7 = SignedDivisionConstants(-7);
SignedMagic m3 = SignedDivisionConstants(3);
SignedMagic m5 = SignedDivisionConstants(5);
SignedMagic m7 = SignedDivisionConstants(7);
SignedMagic minus3 = SignedDivisionConstants(-3);
SignedMagic minus5 = SignedDivisionConstants(-5);
SignedMagic minus7 = SignedDivisionConstants(-7);

EXPECT_EQ(0x55555556, m3.multiplier);
EXPECT_EQ(0, m3.shift);
Expand All @@ -30,3 +30,32 @@ TEST(DivUtils, Signed)
EXPECT_EQ(0x6DB6DB6D, minus7.multiplier);
EXPECT_EQ(2, minus7.shift);
}

// Checks the multiplier, shift and fast flag that UnsignedDivisionConstants
// returns for a handful of small divisors, covering both the round-up (fast)
// and round-down (not fast) outcomes.
TEST(DivUtils, Unsigned)
{
  // 3: round-up method applies, nothing to reduce.
  const UnsignedMagic m3 = UnsignedDivisionConstants(3);
  EXPECT_EQ(0xAAAAAAABU, m3.multiplier);
  EXPECT_EQ(1, m3.shift);
  EXPECT_TRUE(m3.fast);

  // 5: round-up method applies, nothing to reduce.
  const UnsignedMagic m5 = UnsignedDivisionConstants(5);
  EXPECT_EQ(0xCCCCCCCDU, m5.multiplier);
  EXPECT_EQ(2, m5.shift);
  EXPECT_TRUE(m5.fast);

  // 7: round-up method fails, so the round-down multiplier is expected.
  const UnsignedMagic m7 = UnsignedDivisionConstants(7);
  EXPECT_EQ(0x92492492U, m7.multiplier);
  EXPECT_EQ(2, m7.shift);
  EXPECT_FALSE(m7.fast);

  // 9: round-up method applies and the even multiplier is reduced,
  // which lowers the shift from 3 to 1.
  const UnsignedMagic m9 = UnsignedDivisionConstants(9);
  EXPECT_EQ(0x38E38E39U, m9.multiplier);
  EXPECT_EQ(1, m9.shift);
  EXPECT_TRUE(m9.fast);

  // 19: round-up method fails, so the round-down multiplier is expected.
  const UnsignedMagic m19 = UnsignedDivisionConstants(19);
  EXPECT_EQ(0xD79435E5U, m19.multiplier);
  EXPECT_EQ(4, m19.shift);
  EXPECT_FALSE(m19.fast);
}

0 comments on commit 5f6a054

Please sign in to comment.