Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #9712 from JosJuice/jitarm64-fmul-rounding
JitArm64: Fix fmul rounding issues
  • Loading branch information
lioncash committed May 20, 2021
2 parents 5949a19 + 11be231 commit 1054abc
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 34 deletions.
39 changes: 37 additions & 2 deletions Source/Core/Common/Arm64Emitter.cpp
Expand Up @@ -2294,6 +2294,15 @@ void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode,
(opcode << 12) | (1 << 11) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
}

void ARM64FloatEmitter::EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
ARM64Reg Rm)
{
ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__);

Write32((1 << 30) | (U << 29) | (0b11110001 << 21) | (size << 22) | (DecodeReg(Rm) << 16) |
(opcode << 11) | (1 << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
}

void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
ARM64Reg Rm)
{
Expand Down Expand Up @@ -3118,6 +3127,11 @@ void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn)
}

// Scalar - 2 Source
void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
ASSERT_MSG(DYNA_REC, IsDouble(Rd), "%s only supports double registers!", __func__);
EmitScalarThreeSame(0, 3, 0b10000, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm);
Expand Down Expand Up @@ -3189,10 +3203,18 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8)
}

// Vector
void ARM64FloatEmitter::ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, size >> 6, 0b10000, Rd, Rn, Rm);
}
void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, 0, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, 1, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
Expand Down Expand Up @@ -3300,6 +3322,10 @@ void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, 2, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, 3, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn);
Expand Down Expand Up @@ -3879,11 +3905,10 @@ void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift)
EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh);
}

void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
void ARM64FloatEmitter::ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op)
{
bool Q = IsQuad(Rd);
u8 cmode = 1;
u8 op = 1;
if (size == 16)
{
ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
Expand Down Expand Up @@ -3919,6 +3944,16 @@ void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
EncodeModImm(Q, op, cmode, 0, Rd, imm);
}

void ARM64FloatEmitter::ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
{
ORR_BIC(size, Rd, imm, shift, 0);
}

void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
{
ORR_BIC(size, Rd, imm, shift, 1);
}

void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
{
bool bundled_loadstore = false;
Expand Down
8 changes: 8 additions & 0 deletions Source/Core/Common/Arm64Emitter.h
Expand Up @@ -1000,6 +1000,7 @@ class ARM64FloatEmitter
void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);

// Scalar - 2 Source
void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
Expand All @@ -1020,7 +1021,9 @@ class ARM64FloatEmitter
void FMOV(ARM64Reg Rd, uint8_t imm8);

// Vector
void ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
Expand All @@ -1043,6 +1046,7 @@ class ARM64FloatEmitter
void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void NOT(ARM64Reg Rd, ARM64Reg Rn);
void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
Expand Down Expand Up @@ -1128,6 +1132,7 @@ class ARM64FloatEmitter

// Modified Immediate
void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
void ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);

void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG,
Expand All @@ -1145,6 +1150,7 @@ class ARM64FloatEmitter
void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
ARM64Reg Rm);
void EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
Expand Down Expand Up @@ -1178,6 +1184,8 @@ class ARM64FloatEmitter
void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);

void ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op);

void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
Expand Down
2 changes: 2 additions & 0 deletions Source/Core/Core/PowerPC/JitArm64/Jit.h
Expand Up @@ -270,6 +270,8 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
bool Rc = false);

void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg);
void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input,
Arm64Gen::ARM64Reg temp);

// <Fastmem fault location, slowmem handler location>
std::map<const u8*, FastmemArea> m_fault_to_handler;
Expand Down
61 changes: 58 additions & 3 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
Expand Up @@ -39,6 +39,29 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
}

// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
// rounding to nearest as it does.
void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input, ARM64Reg temp)
{
ASSERT(output != input && output != temp && input != temp);

// temp = 0x0000'0000'0800'0000ULL
// output = 0xFFFF'FFFF'F800'0000ULL
m_float_emit.MOVI(32, temp, 0x08, 24);
m_float_emit.MOVI(64, output, 0xFFFF'FFFF'0000'0000ULL);
m_float_emit.BIC(temp, temp, output);
m_float_emit.ORR(32, output, 0xF8, 24);

// output = (input & ~0xFFFFFFF) + ((input & (1ULL << 27)) << 1)
m_float_emit.AND(temp, input, temp);
m_float_emit.AND(output, input, output);
if (IsQuad(input))
m_float_emit.ADD(64, output, output, temp);
else
m_float_emit.ADD(output, output, temp);
}

void JitArm64::fp_arith(UGeckoInstruction inst)
{
INSTRUCTION_START
Expand All @@ -51,8 +74,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
bool single = inst.OPCD == 59;
bool packed = inst.OPCD == 4;

bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
bool use_b = op5 != 25; // fmul uses no B
const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
const bool use_b = op5 != 25; // fmul uses no B

const bool outputs_are_singles = single || packed;
const bool round_c = use_c && outputs_are_singles && !js.op->fprIsSingle[inst.FC];

const auto inputs_are_singles_func = [&] {
return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
Expand All @@ -62,6 +88,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)

ARM64Reg VA{}, VB{}, VC{}, VD{};

ARM64Reg V0Q = ARM64Reg::INVALID_REG;

if (packed)
{
const RegType type = inputs_are_singles ? RegType::Single : RegType::Register;
Expand All @@ -75,6 +103,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
VC = reg_encoder(fpr.R(c, type));
VD = reg_encoder(fpr.RW(d, type));

if (round_c)
{
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");

V0Q = fpr.GetReg();
const ARM64Reg V1Q = fpr.GetReg();

Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
VC = reg_encoder(V0Q);

fpr.Unlock(V1Q);
}

switch (op5)
{
case 18:
Expand Down Expand Up @@ -110,6 +151,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
VC = reg_encoder(fpr.R(c, type));
VD = reg_encoder(fpr.RW(d, type_out));

if (round_c)
{
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");

V0Q = fpr.GetReg();
const ARM64Reg V1Q = fpr.GetReg();

Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
VC = reg_encoder(V0Q);

fpr.Unlock(V1Q);
}

switch (op5)
{
case 18:
Expand Down Expand Up @@ -142,7 +196,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}
}

const bool outputs_are_singles = single || packed;
if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q);

if (outputs_are_singles)
{
Expand Down

0 comments on commit 1054abc

Please sign in to comment.