Skip to content

Commit

Permalink
JitArm64: Implement accurate NaNs
Browse files Browse the repository at this point in the history
For quite some time now, we've had a setting on x86-64 that makes Dolphin
handle NaNs in a more accurate but slower way. There's only one game that
cares about this, Dragon Ball: Revenge of King Piccolo, and what that game
cares about more specifically is that the default NaN (or "generated NaN"
as I believe it's called in PowerPC documentation) is the same as on
PowerPC. On ARM, the default NaN is the same as on PowerPC, so for the
longest time we didn't need to do anything special to get Dragon Ball:
Revenge of King Piccolo working. However, in 93e636a I changed how we
handle FMA instructions in a way that resulted in the sign of NaNs
becoming inverted for nmadd/nmsub instructions, breaking the game.
To fix this, let's implement the AccurateNaNs setting, like on x86-64.
  • Loading branch information
JosJuice committed Dec 3, 2022
1 parent 5c41d3b commit 06e60ac
Show file tree
Hide file tree
Showing 5 changed files with 329 additions and 15 deletions.
28 changes: 28 additions & 0 deletions Source/Core/Common/Arm64Emitter.cpp
Expand Up @@ -2173,6 +2173,12 @@ void ARM64FloatEmitter::EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Re
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
}

void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) |
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
}

void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "Singles are not supported!");
Expand Down Expand Up @@ -2985,6 +2991,28 @@ void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn)
EmitScalar2RegMisc(1, IsDouble(Rd) ? 3 : 2, 0x1D, Rd, Rn);
}

// Scalar - pairwise
void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn)
{
  // Scalar pairwise FADDP: U is set, and the size field is 1 when Rd is a double
  // register and 0 otherwise.
  const u32 size = IsDouble(Rd);
  EmitScalarPairwise(true, size, 0b01101, Rd, Rn);
}
void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn)
{
  // Scalar pairwise FMAXP: shares opcode 0b01111 with FMINP and is told apart by the
  // size field, which here is 1 for a double Rd and 0 otherwise.
  const u32 size = IsDouble(Rd);
  EmitScalarPairwise(true, size, 0b01111, Rd, Rn);
}
void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn)
{
  // Scalar pairwise FMINP: shares opcode 0b01111 with FMAXP and is told apart by the
  // size field, which here is 3 for a double Rd and 2 otherwise.
  const u32 size = IsDouble(Rd) ? 3 : 2;
  EmitScalarPairwise(true, size, 0b01111, Rd, Rn);
}
void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn)
{
  // Scalar pairwise FMAXNMP: shares opcode 0b01100 with FMINNMP and is told apart by
  // the size field, which here is 1 for a double Rd and 0 otherwise.
  const u32 size = IsDouble(Rd);
  EmitScalarPairwise(true, size, 0b01100, Rd, Rn);
}
void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn)
{
  // Scalar pairwise FMINNMP: shares opcode 0b01100 with FMAXNMP and is told apart by
  // the size field, which here is 3 for a double Rd and 2 otherwise.
  const u32 size = IsDouble(Rd) ? 3 : 2;
  EmitScalarPairwise(true, size, 0b01100, Rd, Rn);
}

// Scalar - 2 Source
void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
Expand Down
8 changes: 8 additions & 0 deletions Source/Core/Common/Arm64Emitter.h
Expand Up @@ -1130,6 +1130,13 @@ class ARM64FloatEmitter
void FRECPE(ARM64Reg Rd, ARM64Reg Rn);
void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);

// Scalar - pairwise
void FADDP(ARM64Reg Rd, ARM64Reg Rn);
void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
void FMINP(ARM64Reg Rd, ARM64Reg Rn);
void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);

// Scalar - 2 Source
void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
Expand Down Expand Up @@ -1296,6 +1303,7 @@ class ARM64FloatEmitter
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
ARM64Reg Rn);
Expand Down
4 changes: 4 additions & 0 deletions Source/Core/Core/PowerPC/JitArm64/Jit.h
Expand Up @@ -177,6 +177,10 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA

void FloatCompare(UGeckoInstruction inst, bool upper = false);

// Emits code that loads the quiet NaN bit constant (QNaN & ~SNaN) into dest_reg.
// single selects the 32-bit form of the constant; otherwise the 64-bit form is
// built in temp_gpr first and then transferred to dest_reg.
// temp_gpr can be INVALID_REG if single is true
void EmitQuietNaNBitConstant(Arm64Gen::ARM64Reg dest_reg, bool single,
Arm64Gen::ARM64Reg temp_gpr);

bool IsFPRStoreSafe(size_t guest_reg) const;

protected:
Expand Down
133 changes: 127 additions & 6 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
Expand Up @@ -3,6 +3,8 @@

#include "Core/PowerPC/JitArm64/Jit.h"

#include <optional>

#include "Common/Arm64Emitter.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
Expand Down Expand Up @@ -66,14 +68,20 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
FALLBACK_IF(inst.Rc);
FALLBACK_IF(jo.fp_exceptions || (jo.div_by_zero_exceptions && inst.SUBOP5 == 18));

u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
u32 op5 = inst.SUBOP5;
const u32 a = inst.FA;
const u32 b = inst.FB;
const u32 c = inst.FC;
const u32 d = inst.FD;
const u32 op5 = inst.SUBOP5;

const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
const bool use_b = op5 != 25; // fmul uses no B
const bool fma = use_b && use_c;
const bool negate_result = (op5 & ~0x1) == 30;

// Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
const bool can_generate_nan = (op5 & ~0x1) != 20;

const bool output_is_single = inst.OPCD == 59;
const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
Expand All @@ -84,20 +92,20 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
};
const bool inputs_are_singles = inputs_are_singles_func();

const RegType type =
(inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair;
const bool single = inputs_are_singles && output_is_single;
const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
const RegType type_out =
output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
RegType::LowerPair;
const auto reg_encoder =
(inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble;
const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;

const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out));

ARM64Reg V0Q = ARM64Reg::INVALID_REG;
ARM64Reg V1Q = ARM64Reg::INVALID_REG;

ARM64Reg rounded_c_reg = VC;
if (round_c)
Expand All @@ -118,6 +126,21 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}

ARM64Reg result_reg = VD;
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (preserve_d)
{
V1Q = fpr.GetReg();
result_reg = reg_encoder(V1Q);
}

const ARM64Reg temp_gpr = m_accurate_nans && !single ? gpr.GetReg() : ARM64Reg::INVALID_REG;

if (m_accurate_nans)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetReg();
}

switch (op5)
{
Expand Down Expand Up @@ -166,6 +189,74 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
break;
}

std::vector<FixupBranch> nan_fixups;
if (m_accurate_nans)
{
// Check if we need to handle NaNs
m_float_emit.FCMP(result_reg);
FixupBranch no_nan = B(CCFlags::CC_VC);
FixupBranch nan = B();
SetJumpTarget(no_nan);

SwitchToFarCode();
SetJumpTarget(nan);

const ARM64Reg quiet_bit_reg = reg_encoder(V0Q);

EmitQuietNaNBitConstant(quiet_bit_reg, inputs_are_singles && output_is_single, temp_gpr);

std::vector<ARM64Reg> inputs;
inputs.push_back(VA);
if (use_b && VA != VB)
inputs.push_back(VB);
if (use_c && VA != VC && (!use_b || VB != VC))
inputs.push_back(VC);

// If any inputs are NaNs, pick the first NaN of them and OR it with the quiet bit
for (size_t i = 0; i < inputs.size(); ++i)
{
// Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
// least one NaN input
const bool check_input = can_generate_nan || i != inputs.size() - 1;

const ARM64Reg input = inputs[i];
FixupBranch skip;
if (check_input)
{
m_float_emit.FCMP(input);
skip = B(CCFlags::CC_VC);
}

m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(input),
EncodeRegToDouble(quiet_bit_reg));
nan_fixups.push_back(B());

if (check_input)
SetJumpTarget(skip);
}

std::optional<FixupBranch> nan_early_fixup;
if (can_generate_nan)
{
// There was no NaN in any of the inputs, so the NaN must have been generated by the
// arithmetic instruction. In this case, the result is already correct.
if (negate_result)
{
if (result_reg != VD)
m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
nan_fixups.push_back(B());
}
else
{
nan_early_fixup = B();
}
}

SwitchToNearCode();

if (nan_early_fixup)
SetJumpTarget(*nan_early_fixup);
}

// PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
// for any of AArch64's FMA instructions, so we negate using a separate instruction.
Expand All @@ -174,8 +265,15 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
else if (result_reg != VD)
m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));

for (FixupBranch fixup : nan_fixups)
SetJumpTarget(fixup);

if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q);
if (V1Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V1Q);
if (temp_gpr != ARM64Reg::INVALID_REG)
gpr.Unlock(temp_gpr);

if (output_is_single)
{
Expand Down Expand Up @@ -787,6 +885,29 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR
}
}

void JitArm64::EmitQuietNaNBitConstant(ARM64Reg dest_reg, bool single, ARM64Reg temp_gpr)
{
  // Loads dest_reg with QNaN & ~SNaN, i.e. only the bit that separates a quiet NaN
  // from a signaling one.
  //
  // (Loading a full QNaN would work as well, but it would take two instructions
  // to emit even for singles)

  if (single)
  {
    // One vector move-immediate is enough for singles, so temp_gpr is not touched.
    // NOTE(review): presumably this yields 0x40 << 16 = 0x00400000 in every 32-bit lane;
    // confirm against the MOVI emitter.
    m_float_emit.MOVI(32, dest_reg, 0x40, 16);
    return;
  }

  // The 64-bit constant is materialized in a GPR first, so a scratch GPR is mandatory here.
  ASSERT(temp_gpr != ARM64Reg::INVALID_REG);

  const auto scratch64 = EncodeRegTo64(temp_gpr);
  MOVI2R(scratch64, 0x0008'0000'0000'0000);

  // Quad destinations go through DUP, all other destinations through FMOV.
  if (IsQuad(dest_reg))
    m_float_emit.DUP(64, dest_reg, scratch64);
  else
    m_float_emit.FMOV(dest_reg, scratch64);
}

bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
{
return js.fpr_is_store_safe[guest_reg];
Expand Down

0 comments on commit 06e60ac

Please sign in to comment.