From 06e60ac327c223f189103e2ed15e6896f92717e2 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sat, 3 Dec 2022 17:37:51 +0100
Subject: [PATCH] JitArm64: Implement accurate NaNs

For quite some time now, we've had a setting on x86-64 that makes Dolphin
handle NaNs in a more accurate but slower way. There's only one game that
cares about this, Dragon Ball: Revenge of King Piccolo, and what that game
cares about more specifically is that the default NaN (or "generated NaN"
as I believe it's called in PowerPC documentation) is the same as on
PowerPC. On ARM, the default NaN is the same as on PowerPC, so for the
longest time we didn't need to do anything special to get Dragon Ball:
Revenge of King Piccolo working. However, in 93e636a I changed how we
handle FMA instructions in a way that resulted in the sign of NaNs
becoming inverted for nmadd/nmsub instructions, breaking the game.
To fix this, let's implement the AccurateNaNs setting, like on x86-64.
---
 Source/Core/Common/Arm64Emitter.cpp           |  28 +++
 Source/Core/Common/Arm64Emitter.h             |   8 +
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |   4 +
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 133 +++++++++++++-
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 171 +++++++++++++++++-
 5 files changed, 329 insertions(+), 15 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 084ab249022f..87dae220df25 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2173,6 +2173,12 @@ void ARM64FloatEmitter::EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Re
           (DecodeReg(Rn) << 5) | DecodeReg(Rd));
 }
 
+void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
+{
+  Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) |
+          (DecodeReg(Rn) << 5) | DecodeReg(Rd));
+}
+
 void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
 {
   ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "Singles are not supported!");
@@ -2985,6 +2991,28 @@ void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn)
   EmitScalar2RegMisc(1, IsDouble(Rd) ? 3 : 2, 0x1D, Rd, Rn);
 }
 
+// Scalar - pairwise
+void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd), 0b01101, Rd, Rn);
+}
+void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd), 0b01111, Rd, Rn);
+}
+void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01111, Rd, Rn);
+}
+void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd), 0b01100, Rd, Rn);
+}
+void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01100, Rd, Rn);
+}
+
 // Scalar - 2 Source
 void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 09a26336602c..baaf598741d3 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -1130,6 +1130,13 @@ class ARM64FloatEmitter
   void FRECPE(ARM64Reg Rd, ARM64Reg Rn);
   void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);
 
+  // Scalar - pairwise
+  void FADDP(ARM64Reg Rd, ARM64Reg Rn);
+  void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
+  void FMINP(ARM64Reg Rd, ARM64Reg Rn);
+  void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
+  void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);
+
   // Scalar - 2 Source
   void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -1296,6 +1303,7 @@ class ARM64FloatEmitter
   void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
   void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+  void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
   void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
   void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
                                     ARM64Reg Rn);
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 9372fffd2d40..ae87d815c867 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -177,6 +177,10 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
 
   void FloatCompare(UGeckoInstruction inst, bool upper = false);
 
+  // temp_gpr can be INVALID_REG if single is true
+  void EmitQuietNaNBitConstant(Arm64Gen::ARM64Reg dest_reg, bool single,
+                               Arm64Gen::ARM64Reg temp_gpr);
+
   bool IsFPRStoreSafe(size_t guest_reg) const;
 
 protected:
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index bc0e427a6608..c5624ee6d4f2 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -3,6 +3,8 @@
 
 #include "Core/PowerPC/JitArm64/Jit.h"
 
+#include <optional>
+
 #include "Common/Arm64Emitter.h"
 #include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
@@ -66,14 +68,20 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   FALLBACK_IF(inst.Rc);
   FALLBACK_IF(jo.fp_exceptions || (jo.div_by_zero_exceptions && inst.SUBOP5 == 18));
 
-  u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
-  u32 op5 = inst.SUBOP5;
+  const u32 a = inst.FA;
+  const u32 b = inst.FB;
+  const u32 c = inst.FC;
+  const u32 d = inst.FD;
+  const u32 op5 = inst.SUBOP5;
 
   const bool use_c = op5 >= 25;  // fmul and all kind of fmaddXX
   const bool use_b = op5 != 25;  // fmul uses no B
   const bool fma = use_b && use_c;
   const bool negate_result = (op5 & ~0x1) == 30;
 
+  // Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
+  const bool can_generate_nan = (op5 & ~0x1) != 20;
+
   const bool output_is_single = inst.OPCD == 59;
   const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
   const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
@@ -84,13 +92,12 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   };
   const bool inputs_are_singles = inputs_are_singles_func();
 
-  const RegType type =
-      (inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair;
+  const bool single = inputs_are_singles && output_is_single;
+  const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
   const RegType type_out =
       output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
                          RegType::LowerPair;
-  const auto reg_encoder =
-      (inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble;
+  const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
 
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
   const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
@@ -98,6 +105,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out));
 
   ARM64Reg V0Q = ARM64Reg::INVALID_REG;
+  ARM64Reg V1Q = ARM64Reg::INVALID_REG;
 
   ARM64Reg rounded_c_reg = VC;
   if (round_c)
@@ -118,6 +126,21 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   }
 
   ARM64Reg result_reg = VD;
+  const bool preserve_d =
+      m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
+  if (preserve_d)
+  {
+    V1Q = fpr.GetReg();
+    result_reg = reg_encoder(V1Q);
+  }
+
+  const ARM64Reg temp_gpr = m_accurate_nans && !single ? gpr.GetReg() : ARM64Reg::INVALID_REG;
+
+  if (m_accurate_nans)
+  {
+    if (V0Q == ARM64Reg::INVALID_REG)
+      V0Q = fpr.GetReg();
+  }
 
   switch (op5)
   {
@@ -166,6 +189,74 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     break;
   }
 
+  std::vector<FixupBranch> nan_fixups;
+  if (m_accurate_nans)
+  {
+    // Check if we need to handle NaNs
+    m_float_emit.FCMP(result_reg);
+    FixupBranch no_nan = B(CCFlags::CC_VC);
+    FixupBranch nan = B();
+    SetJumpTarget(no_nan);
+
+    SwitchToFarCode();
+    SetJumpTarget(nan);
+
+    const ARM64Reg quiet_bit_reg = reg_encoder(V0Q);
+
+    EmitQuietNaNBitConstant(quiet_bit_reg, inputs_are_singles && output_is_single, temp_gpr);
+
+    std::vector<ARM64Reg> inputs;
+    inputs.push_back(VA);
+    if (use_b && VA != VB)
+      inputs.push_back(VB);
+    if (use_c && VA != VC && (!use_b || VB != VC))
+      inputs.push_back(VC);
+
+    // If any inputs are NaNs, pick the first NaN of them and OR it with the quiet bit
+    for (size_t i = 0; i < inputs.size(); ++i)
+    {
+      // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
+      // least one NaN input
+      const bool check_input = can_generate_nan || i != inputs.size() - 1;
+
+      const ARM64Reg input = inputs[i];
+      FixupBranch skip;
+      if (check_input)
+      {
+        m_float_emit.FCMP(input);
+        skip = B(CCFlags::CC_VC);
+      }
+
+      m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(input),
+                       EncodeRegToDouble(quiet_bit_reg));
+      nan_fixups.push_back(B());
+
+      if (check_input)
+        SetJumpTarget(skip);
+    }
+
+    std::optional<FixupBranch> nan_early_fixup;
+    if (can_generate_nan)
+    {
+      // There was no NaN in any of the inputs, so the NaN must have been generated by the
+      // arithmetic instruction. In this case, the result is already correct.
+      if (negate_result)
+      {
+        if (result_reg != VD)
+          m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
+        nan_fixups.push_back(B());
+      }
+      else
+      {
+        nan_early_fixup = B();
+      }
+    }
+
+    SwitchToNearCode();
+
+    if (nan_early_fixup)
+      SetJumpTarget(*nan_early_fixup);
+  }
 
   // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
   // for any of AArch64's FMA instructions, so we negate using a separate instruction.
@@ -174,8 +265,15 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   else if (result_reg != VD)
     m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
 
+  for (FixupBranch fixup : nan_fixups)
+    SetJumpTarget(fixup);
+
   if (V0Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V0Q);
+  if (V1Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V1Q);
+  if (temp_gpr != ARM64Reg::INVALID_REG)
+    gpr.Unlock(temp_gpr);
 
   if (output_is_single)
   {
@@ -787,6 +885,29 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR
   }
 }
 
+void JitArm64::EmitQuietNaNBitConstant(ARM64Reg dest_reg, bool single, ARM64Reg temp_gpr)
+{
+  // dest_reg = QNaN & ~SNaN
+  //
+  // (Alternatively, dest_reg = QNaN would also work, but that would take
+  // two instructions to emit even for singles)
+
+  if (single)
+  {
+    m_float_emit.MOVI(32, dest_reg, 0x40, 16);
+  }
+  else
+  {
+    ASSERT(temp_gpr != ARM64Reg::INVALID_REG);
+
+    MOVI2R(EncodeRegTo64(temp_gpr), 0x0008'0000'0000'0000);
+    if (IsQuad(dest_reg))
+      m_float_emit.DUP(64, dest_reg, EncodeRegTo64(temp_gpr));
+    else
+      m_float_emit.FMOV(dest_reg, EncodeRegTo64(temp_gpr));
+  }
+}
+
 bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
 {
   return js.fpr_is_store_safe[guest_reg];
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 4e986b0ce23c..4c0730f9d0b2 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -83,8 +83,11 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   const u32 d = inst.FD;
   const u32 op5 = inst.SUBOP5;
 
+  const bool muls = (op5 & ~0x1) == 12;
+  const bool madds = (op5 & ~0x1) == 14;
   const bool use_c = op5 == 25 || (op5 & ~0x13) == 12;  // mul, muls, and all kinds of maddXX
-  const bool use_b = op5 != 25 && (op5 & ~0x1) != 12;   // mul and muls don't use B
+  const bool use_b = op5 != 25 && !muls;                // mul and muls don't use B
+  const bool duplicated_c = muls || madds;
   const bool fma = use_b && use_c;
   const bool negate_result = (op5 & ~0x1) == 30;
   const bool msub = op5 == 28 || op5 == 30;
@@ -107,6 +110,8 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
 
   ARM64Reg V0Q = ARM64Reg::INVALID_REG;
   ARM64Reg V1Q = ARM64Reg::INVALID_REG;
+  ARM64Reg V2Q = ARM64Reg::INVALID_REG;
+  ARM64Reg V3Q = ARM64Reg::INVALID_REG;
 
   ARM64Reg rounded_c_reg = VC;
   if (round_c)
@@ -127,12 +132,29 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   }
 
   ARM64Reg result_reg = VD;
-  if (fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg))
+  const bool need_accurate_fma_reg =
+      fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
+  const bool preserve_d =
+      m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
+  if (need_accurate_fma_reg || preserve_d)
   {
     V1Q = fpr.GetReg();
     result_reg = reg_encoder(V1Q);
   }
 
+  const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
+
+  if (m_accurate_nans)
+  {
+    if (V0Q == ARM64Reg::INVALID_REG)
+      V0Q = fpr.GetReg();
+
+    V2Q = fpr.GetReg();
+
+    if (duplicated_c || VD == result_reg)
+      V3Q = fpr.GetReg();
+  }
+
   switch (op5)
   {
   case 12:  // ps_muls0: d = a * c.ps0
@@ -211,6 +233,69 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     break;
   }
 
+  FixupBranch nan_fixup;
+  if (m_accurate_nans)
+  {
+    const ARM64Reg nan_temp_reg = singles ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
+    const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
+
+    const ARM64Reg zero_reg = reg_encoder(V2Q);
+
+    // Check if we need to handle NaNs
+
+    m_float_emit.FMAXP(nan_temp_reg, result_reg);
+    m_float_emit.FCMP(nan_temp_reg);
+    FixupBranch no_nan = B(CCFlags::CC_VC);
+    FixupBranch nan = B();
+    SetJumpTarget(no_nan);
+
+    SwitchToFarCode();
+    SetJumpTarget(nan);
+
+    // Pick the right NaNs
+
+    m_float_emit.MOVI(8, zero_reg, 0);
+
+    const auto check_input = [&](ARM64Reg input) {
+      m_float_emit.FACGE(size, nan_temp_reg_paired, input, zero_reg);
+      m_float_emit.BIF(result_reg, input, nan_temp_reg_paired);
+    };
+
+    ARM64Reg c_reg_for_nan_purposes = VC;
+    if (duplicated_c)
+    {
+      c_reg_for_nan_purposes = reg_encoder(V3Q);
+      m_float_emit.DUP(size, c_reg_for_nan_purposes, VC, op5 & 0x1);
+    }
+
+    if (use_c)
+      check_input(c_reg_for_nan_purposes);
+
+    if (use_b && (!use_c || VB != c_reg_for_nan_purposes))
+      check_input(VB);
+
+    if ((!use_b || VA != VB) && (!use_c || VA != c_reg_for_nan_purposes))
+      check_input(VA);
+
+    // Make the NaNs quiet
+
+    const ARM64Reg quiet_bit_reg = VD == result_reg ? reg_encoder(V3Q) : VD;
+    EmitQuietNaNBitConstant(quiet_bit_reg, singles, temp_gpr);
+
+    m_float_emit.FACGE(size, nan_temp_reg_paired, result_reg, zero_reg);
+    m_float_emit.ORR(quiet_bit_reg, quiet_bit_reg, result_reg);
+    if (negate_result)
+      m_float_emit.FNEG(size, result_reg, result_reg);
+    if (VD == result_reg)
+      m_float_emit.BIF(VD, quiet_bit_reg, nan_temp_reg_paired);
+    else  // quiet_bit_reg == VD
+      m_float_emit.BIT(VD, result_reg, nan_temp_reg_paired);
+
+    nan_fixup = B();
+
+    SwitchToNearCode();
+  }
+
   // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
   // for any of AArch64's FMA instructions, so we negate using a separate instruction.
   if (negate_result)
@@ -218,10 +303,19 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   else if (result_reg != VD)
     m_float_emit.MOV(VD, result_reg);
 
+  if (m_accurate_nans)
+    SetJumpTarget(nan_fixup);
+
   if (V0Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V0Q);
   if (V1Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V1Q);
+  if (V2Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V2Q);
+  if (V3Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V3Q);
+  if (temp_gpr != ARM64Reg::INVALID_REG)
+    gpr.Unlock(temp_gpr);
 
   ASSERT_MSG(DYNA_REC, singles == singles_func(),
              "Register allocation turned singles into doubles in the middle of ps_arith");
@@ -283,32 +377,91 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
   const u32 c = inst.FC;
   const u32 d = inst.FD;
 
-  const bool upper = inst.SUBOP5 == 11;
+  const bool upper = inst.SUBOP5 & 0x1;
 
   const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
   const RegType type = singles ? RegType::Single : RegType::Register;
   const u8 size = singles ? 32 : 64;
   const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
+  const auto scalar_reg_encoder = singles ? EncodeRegToSingle : EncodeRegToDouble;
 
   const ARM64Reg VA = fpr.R(a, type);
   const ARM64Reg VB = fpr.R(b, type);
   const ARM64Reg VC = fpr.R(c, type);
   const ARM64Reg VD = fpr.RW(d, type);
   const ARM64Reg V0 = fpr.GetReg();
+  const ARM64Reg V1 = m_accurate_nans ? fpr.GetReg() : ARM64Reg::INVALID_REG;
+  const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
 
-  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(upper ? VA : VB), upper ? 0 : 1);
-  if (d != c)
+  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
+
+  FixupBranch a_nan_done, b_nan_done;
+  if (m_accurate_nans)
   {
-    m_float_emit.FADD(size, reg_encoder(VD), reg_encoder(V0), reg_encoder(upper ? VB : VA));
-    m_float_emit.INS(size, VD, upper ? 0 : 1, VC, upper ? 0 : 1);
+    const auto check_nan = [&](ARM64Reg input) {
+      m_float_emit.FCMP(scalar_reg_encoder(input));
+      FixupBranch not_nan = B(CCFlags::CC_VC);
+      FixupBranch nan = B();
+      SetJumpTarget(not_nan);
+
+      SwitchToFarCode();
+      SetJumpTarget(nan);
+
+      EmitQuietNaNBitConstant(scalar_reg_encoder(V1), singles, temp_gpr);
+
+      if (upper)
+      {
+        m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
+        m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V1));
+      }
+      else if (d != c)
+      {
+        m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(V1), EncodeRegToDouble(input));
+        m_float_emit.INS(size, VD, 1, VC, 1);
+      }
+      else
+      {
+        m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
+        m_float_emit.INS(size, VD, 0, V1, 0);
+      }
+
+      FixupBranch nan_done = B();
+      SwitchToNearCode();
+
+      return nan_done;
+    };
+
+    a_nan_done = check_nan(VA);
+    b_nan_done = check_nan(V0);
+  }
+
+  if (upper)
+  {
+    m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+    m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
+  }
+  else if (d != c)
+  {
+    m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+    m_float_emit.INS(size, VD, 1, VC, 1);
   }
   else
   {
-    m_float_emit.FADD(size, reg_encoder(V0), reg_encoder(V0), reg_encoder(upper ? VB : VA));
-    m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
+    m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+    m_float_emit.INS(size, VD, 0, V0, 0);
+  }
+
+  if (m_accurate_nans)
+  {
+    SetJumpTarget(a_nan_done);
+    SetJumpTarget(b_nan_done);
   }
 
   fpr.Unlock(V0);
+  if (m_accurate_nans)
+    fpr.Unlock(V1);
+  if (temp_gpr != ARM64Reg::INVALID_REG)
+    gpr.Unlock(temp_gpr);
 
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
              "Register allocation turned singles into doubles in the middle of ps_sumX");