dolphin-emu · AdmiralCurtiss · Apr 9, 2024 · Oct 8, 2022
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -107,8 +107,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
   if (!m_accurate_nans)
     return;
 
-  ASSERT(xmm != clobber);
-
   if (inst.OPCD != 4)
   {
     // not paired-single
@@ -148,6 +146,8 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
   {
     // paired-single
 
+    ASSERT(xmm != clobber);
+
     if (cpu_info.bSSE4_1)
     {
       avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD);
@@ -325,7 +325,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   FALLBACK_IF(jo.fp_exceptions);
 
   // We would like to emulate FMA instructions accurately without rounding error if possible, but
-  // unfortunately emulating FMA in software is just too slow on CPUs that are too old to have FMA
+  // unfortunately, emulating FMA in software is just too slow on CPUs that are too old to have FMA
   // instructions, so we have the Config::SESSION_USE_FMA setting to determine whether we should
   // emulate FMA instructions accurately or by a performing a multiply followed by a separate add.
   //
@@ -346,6 +346,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   int d = inst.FD;
   bool single = inst.OPCD == 4 || inst.OPCD == 59;
   bool round_input = single && !js.op->fprIsSingle[c];
+  bool preserve_inputs = m_accurate_nans;
+  bool preserve_d = preserve_inputs && (a == d || b == d || c == d);
   bool packed =
       inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
                          js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
@@ -356,21 +358,35 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   const bool madds1 = inst.SUBOP5 == 15;
   const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1);
 
+  X64Reg scratch_xmm = XMM0;
+  X64Reg result_xmm = XMM1;
+  X64Reg Rc_duplicated = XMM2;
+
   RCOpArg Ra;
   RCOpArg Rb;
   RCOpArg Rc;
   RCX64Reg Rd;
-  RCX64Reg scratch_guard;
+  RCX64Reg xmm2_guard;
+  RCX64Reg result_xmm_guard;
   RCX64Reg Rc_duplicated_guard;
-  X64Reg Rc_duplicated = XMM2;
   if (software_fma)
   {
-    scratch_guard = fpr.Scratch(XMM2);
+    xmm2_guard = fpr.Scratch(XMM2);
     Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
     Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
     Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
     Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
+    if (preserve_d && packed)
+    {
+      result_xmm_guard = fpr.Scratch();
+      RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard);
+      result_xmm = Gen::X64Reg(result_xmm_guard);
+    }
+    else
+    {
+      RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard);
+      result_xmm = packed ? Gen::X64Reg(Rd) : XMM0;
+    }
   }
   else
   {
@@ -391,8 +407,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     }
   }
 
-  X64Reg scratch_xmm = XMM0;
-  X64Reg result_xmm = XMM1;
   if (software_fma)
   {
     for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
@@ -411,10 +425,11 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
           Force25BitPrecision(XMM1, R(XMM1), XMM2);
       }
 
-      // Write the result from the previous loop iteration into Rd so we don't lose it.
-      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      // Write the result from the previous loop iteration into result_xmm so we don't lose it.
+      // It's important that this is done after reading Rc above, in case we have madds1 and
+      // result_xmm == Rd == Rc.
       if (packed && i == 0)
-        MOVLHPS(Rd, XMM0);
+        MOVLHPS(result_xmm, XMM0);
 
       if (i == 0)
       {
@@ -437,14 +452,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     }
 
     if (packed)
-    {
-      MOVSD(Rd, XMM0);
-      result_xmm = Rd;
-    }
+      MOVSD(R(result_xmm), XMM0);
     else
-    {
-      result_xmm = XMM0;
-    }
+      DEBUG_ASSERT(result_xmm == XMM0);
 
     if (madds_accurate_nans)
     {
@@ -530,6 +540,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     // HandleNaNs needs to clobber XMM0
     MOVAPD(Rd, R(result_xmm));
     result_xmm = Rd;
+    DEBUG_ASSERT(!preserve_d);
   }
 
   // If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.