From 761070dc5794e16f680ad454d15138c3693acaa3 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sat, 8 Oct 2022 19:05:51 +0200
Subject: [PATCH] Jit64: Preserve inputs when software_fma && m_accurate_nans

When writing the software FMA code, I didn't realize that we can't
overwrite d if d is the same register as one of the inputs and
HandleNaNs is going to be called. This fixes that by building the
packed result in a separate scratch register in that case.
---
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp  | 47 ++++++++++++-------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index fc27ad7bf1c9..78a702686870 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -107,8 +107,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
   if (!m_accurate_nans)
     return;
 
-  ASSERT(xmm != clobber);
-
   if (inst.OPCD != 4)
   {
     // not paired-single
@@ -148,6 +146,8 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
   {
     // paired-single
 
+    ASSERT(xmm != clobber);
+
     if (cpu_info.bSSE4_1)
     {
       avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD);
@@ -325,7 +325,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   FALLBACK_IF(jo.fp_exceptions);
 
   // We would like to emulate FMA instructions accurately without rounding error if possible, but
-  // unfortunately emulating FMA in software is just too slow on CPUs that are too old to have FMA
+  // unfortunately, emulating FMA in software is just too slow on CPUs that are too old to have FMA
   // instructions, so we have the Config::SESSION_USE_FMA setting to determine whether we should
   // emulate FMA instructions accurately or by a performing a multiply followed by a separate add.
   //
@@ -346,23 +346,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   int d = inst.FD;
   bool single = inst.OPCD == 4 || inst.OPCD == 59;
   bool round_input = single && !js.op->fprIsSingle[c];
+  bool preserve_inputs = m_accurate_nans;
+  bool preserve_d = preserve_inputs && (a == d || b == d || c == d);
   bool packed =
       inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
                          js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
 
+  X64Reg scratch_xmm = XMM0;
+  X64Reg result_xmm = XMM1;
+
   RCOpArg Ra;
   RCOpArg Rb;
   RCOpArg Rc;
   RCX64Reg Rd;
-  RCX64Reg scratch_guard;
+  RCX64Reg xmm2_guard;
+  RCX64Reg result_xmm_guard;
   if (software_fma)
   {
-    scratch_guard = fpr.Scratch(XMM2);
+    xmm2_guard = fpr.Scratch(XMM2);
     Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
     Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
     Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
     Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
+    if (preserve_d && packed)
+    {
+      result_xmm_guard = fpr.Scratch();
+      RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard);
+      result_xmm = Gen::X64Reg(result_xmm_guard);
+    }
+    else
+    {
+      RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard);
+      result_xmm = packed ? Gen::X64Reg(Rd) : XMM0;
+    }
   }
   else
   {
@@ -381,8 +397,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   const bool madds0 = inst.SUBOP5 == 14;
   const bool madds1 = inst.SUBOP5 == 15;
 
-  X64Reg scratch_xmm = XMM0;
-  X64Reg result_xmm = XMM1;
   if (software_fma)
   {
     for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
@@ -401,10 +415,11 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
           Force25BitPrecision(XMM1, R(XMM1), XMM2);
       }
 
-      // Write the result from the previous loop iteration into Rd so we don't lose it.
-      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      // Write the result from the previous loop iteration into result_xmm so we don't lose it.
+      // It's important that this is done after reading Rc above, in case we have madds1 and
+      // result_xmm == Rd == Rc.
       if (packed && i == 0)
-        MOVLHPS(Rd, XMM0);
+        MOVLHPS(result_xmm, XMM0);
 
       if (i == 0)
       {
@@ -427,14 +442,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     }
 
     if (packed)
-    {
-      MOVSD(Rd, XMM0);
-      result_xmm = Rd;
-    }
+      MOVSD(R(result_xmm), XMM0);
     else
-    {
-      result_xmm = XMM0;
-    }
+      ASSERT(result_xmm == XMM0);
   }
   else
   {
@@ -510,6 +520,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     result_xmm = Rd;
   }
 
+  // If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.
   HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, Rc);
 
   if (single)