From 761070dc5794e16f680ad454d15138c3693acaa3 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 8 Oct 2022 19:05:51 +0200 Subject: [PATCH] Jit64: Preserve inputs when software_fma && m_accurate_nans When writing the software FMA code, I didn't realize that we can't overwrite d if d is the same register as one of the inputs and HandleNaNs is going to be called. This fixes that. --- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 47 ++++++++++++------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index fc27ad7bf1c9..78a702686870 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -107,8 +107,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: if (!m_accurate_nans) return; - ASSERT(xmm != clobber); - if (inst.OPCD != 4) { // not paired-single @@ -148,6 +146,8 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: { // paired-single + ASSERT(xmm != clobber); + if (cpu_info.bSSE4_1) { avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD); @@ -325,7 +325,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) FALLBACK_IF(jo.fp_exceptions); // We would like to emulate FMA instructions accurately without rounding error if possible, but - // unfortunately emulating FMA in software is just too slow on CPUs that are too old to have FMA + // unfortunately, emulating FMA in software is just too slow on CPUs that are too old to have FMA // instructions, so we have the Config::SESSION_USE_FMA setting to determine whether we should // emulate FMA instructions accurately or by a performing a multiply followed by a separate add. 
// @@ -346,23 +346,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst) int d = inst.FD; bool single = inst.OPCD == 4 || inst.OPCD == 59; bool round_input = single && !js.op->fprIsSingle[c]; + bool preserve_inputs = m_accurate_nans; + bool preserve_d = preserve_inputs && (a == d || b == d || c == d); bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] && js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]); + X64Reg scratch_xmm = XMM0; + X64Reg result_xmm = XMM1; + RCOpArg Ra; RCOpArg Rb; RCOpArg Rc; RCX64Reg Rd; - RCX64Reg scratch_guard; + RCX64Reg xmm2_guard; + RCX64Reg result_xmm_guard; if (software_fma) { - scratch_guard = fpr.Scratch(XMM2); + xmm2_guard = fpr.Scratch(XMM2); Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read); Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read); Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite); - RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard); + if (preserve_d && packed) + { + result_xmm_guard = fpr.Scratch(); + RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard); + result_xmm = Gen::X64Reg(result_xmm_guard); + } + else + { + RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard); + result_xmm = packed ? Gen::X64Reg(Rd) : XMM0; + } } else { @@ -381,8 +397,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst) const bool madds0 = inst.SUBOP5 == 14; const bool madds1 = inst.SUBOP5 == 15; - X64Reg scratch_xmm = XMM0; - X64Reg result_xmm = XMM1; if (software_fma) { for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i) @@ -401,10 +415,11 @@ void Jit64::fmaddXX(UGeckoInstruction inst) Force25BitPrecision(XMM1, R(XMM1), XMM2); } - // Write the result from the previous loop iteration into Rd so we don't lose it. - // It's important that this is done after reading Rc above, in case we have madds1 and c == d. 
+ // Write the result from the previous loop iteration into result_xmm so we don't lose it. + // It's important that this is done after reading Rc above, in case we have madds1 and + // result_xmm == Rd == Rc. if (packed && i == 0) - MOVLHPS(Rd, XMM0); + MOVLHPS(result_xmm, XMM0); if (i == 0) { @@ -427,14 +442,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst) } if (packed) - { - MOVSD(Rd, XMM0); - result_xmm = Rd; - } + MOVSD(R(result_xmm), XMM0); else - { - result_xmm = XMM0; - } + ASSERT(result_xmm == XMM0); } else { @@ -510,6 +520,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) result_xmm = Rd; } + // If packed, the clobber register must be XMM0. If not packed, the clobber register is unused. HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, Rc); if (single)