Skip to content

Commit aec3846

Browse files
committed
Jit64: optionally accurate NaNs
When AccurateNaNs is enabled, NaNs are handled accurately: results are checked for NaN, and a NaN result is replaced either with the correct input NaN or, for a generated NaN, with +QNaN instead of x86's -QNaN.
1 parent 881f6db commit aec3846

File tree

7 files changed

+183
-42
lines changed

7 files changed

+183
-42
lines changed

Source/Core/Core/BootManager.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ namespace BootManager
4747
// Apply fire liberally
4848
struct ConfigCache
4949
{
50-
bool valid, bCPUThread, bSkipIdle, bSyncGPUOnSkipIdleHack, bFPRF, bMMU, bDCBZOFF, m_EnableJIT, bDSPThread,
50+
bool valid, bCPUThread, bSkipIdle, bSyncGPUOnSkipIdleHack, bFPRF, bAccurateNaNs, bMMU, bDCBZOFF, m_EnableJIT, bDSPThread,
5151
bSyncGPU, bFastDiscSpeed, bDSPHLE, bHLE_BS2, bProgressive;
5252
int iCPUCore, Volume;
5353
int iWiimoteSource[MAX_BBMOTES];
@@ -106,6 +106,7 @@ bool BootCore(const std::string& _rFilename)
106106
config_cache.bSyncGPUOnSkipIdleHack = StartUp.bSyncGPUOnSkipIdleHack;
107107
config_cache.iCPUCore = StartUp.iCPUCore;
108108
config_cache.bFPRF = StartUp.bFPRF;
109+
config_cache.bAccurateNaNs = StartUp.bAccurateNaNs;
109110
config_cache.bMMU = StartUp.bMMU;
110111
config_cache.bDCBZOFF = StartUp.bDCBZOFF;
111112
config_cache.bSyncGPU = StartUp.bSyncGPU;
@@ -146,6 +147,7 @@ bool BootCore(const std::string& _rFilename)
146147
core_section->Get("SkipIdle", &StartUp.bSkipIdle, StartUp.bSkipIdle);
147148
core_section->Get("SyncOnSkipIdle", &StartUp.bSyncGPUOnSkipIdleHack, StartUp.bSyncGPUOnSkipIdleHack);
148149
core_section->Get("FPRF", &StartUp.bFPRF, StartUp.bFPRF);
150+
core_section->Get("AccurateNaNs", &StartUp.bAccurateNaNs, StartUp.bAccurateNaNs);
149151
core_section->Get("MMU", &StartUp.bMMU, StartUp.bMMU);
150152
core_section->Get("DCBZ", &StartUp.bDCBZOFF, StartUp.bDCBZOFF);
151153
core_section->Get("SyncGPU", &StartUp.bSyncGPU, StartUp.bSyncGPU);
@@ -273,6 +275,7 @@ void Stop()
273275
StartUp.bSyncGPUOnSkipIdleHack = config_cache.bSyncGPUOnSkipIdleHack;
274276
StartUp.iCPUCore = config_cache.iCPUCore;
275277
StartUp.bFPRF = config_cache.bFPRF;
278+
StartUp.bAccurateNaNs = config_cache.bAccurateNaNs;
276279
StartUp.bMMU = config_cache.bMMU;
277280
StartUp.bDCBZOFF = config_cache.bDCBZOFF;
278281
StartUp.bSyncGPU = config_cache.bSyncGPU;

Source/Core/Core/CoreParameter.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ SCoreStartupParameter::SCoreStartupParameter()
3333
bJITPairedOff(false), bJITSystemRegistersOff(false),
3434
bJITBranchOff(false),
3535
bJITILTimeProfiling(false), bJITILOutputIR(false),
36-
bFPRF(false),
36+
bFPRF(false), bAccurateNaNs(false),
3737
bCPUThread(true), bDSPThread(false), bDSPHLE(true),
3838
bSkipIdle(true), bSyncGPUOnSkipIdleHack(true), bNTSC(false), bForceNTSCJ(false),
3939
bHLE_BS2(true), bEnableCheats(false),
@@ -78,6 +78,7 @@ void SCoreStartupParameter::LoadDefaults()
7878
bDSPHLE = true;
7979
bFastmem = true;
8080
bFPRF = false;
81+
bAccurateNaNs = false;
8182
bMMU = false;
8283
bDCBZOFF = false;
8384
iBBDumpPort = -1;

Source/Core/Core/CoreParameter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ struct SCoreStartupParameter
163163

164164
bool bFastmem;
165165
bool bFPRF;
166+
bool bAccurateNaNs;
166167

167168
bool bCPUThread;
168169
bool bDSPThread;

Source/Core/Core/PowerPC/Jit64/Jit.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,13 +135,18 @@ class Jit64 : public Jitx86Base
135135
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
136136
void SetFPRFIfNeeded(Gen::X64Reg xmm);
137137

138+
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in);
139+
138140
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
139141

140142
typedef u32 (*Operation)(u32 a, u32 b);
141-
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
142-
bool Rc = false, bool carry = false);
143-
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
144-
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), bool packed = false, bool roundRHS = false);
143+
void regimmop(int d, int a, bool binary, u32 value, Operation doop,
144+
void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
145+
bool Rc = false, bool carry = false);
146+
Gen::X64Reg fp_tri_op(int d, int a, int b, bool reversible, bool single,
147+
void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
148+
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&),
149+
bool packed, bool preserve_inputs, bool roundRHS = false);
145150
void FloatCompare(UGeckoInstruction inst, bool upper = false);
146151

147152
// OPCODES

Source/Core/Core/PowerPC/Jit64/JitRegCache.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,20 @@ class RegCache
138138
LockX(args...);
139139
}
140140

141+
template<typename T>
142+
void UnlockX(T x)
143+
{
144+
if (!xregs[x].locked)
145+
PanicAlert("RegCache: x %i already unlocked!", x);
146+
xregs[x].locked = false;
147+
}
148+
template<typename T, typename... Args>
149+
void UnlockX(T first, Args... args)
150+
{
151+
UnlockX(first);
152+
UnlockX(args...);
153+
}
154+
141155
void UnlockAll();
142156
void UnlockAllX();
143157

Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp

Lines changed: 141 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,38 +10,37 @@
1010

1111
using namespace Gen;
1212

13-
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
14-
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
15-
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
16-
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
13+
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
14+
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
15+
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
16+
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
17+
static const u64 GC_ALIGNED16(psGeneratedQNaN[2]) = {0x7FF8000000000000ULL, 0x7FF8000000000000ULL};
1718
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
1819

19-
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
20-
void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool roundRHS)
20+
X64Reg Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
21+
void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool preserve_inputs, bool roundRHS)
2122
{
2223
fpr.Lock(d, a, b);
2324
fpr.BindToRegister(d, d == a || d == b || !single);
25+
X64Reg dest = preserve_inputs ? XMM1 : fpr.RX(d);
2426
if (roundRHS)
2527
{
26-
if (d == a)
28+
if (d == a && !preserve_inputs)
2729
{
2830
Force25BitPrecision(XMM0, fpr.R(b), XMM1);
2931
(this->*sseOp)(fpr.RX(d), R(XMM0));
3032
}
3133
else
3234
{
33-
Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
34-
(this->*sseOp)(fpr.RX(d), fpr.R(a));
35+
Force25BitPrecision(dest, fpr.R(b), XMM0);
36+
(this->*sseOp)(dest, fpr.R(a));
3537
}
3638
}
3739
else
3840
{
39-
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
41+
avx_op(avxOp, sseOp, dest, fpr.R(a), fpr.R(b), packed, reversible);
4042
}
41-
if (single)
42-
ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
43-
SetFPRFIfNeeded(fpr.RX(d));
44-
fpr.UnlockAll();
43+
return dest;
4544
}
4645

4746
// We can avoid calculating FPRF if it's not needed; every float operation resets it, so
@@ -56,6 +55,112 @@ void Jit64::SetFPRFIfNeeded(X64Reg xmm)
5655
SetFPRF(xmm);
5756
}
5857

58+
void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm)
59+
{
60+
// | PowerPC | x86
61+
// ---------------------+----------+---------
62+
// input NaN precedence | 1*3 + 2 | 1*2 + 3
63+
// generated QNaN | positive | negative
64+
//
65+
// Dragon Ball: Revenge of King Piccolo requires generated NaNs
66+
// to be positive, so we'll have to handle them manually.
67+
68+
if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bAccurateNaNs)
69+
{
70+
if (xmm_out != xmm)
71+
MOVAPD(xmm_out, R(xmm));
72+
return;
73+
}
74+
75+
_assert_(xmm != XMM0);
76+
77+
std::vector<u32> inputs;
78+
u32 a = inst.FA, b = inst.FB, c = inst.FC;
79+
for (u32 i : {a, b, c})
80+
{
81+
if (!js.op->fregsIn[i])
82+
continue;
83+
if (std::find(inputs.begin(), inputs.end(), i) == inputs.end())
84+
inputs.push_back(i);
85+
}
86+
if (inst.OPCD != 4)
87+
{
88+
// not paired-single
89+
UCOMISD(xmm, R(xmm));
90+
FixupBranch handle_nan = J_CC(CC_P, true);
91+
SwitchToFarCode();
92+
SetJumpTarget(handle_nan);
93+
std::vector<FixupBranch> fixups;
94+
for (u32 x : inputs)
95+
{
96+
MOVDDUP(xmm, fpr.R(x));
97+
UCOMISD(xmm, R(xmm));
98+
fixups.push_back(J_CC(CC_P));
99+
}
100+
MOVDDUP(xmm, M(psGeneratedQNaN));
101+
for (FixupBranch fixup : fixups)
102+
SetJumpTarget(fixup);
103+
FixupBranch done = J(true);
104+
SwitchToNearCode();
105+
SetJumpTarget(done);
106+
}
107+
else
108+
{
109+
// paired-single
110+
std::reverse(inputs.begin(), inputs.end());
111+
if (cpu_info.bSSE4_1)
112+
{
113+
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, R(xmm), R(xmm), CMP_UNORD);
114+
PTEST(XMM0, R(XMM0));
115+
FixupBranch handle_nan = J_CC(CC_NZ, true);
116+
SwitchToFarCode();
117+
SetJumpTarget(handle_nan);
118+
BLENDVPD(xmm, M(psGeneratedQNaN));
119+
for (u32 x : inputs)
120+
{
121+
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, fpr.R(x), fpr.R(x), CMP_UNORD);
122+
BLENDVPD(xmm, fpr.R(x));
123+
}
124+
FixupBranch done = J(true);
125+
SwitchToNearCode();
126+
SetJumpTarget(done);
127+
}
128+
else
129+
{
130+
// SSE2 fallback
131+
X64Reg tmp = fpr.GetFreeXReg();
132+
fpr.FlushLockX(tmp);
133+
MOVAPD(XMM0, R(xmm));
134+
CMPPD(XMM0, R(XMM0), CMP_UNORD);
135+
MOVMSKPD(RSCRATCH, R(XMM0));
136+
TEST(32, R(RSCRATCH), R(RSCRATCH));
137+
FixupBranch handle_nan = J_CC(CC_NZ, true);
138+
SwitchToFarCode();
139+
SetJumpTarget(handle_nan);
140+
MOVAPD(tmp, R(XMM0));
141+
PANDN(XMM0, R(xmm));
142+
PAND(tmp, M(psGeneratedQNaN));
143+
POR(tmp, R(XMM0));
144+
MOVAPD(xmm, R(tmp));
145+
for (u32 x : inputs)
146+
{
147+
MOVAPD(XMM0, fpr.R(x));
148+
CMPPD(XMM0, R(XMM0), CMP_ORD);
149+
MOVAPD(tmp, R(XMM0));
150+
PANDN(XMM0, fpr.R(x));
151+
PAND(xmm, R(tmp));
152+
POR(xmm, R(XMM0));
153+
}
154+
FixupBranch done = J(true);
155+
SwitchToNearCode();
156+
SetJumpTarget(done);
157+
fpr.UnlockX(tmp);
158+
}
159+
}
160+
if (xmm_out != xmm)
161+
MOVAPD(xmm_out, R(xmm));
162+
}
163+
59164
void Jit64::fp_arith(UGeckoInstruction inst)
60165
{
61166
INSTRUCTION_START
@@ -80,20 +185,27 @@ void Jit64::fp_arith(UGeckoInstruction inst)
80185
packed = false;
81186

82187
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
188+
bool preserve_inputs = SConfig::GetInstance().m_LocalCoreStartupParameter.bAccurateNaNs;
83189

190+
X64Reg dest = INVALID_REG;
84191
switch (inst.SUBOP5)
85192
{
86-
case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
87-
packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed); break;
88-
case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
89-
packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed); break;
90-
case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
91-
packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed); break;
92-
case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
93-
packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, round_input); break;
193+
case 18: dest = fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
194+
packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed, preserve_inputs); break;
195+
case 20: dest = fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
196+
packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed, preserve_inputs); break;
197+
case 21: dest = fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
198+
packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed, preserve_inputs); break;
199+
case 25: dest = fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
200+
packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, preserve_inputs, round_input); break;
94201
default:
95202
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
96203
}
204+
HandleNaNs(inst, fpr.RX(d), dest);
205+
if (single)
206+
ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
207+
SetFPRFIfNeeded(fpr.RX(d));
208+
fpr.UnlockAll();
97209
}
98210

99211
void Jit64::fmaddXX(UGeckoInstruction inst)
@@ -220,13 +332,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
220332
if (inst.SUBOP5 == 31) //nmadd
221333
PXOR(XMM1, M(packed ? psSignBits2 : psSignBits));
222334
}
223-
224335
fpr.BindToRegister(d, !single);
225-
226336
if (single)
227-
ForceSinglePrecision(fpr.RX(d), R(XMM1), packed, true);
337+
{
338+
HandleNaNs(inst, fpr.RX(d), XMM1);
339+
ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
340+
}
228341
else
342+
{
343+
HandleNaNs(inst, XMM1, XMM1);
229344
MOVSD(fpr.RX(d), R(XMM1));
345+
}
230346
SetFPRFIfNeeded(fpr.RX(d));
231347
fpr.UnlockAll();
232348
}
@@ -379,7 +495,6 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
379495
}
380496
else
381497
{
382-
// Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception?
383498
UCOMISD(fpr.RX(b), fpr.R(a));
384499
}
385500

Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ void Jit64::ps_sum(UGeckoInstruction inst)
3838
fpr.Lock(a, b, c, d);
3939
OpArg op_a = fpr.R(a);
4040
fpr.BindToRegister(d, d == b || d == c);
41-
X64Reg tmp = XMM0;
41+
X64Reg tmp = XMM1;
4242
MOVDDUP(tmp, op_a); // {a.ps0, a.ps0}
4343
ADDPD(tmp, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
4444
switch (inst.SUBOP5)
@@ -55,9 +55,9 @@ void Jit64::ps_sum(UGeckoInstruction inst)
5555
}
5656
else
5757
{
58-
MOVAPD(XMM1, fpr.R(c));
59-
SHUFPD(XMM1, R(tmp), 2);
60-
tmp = XMM1;
58+
MOVAPD(XMM0, fpr.R(c));
59+
SHUFPD(XMM0, R(tmp), 2);
60+
tmp = XMM0;
6161
}
6262
}
6363
else
@@ -68,7 +68,8 @@ void Jit64::ps_sum(UGeckoInstruction inst)
6868
default:
6969
PanicAlert("ps_sum WTF!!!");
7070
}
71-
ForceSinglePrecision(fpr.RX(d), R(tmp));
71+
HandleNaNs(inst, fpr.RX(d), tmp);
72+
ForceSinglePrecision(fpr.RX(d), fpr.R(d));
7273
SetFPRFIfNeeded(fpr.RX(d));
7374
fpr.UnlockAll();
7475
}
@@ -88,19 +89,20 @@ void Jit64::ps_muls(UGeckoInstruction inst)
8889
switch (inst.SUBOP5)
8990
{
9091
case 12: // ps_muls0
91-
MOVDDUP(XMM0, fpr.R(c));
92+
MOVDDUP(XMM1, fpr.R(c));
9293
break;
9394
case 13: // ps_muls1
94-
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
95+
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, fpr.R(c), fpr.R(c), 3);
9596
break;
9697
default:
9798
PanicAlert("ps_muls WTF!!!");
9899
}
99100
if (round_input)
100-
Force25BitPrecision(XMM0, R(XMM0), XMM1);
101-
MULPD(XMM0, fpr.R(a));
101+
Force25BitPrecision(XMM1, R(XMM1), XMM0);
102+
MULPD(XMM1, fpr.R(a));
102103
fpr.BindToRegister(d, false);
103-
ForceSinglePrecision(fpr.RX(d), R(XMM0));
104+
HandleNaNs(inst, fpr.RX(d), XMM1);
105+
ForceSinglePrecision(fpr.RX(d), fpr.R(d));
104106
SetFPRFIfNeeded(fpr.RX(d));
105107
fpr.UnlockAll();
106108
}

0 commit comments

Comments
 (0)