1010
1111using namespace Gen ;
1212
// 128-bit constants (two 64-bit elements each) that the emitted SSE code reads as memory operands.
//   psSignBits      - sign bit set in element 0 only
//   psSignBits2     - sign bit set in both elements (used instead of psSignBits when 'packed')
//   psAbsMask       - clears only the sign bit of element 0 when ANDed
//   psAbsMask2      - clears the sign bit of both elements when ANDed
//   psGeneratedQNaN - added by this hunk; see the note on its line
13- static const u64 GC_ALIGNED16 (psSignBits[2 ]) = {0x8000000000000000ULL , 0x0000000000000000ULL };
14- static const u64 GC_ALIGNED16 (psSignBits2[2 ]) = {0x8000000000000000ULL , 0x8000000000000000ULL };
15- static const u64 GC_ALIGNED16 (psAbsMask[2 ]) = {0x7FFFFFFFFFFFFFFFULL , 0xFFFFFFFFFFFFFFFFULL };
16- static const u64 GC_ALIGNED16 (psAbsMask2[2 ]) = {0x7FFFFFFFFFFFFFFFULL , 0x7FFFFFFFFFFFFFFFULL };
13+ static const u64 GC_ALIGNED16 (psSignBits[2 ]) = {0x8000000000000000ULL , 0x0000000000000000ULL };
14+ static const u64 GC_ALIGNED16 (psSignBits2[2 ]) = {0x8000000000000000ULL , 0x8000000000000000ULL };
15+ static const u64 GC_ALIGNED16 (psAbsMask[2 ]) = {0x7FFFFFFFFFFFFFFFULL , 0xFFFFFFFFFFFFFFFFULL };
16+ static const u64 GC_ALIGNED16 (psAbsMask2[2 ]) = {0x7FFFFFFFFFFFFFFFULL , 0x7FFFFFFFFFFFFFFFULL };
17+ static const u64 GC_ALIGNED16 (psGeneratedQNaN[2 ]) = {0x7FF8000000000000ULL , 0x7FF8000000000000ULL }; // sign bit clear, exponent all ones, top mantissa bit set: a positive quiet NaN in both elements
1718static const double GC_ALIGNED16 (half_qnan_and_s32_max[2 ]) = {0x7FFFFFFF , -0x80000 };
1819
// Emits a floating-point operation (avxOp / sseOp) on guest FPRs a and b for destination FPR d.
//
// This hunk changes the function (lines marked '+') so that it returns the host register
// holding the raw result instead of finishing the instruction itself. The removed ('-') tail
// that called ForceSinglePrecision, SetFPRFIfNeeded and fpr.UnlockAll is gone, so the register
// cache is still locked when this returns and the caller has to do that work.
//
//   preserve_inputs - new parameter; when true the result is produced in XMM1 rather than in
//                     d's host register, so d's register (which may also be a or b) is not
//                     written by the code emitted here.
//   roundRHS        - when true, b goes through Force25BitPrecision before sseOp is applied.
//   returns         - XMM1 if preserve_inputs is set, otherwise fpr.RX(d).
19- void Jit64::fp_tri_op (int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
20- void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool roundRHS)
20+ X64Reg Jit64::fp_tri_op (int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
21+ void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool preserve_inputs , bool roundRHS)
2122{
2223 fpr.Lock (d, a, b);
2324 fpr.BindToRegister (d, d == a || d == b || !single);
25+ X64Reg dest = preserve_inputs ? XMM1 : fpr.RX (d); // where the result will be produced
2426 if (roundRHS)
2527 {
26- if (d == a)
28+ if (d == a && !preserve_inputs ) // in-place form is only usable when d's register may be overwritten
2729 {
2830 Force25BitPrecision (XMM0, fpr.R (b), XMM1);
2931 (this ->*sseOp)(fpr.RX (d), R (XMM0));
3032 }
3133 else
3234 {
33- Force25BitPrecision (fpr. RX (d) , fpr.R (b), XMM0);
34- (this ->*sseOp)(fpr. RX (d) , fpr.R (a));
35+ Force25BitPrecision (dest , fpr.R (b), XMM0);
36+ (this ->*sseOp)(dest , fpr.R (a));
3537 }
3638 }
3739 else
3840 {
39- avx_op (avxOp, sseOp, fpr. RX (d) , fpr.R (a), fpr.R (b), packed, reversible);
41+ avx_op (avxOp, sseOp, dest , fpr.R (a), fpr.R (b), packed, reversible);
4042 }
41- if (single)
42- ForceSinglePrecision (fpr.RX (d), fpr.R (d), packed, true );
43- SetFPRFIfNeeded (fpr.RX (d));
44- fpr.UnlockAll ();
43+ return dest; // registers stay locked; the caller finishes the instruction and unlocks
4544}
4645
4746// We can avoid calculating FPRF if it's not needed; every float operation resets it, so
@@ -56,6 +55,112 @@ void Jit64::SetFPRFIfNeeded(X64Reg xmm)
5655 SetFPRF (xmm);
5756}
5857
// Emits code that rewrites NaN results so they follow the PowerPC rules summarised in the
// table inside the function, and leaves the final value in xmm_out.
//
//   inst    - instruction being compiled; FA/FB/FC give the candidate input FPRs and
//             OPCD == 4 selects the paired-single (two-lane) handling.
//   xmm_out - host register that receives the final value.
//   xmm     - host register holding the freshly computed result. Must not be XMM0, and it is
//             overwritten in place on the NaN path.
//
// With bAccurateNaNs disabled this only emits a register move (nothing if xmm_out == xmm).
// Otherwise the main path gets a NaN test plus a conditional branch, and the fix-up sequence
// is emitted between SwitchToFarCode() and SwitchToNearCode().
58+ void Jit64::HandleNaNs (UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm)
59+ {
60+ // | PowerPC | x86
61+ // ---------------------+----------+---------
62+ // input NaN precedence | 1*3 + 2 | 1*2 + 3
63+ // generated QNaN | positive | negative
64+ //
65+ // Dragon Ball: Revenge of King Piccolo requires generated NaNs
66+ // to be positive, so we'll have to handle them manually.
67+
68+ if (!SConfig::GetInstance ().m_LocalCoreStartupParameter .bAccurateNaNs )
69+ {
70+ if (xmm_out != xmm)
71+ MOVAPD (xmm_out, R (xmm));
72+ return ;
73+ }
74+
75+ _assert_ (xmm != XMM0); // XMM0 is used as scratch / blend mask by the paired-single paths below
76+
77+ std::vector<u32 > inputs; // distinct input FPR numbers, in a, b, c order
78+ u32 a = inst.FA , b = inst.FB , c = inst.FC ;
79+ for (u32 i : {a, b, c})
80+ {
81+ if (!js.op ->fregsIn [i]) // presumably: this FPR is not actually read by the instruction
82+ continue ;
83+ if (std::find (inputs.begin (), inputs.end (), i) == inputs.end ()) // skip duplicates
84+ inputs.push_back (i);
85+ }
86+ if (inst.OPCD != 4 )
87+ {
88+ // not paired-single
89+ UCOMISD (xmm, R (xmm)); // comparing a value with itself is unordered (parity flag set) only for NaN
90+ FixupBranch handle_nan = J_CC (CC_P, true );
91+ SwitchToFarCode ();
92+ SetJumpTarget (handle_nan);
93+ std::vector<FixupBranch> fixups;
94+ for (u32 x : inputs)
95+ {
96+ MOVDDUP (xmm, fpr.R (x)); // try each input in order, duplicated into both halves of xmm
97+ UCOMISD (xmm, R (xmm));
98+ fixups.push_back (J_CC (CC_P)); // this input is NaN: keep it as the result and skip the rest
99+ }
100+ MOVDDUP (xmm, M (psGeneratedQNaN)); // no input was NaN, so the NaN was generated: use the positive QNaN
101+ for (FixupBranch fixup : fixups)
102+ SetJumpTarget (fixup);
103+ FixupBranch done = J (true );
104+ SwitchToNearCode ();
105+ SetJumpTarget (done);
106+ }
107+ else
108+ {
109+ // paired-single
110+ std::reverse (inputs.begin (), inputs.end ()); // blends below are applied in sequence, so the first input must go last to win
111+ if (cpu_info.bSSE4_1 )
112+ {
113+ avx_op (&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, R (xmm), R (xmm), CMP_UNORD); // XMM0 = all-ones in each lane of the result that is NaN
114+ PTEST (XMM0, R (XMM0)); // ZF is cleared when any mask bit is set, i.e. some lane is NaN
115+ FixupBranch handle_nan = J_CC (CC_NZ, true );
116+ SwitchToFarCode ();
117+ SetJumpTarget (handle_nan);
118+ BLENDVPD (xmm, M (psGeneratedQNaN)); // XMM0 is the implicit mask: NaN lanes become the positive QNaN
119+ for (u32 x : inputs)
120+ {
121+ avx_op (&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, fpr.R (x), fpr.R (x), CMP_UNORD); // XMM0 = lanes where input x is NaN
122+ BLENDVPD (xmm, fpr.R (x)); // those lanes take input x's NaN
123+ }
124+ FixupBranch done = J (true );
125+ SwitchToNearCode ();
126+ SetJumpTarget (done);
127+ }
128+ else
129+ {
130+ // SSE2 fallback
131+ X64Reg tmp = fpr.GetFreeXReg (); // extra scratch register; the blends are done by hand with AND/ANDN/OR
132+ fpr.FlushLockX (tmp);
133+ MOVAPD (XMM0, R (xmm));
134+ CMPPD (XMM0, R (XMM0), CMP_UNORD); // XMM0 = all-ones in each lane of the result that is NaN
135+ MOVMSKPD (RSCRATCH, R (XMM0)); // gather the per-lane sign bits of the mask into RSCRATCH
136+ TEST (32 , R (RSCRATCH), R (RSCRATCH)); // non-zero means at least one lane is NaN
137+ FixupBranch handle_nan = J_CC (CC_NZ, true );
138+ SwitchToFarCode ();
139+ SetJumpTarget (handle_nan);
140+ MOVAPD (tmp, R (XMM0)); // tmp = NaN-lane mask
141+ PANDN (XMM0, R (xmm)); // XMM0 = ~mask & xmm: the non-NaN lanes of the result
142+ PAND (tmp, M (psGeneratedQNaN)); // tmp = mask & QNaN: positive QNaN in the NaN lanes
143+ POR (tmp, R (XMM0)); // merge the two halves
144+ MOVAPD (xmm, R (tmp)); // xmm = result with NaN lanes replaced by the generated QNaN
145+ for (u32 x : inputs)
146+ {
147+ MOVAPD (XMM0, fpr.R (x));
148+ CMPPD (XMM0, R (XMM0), CMP_ORD); // XMM0 = all-ones in lanes where input x is NOT NaN
149+ MOVAPD (tmp, R (XMM0));
150+ PANDN (XMM0, fpr.R (x)); // XMM0 = input x restricted to its NaN lanes
151+ PAND (xmm, R (tmp)); // keep xmm only where input x is not NaN
152+ POR (xmm, R (XMM0)); // lanes where input x is NaN now hold input x
153+ }
154+ FixupBranch done = J (true );
155+ SwitchToNearCode ();
156+ SetJumpTarget (done);
157+ fpr.UnlockX (tmp); // presumably releases the scratch register in the register cache
158+ }
159+ }
160+ if (xmm_out != xmm)
161+ MOVAPD (xmm_out, R (xmm));
162+ }
163+
59164void Jit64::fp_arith (UGeckoInstruction inst)
60165{
61166 INSTRUCTION_START
@@ -80,20 +185,27 @@ void Jit64::fp_arith(UGeckoInstruction inst)
80185 packed = false ;
81186
82187 bool round_input = single && !jit->js .op ->fprIsSingle [inst.FC ];
188+ bool preserve_inputs = SConfig::GetInstance ().m_LocalCoreStartupParameter .bAccurateNaNs ;
83189
190+ X64Reg dest = INVALID_REG;
84191 switch (inst.SUBOP5 )
85192 {
86- case 18 : fp_tri_op (d, a, b, false , single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
87- packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed); break ;
88- case 20 : fp_tri_op (d, a, b, false , single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
89- packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed); break ;
90- case 21 : fp_tri_op (d, a, b, true , single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
91- packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed); break ;
92- case 25 : fp_tri_op (d, a, c, true , single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
93- packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, round_input); break ;
193+ case 18 : dest = fp_tri_op (d, a, b, false , single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
194+ packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed, preserve_inputs ); break ;
195+ case 20 : dest = fp_tri_op (d, a, b, false , single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
196+ packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed, preserve_inputs ); break ;
197+ case 21 : dest = fp_tri_op (d, a, b, true , single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
198+ packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed, preserve_inputs ); break ;
199+ case 25 : dest = fp_tri_op (d, a, c, true , single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
200+ packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, preserve_inputs , round_input); break ;
94201 default :
95202 _assert_msg_ (DYNA_REC, 0 , " fp_arith WTF!!!" );
96203 }
204+ HandleNaNs (inst, fpr.RX (d), dest);
205+ if (single)
206+ ForceSinglePrecision (fpr.RX (d), fpr.R (d), packed, true );
207+ SetFPRFIfNeeded (fpr.RX (d));
208+ fpr.UnlockAll ();
97209}
98210
99211void Jit64::fmaddXX (UGeckoInstruction inst)
@@ -220,13 +332,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
220332 if (inst.SUBOP5 == 31 ) // nmadd
221333 PXOR (XMM1, M (packed ? psSignBits2 : psSignBits));
222334 }
223-
224335 fpr.BindToRegister (d, !single);
225-
226336 if (single)
227- ForceSinglePrecision (fpr.RX (d), R (XMM1), packed, true );
337+ {
338+ HandleNaNs (inst, fpr.RX (d), XMM1);
339+ ForceSinglePrecision (fpr.RX (d), fpr.R (d), packed, true );
340+ }
228341 else
342+ {
343+ HandleNaNs (inst, XMM1, XMM1);
229344 MOVSD (fpr.RX (d), R (XMM1));
345+ }
230346 SetFPRFIfNeeded (fpr.RX (d));
231347 fpr.UnlockAll ();
232348}
@@ -379,7 +495,6 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
379495 }
380496 else
381497 {
382- // Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception?
383498 UCOMISD (fpr.RX (b), fpr.R (a));
384499 }
385500
0 commit comments