Skip to content

Commit

Permalink
Jit64: fix NaN handling in FMA
Browse files Browse the repository at this point in the history
Fixes issue 8364.
  • Loading branch information
Tilka committed May 9, 2015
1 parent d2ee065 commit 345e1e3
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 122 deletions.
1 change: 0 additions & 1 deletion Source/Core/Core/PowerPC/Jit64/Jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@ class Jit64 : public Jitx86Base
void ps_sign(UGeckoInstruction inst); //aggregate
void ps_arith(UGeckoInstruction inst); //aggregate
void ps_mergeXX(UGeckoInstruction inst);
void ps_maddXX(UGeckoInstruction inst);
void ps_res(UGeckoInstruction inst);
void ps_rsqrte(UGeckoInstruction inst);
void ps_sum(UGeckoInstruction inst);
Expand Down
12 changes: 6 additions & 6 deletions Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,19 +122,19 @@ static GekkoOPTemplate table4_2[] =
{11, &Jit64::ps_sum}, //"ps_sum1", OPTYPE_PS, 0}},
{12, &Jit64::ps_muls}, //"ps_muls0", OPTYPE_PS, 0}},
{13, &Jit64::ps_muls}, //"ps_muls1", OPTYPE_PS, 0}},
{14, &Jit64::ps_maddXX}, //"ps_madds0", OPTYPE_PS, 0}},
{15, &Jit64::ps_maddXX}, //"ps_madds1", OPTYPE_PS, 0}},
{14, &Jit64::fmaddXX}, //"ps_madds0", OPTYPE_PS, 0}},
{15, &Jit64::fmaddXX}, //"ps_madds1", OPTYPE_PS, 0}},
{18, &Jit64::ps_arith}, //"ps_div", OPTYPE_PS, 0, 16}},
{20, &Jit64::ps_arith}, //"ps_sub", OPTYPE_PS, 0}},
{21, &Jit64::ps_arith}, //"ps_add", OPTYPE_PS, 0}},
{23, &Jit64::ps_sel}, //"ps_sel", OPTYPE_PS, 0}},
{24, &Jit64::ps_res}, //"ps_res", OPTYPE_PS, 0}},
{25, &Jit64::ps_arith}, //"ps_mul", OPTYPE_PS, 0}},
{26, &Jit64::ps_rsqrte}, //"ps_rsqrte", OPTYPE_PS, 0, 1}},
{28, &Jit64::ps_maddXX}, //"ps_msub", OPTYPE_PS, 0}},
{29, &Jit64::ps_maddXX}, //"ps_madd", OPTYPE_PS, 0}},
{30, &Jit64::ps_maddXX}, //"ps_nmsub", OPTYPE_PS, 0}},
{31, &Jit64::ps_maddXX}, //"ps_nmadd", OPTYPE_PS, 0}},
{28, &Jit64::fmaddXX}, //"ps_msub", OPTYPE_PS, 0}},
{29, &Jit64::fmaddXX}, //"ps_madd", OPTYPE_PS, 0}},
{30, &Jit64::fmaddXX}, //"ps_nmsub", OPTYPE_PS, 0}},
{31, &Jit64::fmaddXX}, //"ps_nmadd", OPTYPE_PS, 0}},
};


Expand Down
131 changes: 110 additions & 21 deletions Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@

using namespace Gen;

static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
static const u64 GC_ALIGNED16(psGeneratedQNaN[2]) = {0x7FF8000000000000ULL, 0x7FF8000000000000ULL};
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};

void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
Expand Down Expand Up @@ -112,13 +113,40 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
bool single = inst.OPCD == 59;
bool single = inst.OPCD == 4 || inst.OPCD == 59;
bool round_input = single && !jit->js.op->fprIsSingle[c];
bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c];
if (cpu_info.bAtom)
packed = false;
bool packed = inst.OPCD == 4 ||
(!cpu_info.bAtom && single &&
jit->js.op->fprIsDuplicated[a] &&
jit->js.op->fprIsDuplicated[b] &&
jit->js.op->fprIsDuplicated[c]);

fpr.Lock(a, b, c, d);
X64Reg tmp = fpr.GetFreeXReg();
fpr.LockX(tmp);

switch(inst.SUBOP5)
{
case 14:
MOVDDUP(XMM0, fpr.R(c));
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
break;
case 15:
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1);
break;
default:
bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::g_want_determinism);
X64Reg tmp1 = special ? XMM1 : XMM0;
X64Reg tmp2 = special ? XMM0 : XMM1;
if (single && round_input)
Force25BitPrecision(tmp1, fpr.R(c), tmp2);
else
MOVAPD(tmp1, fpr.R(c));
break;
}

// While we don't know if any games are actually affected (replays seem to work with all the usual
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
Expand All @@ -128,10 +156,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// instances on different computers giving identical results.
if (cpu_info.bFMA && !Core::g_want_determinism)
{
if (single && round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVAPD(XMM0, fpr.R(c));
// Statistics suggests b is a lot less likely to be unbound in practice, so
// if we have to pick one of a or b to bind, let's make it b.
fpr.BindToRegister(b, true, false);
Expand All @@ -143,6 +167,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
else
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
break;
case 14: //madds0
case 15: //madds1
case 29: //madd
if (packed)
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
Expand All @@ -169,11 +195,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
}
else if (inst.SUBOP5 == 30) //nmsub
{
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
if (single && round_input)
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
else
MOVAPD(XMM1, fpr.R(c));
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.
MOVAPD(XMM0, fpr.R(b));
if (packed)
{
Expand All @@ -188,16 +210,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
}
else
{
if (single && round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVAPD(XMM0, fpr.R(c));
if (packed)
{
MULPD(XMM0, fpr.R(a));
if (inst.SUBOP5 == 28) //msub
SUBPD(XMM0, fpr.R(b));
else //(n)madd
else //(n)madd(s[01])
ADDPD(XMM0, fpr.R(b));
}
else
Expand All @@ -212,6 +230,76 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
PXOR(XMM0, M(packed ? psSignBits2 : psSignBits));
}

// | PowerPC | x86
// ---------------------+----------+---------
// input NaN precedence | 1*3 + 2 | 1*2 + 3
// generated QNaN | positive | negative
//
// Dragon Ball: Revenge of King Piccolo requires correct NaNs,
// so we'll have to handle them manually.
if (inst.OPCD == 4)
{
// paired-single
if (cpu_info.bSSE4_1 && cpu_info.bAVX)
{
VCMPPD(XMM1, XMM0, R(XMM0), CMP_UNORD);
PTEST(XMM1, R(XMM1));
FixupBranch handle_nan = J_CC(CC_NZ, true);
SwitchToFarCode();
SetJumpTarget(handle_nan);

MOVAPD(XMM1, fpr.R(a));
CMPPD(XMM1, R(XMM1), CMP_UNORD);
VBLENDVPD(XMM0, XMM0, fpr.R(a), XMM1);

MOVAPD(tmp, fpr.R(b));
CMPPD(tmp, R(tmp), CMP_UNORD);
PANDN(XMM1, R(tmp));
VBLENDVPD(XMM0, XMM0, fpr.R(b), XMM1);

MOVAPD(tmp, fpr.R(c));
CMPPD(tmp, R(tmp), CMP_UNORD);
PANDN(XMM1, R(tmp));
VBLENDVPD(XMM0, XMM0, fpr.R(c), XMM1);

PCMPEQW(tmp, R(tmp));
PANDN(XMM1, R(tmp)); // XMM1 = ~XMM1
VBLENDVPD(XMM0, XMM0, M(psGeneratedQNaN), XMM1);

FixupBranch done = J(true);
SwitchToNearCode();
SetJumpTarget(done);
}
else
{
// TODO
}
}
else
{
// not paired-single
UCOMISD(XMM0, R(XMM0));
FixupBranch handle_nan = J_CC(CC_P, true);
SwitchToFarCode();
SetJumpTarget(handle_nan);
MOVDDUP(XMM0, fpr.R(a));
UCOMISD(XMM0, R(XMM0));
FixupBranch nan_a = J_CC(CC_P);
MOVDDUP(XMM0, fpr.R(b));
UCOMISD(XMM0, R(XMM0));
FixupBranch nan_b = J_CC(CC_P);
MOVDDUP(XMM0, fpr.R(c));
UCOMISD(XMM0, R(XMM0));
FixupBranch nan_c = J_CC(CC_P);
MOVDDUP(XMM0, M(psGeneratedQNaN));
SetJumpTarget(nan_a);
SetJumpTarget(nan_b);
SetJumpTarget(nan_c);
FixupBranch done = J(true);
SwitchToNearCode();
SetJumpTarget(done);
}

fpr.BindToRegister(d, !single);

if (single)
Expand All @@ -232,6 +320,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
}
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
fpr.UnlockAllX();
}

void Jit64::fsign(UGeckoInstruction inst)
Expand Down
94 changes: 0 additions & 94 deletions Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,100 +297,6 @@ void Jit64::ps_res(UGeckoInstruction inst)
gpr.UnlockAllX();
}

// Emits host code for the paired-single multiply-add family, selected by SUBOP5:
//   14 ps_madds0, 15 ps_madds1, 28 ps_msub, 29 ps_madd, 30 ps_nmsub, 31 ps_nmadd.
// Operand C is staged in XMM0 (XMM1 is handed to Force25BitPrecision as scratch),
// combined there with A and B, and the result is stored to D through
// ForceSinglePrecisionP. Instructions with Rc set fall back (FALLBACK_IF).
//TODO: add optimized cases
void Jit64::ps_maddXX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITPairedOff);
	FALLBACK_IF(inst.Rc);

	const int reg_a = inst.FA;
	const int reg_b = inst.FB;
	const int reg_c = inst.FC;
	const int reg_d = inst.FD;
	const int subop = inst.SUBOP5;

	// The fused path is only taken when the CPU reports FMA and determinism was not requested.
	const bool use_fma = cpu_info.bFMA && !Core::g_want_determinism;
	// C is passed through Force25BitPrecision unless it is flagged as already single.
	const bool needs_rounding = !jit->js.op->fprIsSingle[reg_c];

	fpr.Lock(reg_a, reg_b, reg_c, reg_d);

	// The FMA forms below take B as a host register (fpr.RX), so bind it up front.
	if (use_fma)
		fpr.BindToRegister(reg_b, true, false);

	// Stage operand C in XMM0.
	switch (subop)
	{
	case 14: //madds0
		// MOVDDUP copies the low double of C into both lanes.
		MOVDDUP(XMM0, fpr.R(reg_c));
		if (needs_rounding)
			Force25BitPrecision(XMM0, R(XMM0), XMM1);
		break;
	case 15: //madds1
		// Shuffle immediate 3 picks the high double of C for both lanes.
		avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(reg_c), fpr.R(reg_c), 3);
		if (needs_rounding)
			Force25BitPrecision(XMM0, R(XMM0), XMM1);
		break;
	default:
		if (needs_rounding)
			Force25BitPrecision(XMM0, fpr.R(reg_c), XMM1);
		else
			MOVAPD(XMM0, fpr.R(reg_c));
		break;
	}

	if (use_fma)
	{
		// One fused instruction per variant; any other SUBOP5 value emits nothing here.
		if (subop == 14 || subop == 15 || subop == 29) //madds0, madds1, madd
			VFMADD132PD(XMM0, fpr.RX(reg_b), fpr.R(reg_a));
		else if (subop == 28) //msub
			VFMSUB132PD(XMM0, fpr.RX(reg_b), fpr.R(reg_a));
		else if (subop == 30) //nmsub
			VFNMADD132PD(XMM0, fpr.RX(reg_b), fpr.R(reg_a));
		else if (subop == 31) //nmadd
			VFNMSUB132PD(XMM0, fpr.RX(reg_b), fpr.R(reg_a));
	}
	else
	{
		const bool is_known_op = subop == 14 || subop == 15 || (subop >= 28 && subop <= 31);
		if (!is_known_op)
		{
			_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
			return;
		}

		// Unfused sequence: multiply by A, add or subtract B, then XOR with psSignBits
		// for the negated variants.
		const bool subtract_b = subop == 28 || subop == 30; //msub, nmsub
		const bool negate = subop == 30 || subop == 31;     //nmsub, nmadd

		MULPD(XMM0, fpr.R(reg_a));
		if (subtract_b)
			SUBPD(XMM0, fpr.R(reg_b));
		else
			ADDPD(XMM0, fpr.R(reg_b));
		if (negate)
			PXOR(XMM0, M(psSignBits));
	}

	// Write the result from XMM0 into D.
	fpr.BindToRegister(reg_d, false);
	ForceSinglePrecisionP(fpr.RX(reg_d), XMM0);
	SetFPRFIfNeeded(inst, fpr.RX(reg_d));
	fpr.UnlockAll();
}

void Jit64::ps_cmpXX(UGeckoInstruction inst)
{
INSTRUCTION_START
Expand Down

0 comments on commit 345e1e3

Please sign in to comment.