Skip to content

Commit

Permalink
Add phire's more accurate DoubleToSingle version
Browse files Browse the repository at this point in the history
This method doesn't involve messing around with the quirks of the x87
FPU and should be reasonably fast. As a bonus, it does the correct thing
for out-of-range doubles.

However, it is also a little slower and only benefits programs that rely
on undefined behavior so it is disabled for now.
  • Loading branch information
Tilka committed Feb 23, 2014
1 parent 7062cf8 commit ee21cbe
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 35 deletions.
6 changes: 3 additions & 3 deletions Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ inline u32 ConvertToSingleFTZ(u64 x)

inline u64 ConvertToDouble(u32 _x)
{
// This is a little-endian re-implementation of the algrothm described in
// the Power PC Programming Enviroments Manual for Loading single
// percision floating point numbers.
// This is a little-endian re-implementation of the algorithm described in
// the PowerPC Programming Environments Manual for loading single
// precision floating point numbers.
// See page 566 of http://www.freescale.com/files/product/doc/MPCFPE32B.pdf

u64 x = _x;
Expand Down
163 changes: 131 additions & 32 deletions Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -438,48 +438,110 @@ static const __uint128_t GC_ALIGNED16(double_qnan_bit) = 0x0008000000000000;
static const __uint128_t GC_ALIGNED16(double_exponent) = 0x7ff0000000000000;
#endif

// Since the following two functions are used in non-arithmetic PPC float instructions,
// Since the following float conversion functions are used in non-arithmetic PPC float instructions,
// they must convert floats bit-exactly and never flush denormals to zero or turn SNaNs into QNaNs.
// This means we can't use CVTSS2SD/CVTSD2SS :(
// The x87 FPU doesn't even support flush-to-zero so we can use FLD+FSTP even on denormals.
// If the number is a NaN, make sure to set the QNaN bit back to its original value.

void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
// Another problem is that officially, converting doubles to single format results in undefined behavior.
// Relying on undefined behavior is a bug so no software should ever do this.
// In case it does happen, phire's more accurate implementation of ConvertDoubleToSingle() is reproduced below.

//#define MORE_ACCURATE_DOUBLETOSINGLE
#ifdef MORE_ACCURATE_DOUBLETOSINGLE

// Bit masks over the raw 64-bit encoding of a double (bit 0 = least significant).
// Each constant is a 16-byte value whose upper 64 bits are zero:
//   double_fraction         - bits 0..51
//   double_sign_bit         - bit 63
//   double_explicit_top_bit - bit 52
//   double_top_two_bits     - bits 62..63
//   double_bottom_bits      - bits 29..58
// The three variants below encode the same values; only the way the constant
// is spelled differs per compiler/target (one 64-bit literal, two 32-bit
// halves given high-then-low, or a plain integer literal).
// GC_ALIGNED16 is defined elsewhere; presumably it requests 16-byte alignment
// so the constants can be used as SSE memory operands - TODO confirm.
#ifdef _WIN32
#ifdef _M_X64
static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi64x(0, 0x000fffffffffffff);
static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi64x(0, 0x8000000000000000);
static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi64x(0, 0x0010000000000000);
static const __m128i GC_ALIGNED16(double_top_two_bits) = _mm_set_epi64x(0, 0xc000000000000000);
static const __m128i GC_ALIGNED16(double_bottom_bits) = _mm_set_epi64x(0, 0x07ffffffe0000000);
#else
// 32-bit build: the arguments are (e3, e2, e1, e0), so the last two are the
// high and low halves of the low 64 bits.
static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi32(0, 0, 0x000fffff, 0xffffffff);
static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi32(0, 0, 0x80000000, 0x00000000);
static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi32(0, 0, 0x00100000, 0x00000000);
static const __m128i GC_ALIGNED16(double_top_two_bits) = _mm_set_epi32(0, 0, 0xc0000000, 0x00000000);
static const __m128i GC_ALIGNED16(double_bottom_bits) = _mm_set_epi32(0, 0, 0x07ffffff, 0xe0000000);
#endif
#else
static const __uint128_t GC_ALIGNED16(double_fraction) = 0x000fffffffffffff;
static const __uint128_t GC_ALIGNED16(double_sign_bit) = 0x8000000000000000;
static const __uint128_t GC_ALIGNED16(double_explicit_top_bit) = 0x0010000000000000;
static const __uint128_t GC_ALIGNED16(double_top_two_bits) = 0xc000000000000000;
static const __uint128_t GC_ALIGNED16(double_bottom_bits) = 0x07ffffffe0000000;
#endif

// This is the same algorithm used in the interpreter (and actual hardware)
// The documentation states that the conversion of a double with an exponent outside the
// valid range for a single (or a single denormal) is undefined.
// But testing on actual hardware shows it always picks bits 0..1 and 5..34
// unless the exponent is in the range of 874 to 896.
// Emits code that builds a single-precision bit pattern from the double in
// `src` using integer bit operations only, and leaves the result in `dst`
// (duplicated into both 64-bit halves by the final MOVDDUP).
//
// Two paths are emitted, selected by the double's biased exponent:
//   * 874 <= exponent <= 896: the fraction (with bit 52 ORed in) is shifted
//     right by (905 + 21 - exponent) and the sign bit is moved to bit 31.
//   * otherwise: the double's bits 62..63 are moved to bits 30..31 and its
//     bits 29..58 are moved to bits 0..29.
// XMM0, XMM1 and EAX are written by the emitted code.
//
// NOTE(review): in this rendering, the statements that mention `src_is_gpr`,
// `temp32`, `single_exponent`, `single_qnan_bit` and `dont_reset_qnan_bit`
// look like removed-side diff lines belonging to ConvertSingleToDouble -
// `src_is_gpr` is not a parameter of this function. Confirm against the
// repository before relying on this listing.
void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
{
// NOTE(review): from here to the J_CC below, see the note above the function.
if (src_is_gpr) {
MOV(32, M(&temp32), R(src));
MOVD_xmm(XMM1, R(src));
} else {
MOVSS(M(&temp32), src);
MOVSS(R(XMM1), src);
}
FLD(32, M(&temp32));
CCFlags cond;
if (cpu_info.bSSE4_1) {
PTEST(XMM1, M((void *)&single_exponent));
cond = CC_NC;
} else {
FNSTSW_AX();
TEST(16, R(AX), Imm16(x87_InvalidOperation));
cond = CC_Z;
}
FSTP(64, M(&temp64));
MOVSD(dst, M(&temp64));
FixupBranch dont_reset_qnan_bit = J_CC(cond);
// Work on a copy of the source double in XMM1.
MOVSD(XMM1, R(src));

// NOTE(review): the PANDN/PSLLQ/VPANDN group below is covered by the same note.
PANDN(XMM1, M((void *)&single_qnan_bit));
PSLLQ(XMM1, 29);
if (cpu_info.bAVX) {
VPANDN(dst, XMM1, R(dst));
} else {
PANDN(XMM1, R(dst));
MOVSD(dst, R(XMM1));
}
// Grab Exponent
// Mask with 0x7ff0000000000000 and shift down by 52, so EAX (and the low
// quadword of XMM1) hold the biased exponent, a value in 0..0x7ff.
PAND(XMM1, M((void *)&double_exponent));
PSRLQ(XMM1, 52);
MOVD_xmm(R(EAX), XMM1);

// NOTE(review): the next two statements are covered by the same note.
SetJumpTarget(dont_reset_qnan_bit);
MOVDDUP(dst, R(dst));

// Check if the double is in the range of valid single subnormal
// (exponent > 896 or exponent < 874 skips the denormalize path).
CMP(16, R(EAX), Imm16(896));
FixupBranch NoDenormalize = J_CC(CC_G);
CMP(16, R(EAX), Imm16(874));
FixupBranch NoDenormalize2 = J_CC(CC_L);

// Denormalize

// shift = (905 - Exponent) plus the 21 bit double to single shift
// XMM1 still holds the exponent here, so XMM0 = (905 + 21) - exponent.
MOV(16, R(EAX), Imm16(905 + 21));
MOVD_xmm(XMM0, R(EAX));
PSUBQ(XMM0, R(XMM1));

// xmm1 = fraction | 0x0010000000000000
MOVSD(XMM1, R(src));
PAND(XMM1, M((void *)&double_fraction));
POR(XMM1, M((void *)&double_explicit_top_bit));

// fraction >> shift
PSRLQ(XMM1, R(XMM0));

// OR the sign bit in.
// Bit 63 of the double shifted right by 32 lands on bit 31.
MOVSD(XMM0, R(src));
PAND(XMM0, M((void *)&double_sign_bit));
PSRLQ(XMM0, 32);
POR(XMM1, R(XMM0));

FixupBranch end = J(false); // Goto end

SetJumpTarget(NoDenormalize);
SetJumpTarget(NoDenormalize2);

// Don't Denormalize

// We want bits 0, 1
// (counted from the most significant end: mask 0xc000000000000000,
// shifted right by 32 so they land on bits 31 and 30 of the result).
MOVSD(XMM1, R(src));
PAND(XMM1, M((void *)&double_top_two_bits));
PSRLQ(XMM1, 32);

// And 5 through to 34
// (mask 0x07ffffffe0000000, shifted right by 29 into the low 30 bits).
MOVSD(XMM0, R(src));
PAND(XMM0, M((void *)&double_bottom_bits));
PSRLQ(XMM0, 29);

// OR them together
POR(XMM1, R(XMM0));

// End
// Both paths leave the result in XMM1; copy it into both halves of dst.
SetJumpTarget(end);
MOVDDUP(dst, R(XMM1));
}

#else // MORE_ACCURATE_DOUBLETOSINGLE

void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
{
MOVSD(M(&temp64), src);
Expand Down Expand Up @@ -510,6 +572,43 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
SetJumpTarget(dont_reset_qnan_bit);
MOVDDUP(dst, R(XMM0));
}
#endif // MORE_ACCURATE_DOUBLETOSINGLE

// Emits code that widens the single-precision value in `src` to a double in
// `dst` (duplicated into both 64-bit halves by the final MOVDDUP).
//
// The widening goes through the x87 FPU (FLD 32-bit, FSTP 64-bit) via the
// memory temporaries temp32/temp64. After the x87 round trip, a fix-up step
// restores the original state of the QNaN bit; it is skipped when the check
// below decides it is not needed.
//
//   dst        - XMM register receiving the double.
//   src        - source register; a general-purpose register when src_is_gpr
//                is true, otherwise an XMM register.
//   src_is_gpr - selects how `src` is read (MOV/MOVD vs. MOVSS).
//
// XMM1 is written by the emitted code; AX is written too when SSE4.1 is not
// available. The order of the emitted instructions matters: the flags
// produced by PTEST/TEST are consumed by the J_CC further down.
void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
{
// Store the raw 32 bits to temp32 for the x87 load, and keep a copy in XMM1
// for the checks and fix-up below.
if (src_is_gpr) {
MOV(32, M(&temp32), R(src));
MOVD_xmm(XMM1, R(src));
} else {
MOVSS(M(&temp32), src);
MOVSS(R(XMM1), src);
}
FLD(32, M(&temp32));
// Decide whether the QNaN fix-up can be skipped; `cond` is the condition
// under which it is skipped.
CCFlags cond;
if (cpu_info.bSSE4_1) {
// PTEST sets CF only when every bit of the mask operand is also set in
// XMM1, so CC_NC means "not all of those bits are set". single_exponent
// is defined outside this view - presumably the single-precision exponent
// mask, making this a NaN/infinity test; TODO confirm.
PTEST(XMM1, M((void *)&single_exponent));
cond = CC_NC;
} else {
// Without SSE4.1: read the x87 status word and skip the fix-up when the
// x87_InvalidOperation bit is clear after the FLD.
FNSTSW_AX();
TEST(16, R(AX), Imm16(x87_InvalidOperation));
cond = CC_Z;
}
// Finish the conversion. This relies on FSTP and MOVSD leaving EFLAGS
// untouched, because the branch below still needs the flags set above.
FSTP(64, M(&temp64));
MOVSD(dst, M(&temp64));
FixupBranch dont_reset_qnan_bit = J_CC(cond);

// XMM1 = ~original & single_qnan_bit: non-zero only when the bit selected by
// single_qnan_bit was clear in the source. The shift by 29 moves it to the
// matching position in the double layout (the fraction grows from 23 to
// 52 bits).
PANDN(XMM1, M((void *)&single_qnan_bit));
PSLLQ(XMM1, 29);
// dst = ~XMM1 & dst: clear that bit in the result when the source had it clear.
if (cpu_info.bAVX) {
VPANDN(dst, XMM1, R(dst));
} else {
PANDN(XMM1, R(dst));
MOVSD(dst, R(XMM1));
}

SetJumpTarget(dont_reset_qnan_bit);
// Copy the low double into both halves of dst.
MOVDDUP(dst, R(dst));
}

void EmuCodeBlock::JitClearCA()
{
Expand Down

0 comments on commit ee21cbe

Please sign in to comment.