From ee21cbe2d181b8ea2565dfde0f882cbbe3df02d9 Mon Sep 17 00:00:00 2001
From: Tillmann Karras <tilkax@gmail.com>
Date: Wed, 12 Feb 2014 23:26:15 +0100
Subject: [PATCH] Add phire's more accurate DoubleToSingle version

This method doesn't involve messing around with the quirks of the x87
FPU and should be reasonably fast. As a bonus, it does the correct thing
for out-of-range doubles.

However, it is also a little slower and only benefits programs that rely
on undefined behavior so it is disabled for now.
---
 .../PowerPC/Interpreter/Interpreter_FPUtils.h |   6 +-
 .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp  | 163 ++++++++++++++----
 2 files changed, 134 insertions(+), 35 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index d4c66324537c..4063c19d3036 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -234,9 +234,9 @@ inline u32 ConvertToSingleFTZ(u64 x)
 
 inline u64 ConvertToDouble(u32 _x)
 {
-	// This is a little-endian re-implementation of the algrothm described in
-	// the Power PC Programming Enviroments Manual for Loading single
-	// percision floating point numbers.
+	// This is a little-endian re-implementation of the algorithm described in
+	// the PowerPC Programming Environments Manual for loading single
+	// precision floating point numbers.
 	// See page 566 of http://www.freescale.com/files/product/doc/MPCFPE32B.pdf
 
 	u64 x = _x;
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 814dbd3cf172..a5a022be6cba 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -438,48 +438,110 @@ static const __uint128_t GC_ALIGNED16(double_qnan_bit) = 0x0008000000000000;
 static const __uint128_t GC_ALIGNED16(double_exponent) = 0x7ff0000000000000;
 #endif
 
-// Since the following two functions are used in non-arithmetic PPC float instructions,
+// Since the following float conversion functions are used in non-arithmetic PPC float instructions,
 // they must convert floats bitexact and never flush denormals to zero or turn SNaNs into QNaNs.
 // This means we can't use CVTSS2SD/CVTSD2SS :(
 // The x87 FPU doesn't even support flush-to-zero so we can use FLD+FSTP even on denormals.
 // If the number is a NaN, make sure to set the QNaN bit back to its original value.
 
-void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
+// Another problem is that officially, converting doubles that are out of range for the single format results in undefined behavior.
+// Relying on undefined behavior is a bug so no software should ever do this.
+// In case it does happen, phire's more accurate implementation of ConvertDoubleToSingle() is reproduced below.
+
+//#define MORE_ACCURATE_DOUBLETOSINGLE
+#ifdef MORE_ACCURATE_DOUBLETOSINGLE
+
+#ifdef _WIN32
+#ifdef _M_X64
+static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi64x(0, 0x000fffffffffffff);
+static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi64x(0, 0x8000000000000000);
+static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi64x(0, 0x0010000000000000);
+static const __m128i GC_ALIGNED16(double_top_two_bits) = _mm_set_epi64x(0, 0xc000000000000000);
+static const __m128i GC_ALIGNED16(double_bottom_bits)  = _mm_set_epi64x(0, 0x07ffffffe0000000);
+#else
+static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi32(0, 0, 0x000fffff, 0xffffffff);
+static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi32(0, 0, 0x80000000, 0x00000000);
+static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi32(0, 0, 0x00100000, 0x00000000);
+static const __m128i GC_ALIGNED16(double_top_two_bits) = _mm_set_epi32(0, 0, 0xc0000000, 0x00000000);
+static const __m128i GC_ALIGNED16(double_bottom_bits)  = _mm_set_epi32(0, 0, 0x07ffffff, 0xe0000000);
+#endif
+#else
+static const __uint128_t GC_ALIGNED16(double_fraction) = 0x000fffffffffffff;
+static const __uint128_t GC_ALIGNED16(double_sign_bit) = 0x8000000000000000;
+static const __uint128_t GC_ALIGNED16(double_explicit_top_bit) = 0x0010000000000000;
+static const __uint128_t GC_ALIGNED16(double_top_two_bits) = 0xc000000000000000;
+static const __uint128_t GC_ALIGNED16(double_bottom_bits)  = 0x07ffffffe0000000;
+#endif
+
+// This is the same algorithm used in the interpreter (and actual hardware)
+// The documentation states that the conversion of a double with an exponent outside the
+// valid range for a single (or a single denormal) is undefined.
+// But testing on actual hardware shows it always picks bits 0..1 and 5..34
+// unless the exponent is in the range of 874 to 896.
+void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
 {
-	if (src_is_gpr) {
-		MOV(32, M(&temp32), R(src));
-		MOVD_xmm(XMM1, R(src));
-	} else {
-		MOVSS(M(&temp32), src);
-		MOVSS(R(XMM1), src);
-	}
-	FLD(32, M(&temp32));
-	CCFlags cond;
-	if (cpu_info.bSSE4_1) {
-		PTEST(XMM1, M((void *)&single_exponent));
-		cond = CC_NC;
-	} else {
-		FNSTSW_AX();
-		TEST(16, R(AX), Imm16(x87_InvalidOperation));
-		cond = CC_Z;
-	}
-	FSTP(64, M(&temp64));
-	MOVSD(dst, M(&temp64));
-	FixupBranch dont_reset_qnan_bit = J_CC(cond);
+	MOVSD(XMM1, R(src));
 
-	PANDN(XMM1, M((void *)&single_qnan_bit));
-	PSLLQ(XMM1, 29);
-	if (cpu_info.bAVX) {
-		VPANDN(dst, XMM1, R(dst));
-	} else {
-		PANDN(XMM1, R(dst));
-		MOVSD(dst, R(XMM1));
-	}
+	// Grab Exponent
+	PAND(XMM1, M((void *)&double_exponent));
+	PSRLQ(XMM1, 52);
+	MOVD_xmm(R(EAX), XMM1);
 
-	SetJumpTarget(dont_reset_qnan_bit);
-	MOVDDUP(dst, R(dst));
+
+	// Check if the double is in the range of valid single subnormal
+	CMP(16, R(EAX), Imm16(896));
+	FixupBranch NoDenormalize = J_CC(CC_G);
+	CMP(16, R(EAX), Imm16(874));
+	FixupBranch NoDenormalize2 = J_CC(CC_L);
+
+	// Denormalise
+
+	// shift = (905 - Exponent) plus the 21 bit double to single shift
+	MOV(16, R(EAX), Imm16(905 + 21));
+	MOVD_xmm(XMM0, R(EAX));
+	PSUBQ(XMM0, R(XMM1));
+
+	// xmm1 = fraction | 0x0010000000000000
+	MOVSD(XMM1, R(src));
+	PAND(XMM1, M((void *)&double_fraction));
+	POR(XMM1, M((void *)&double_explicit_top_bit));
+
+	// fraction >> shift
+	PSRLQ(XMM1, R(XMM0));
+
+	// OR the sign bit in.
+	MOVSD(XMM0, R(src));
+	PAND(XMM0, M((void *)&double_sign_bit));
+	PSRLQ(XMM0, 32);
+	POR(XMM1, R(XMM0));
+
+	FixupBranch end = J(false); // Goto end
+
+	SetJumpTarget(NoDenormalize);
+	SetJumpTarget(NoDenormalize2);
+
+	// Don't Denormalize
+
+	// We want bits 0, 1
+	MOVSD(XMM1, R(src));
+	PAND(XMM1, M((void *)&double_top_two_bits));
+	PSRLQ(XMM1, 32);
+
+	// And 5 through to 34
+	MOVSD(XMM0, R(src));
+	PAND(XMM0, M((void *)&double_bottom_bits));
+	PSRLQ(XMM0, 29);
+
+	// OR them together
+	POR(XMM1, R(XMM0));
+
+	// End
+	SetJumpTarget(end);
+	MOVDDUP(dst, R(XMM1));
 }
 
+#else // MORE_ACCURATE_DOUBLETOSINGLE
+
 void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
 {
 	MOVSD(M(&temp64), src);
@@ -510,6 +572,43 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
 	SetJumpTarget(dont_reset_qnan_bit);
 	MOVDDUP(dst, R(XMM0));
 }
+#endif // MORE_ACCURATE_DOUBLETOSINGLE
+
+void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
+{
+	if (src_is_gpr) {
+		MOV(32, M(&temp32), R(src));
+		MOVD_xmm(XMM1, R(src));
+	} else {
+		MOVSS(M(&temp32), src);
+		MOVSS(R(XMM1), src);
+	}
+	FLD(32, M(&temp32));
+	CCFlags cond;
+	if (cpu_info.bSSE4_1) {
+		PTEST(XMM1, M((void *)&single_exponent));
+		cond = CC_NC;
+	} else {
+		FNSTSW_AX();
+		TEST(16, R(AX), Imm16(x87_InvalidOperation));
+		cond = CC_Z;
+	}
+	FSTP(64, M(&temp64));
+	MOVSD(dst, M(&temp64));
+	FixupBranch dont_reset_qnan_bit = J_CC(cond);
+
+	PANDN(XMM1, M((void *)&single_qnan_bit));
+	PSLLQ(XMM1, 29);
+	if (cpu_info.bAVX) {
+		VPANDN(dst, XMM1, R(dst));
+	} else {
+		PANDN(XMM1, R(dst));
+		MOVSD(dst, R(XMM1));
+	}
+
+	SetJumpTarget(dont_reset_qnan_bit);
+	MOVDDUP(dst, R(dst));
+}
 
 void EmuCodeBlock::JitClearCA()
 {