Skip to content

Commit

Permalink
Merge pull request #8120 from MerryMage/cdts
Browse files Browse the repository at this point in the history
Jit64: Make DoubleToSingle a common asm routine
  • Loading branch information
Tilka committed Jan 25, 2020
2 parents e3a7922 + f6afce7 commit 709862b
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 83 deletions.
2 changes: 2 additions & 0 deletions Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
Expand Up @@ -238,6 +238,8 @@ void Jit64AsmRoutineManager::GenerateCommon()
GenFres();
mfcr = AlignCode4();
GenMfcr();
cdts = AlignCode4();
GenConvertDoubleToSingle();

GenQuantizedLoads();
GenQuantizedSingleLoads();
Expand Down
5 changes: 3 additions & 2 deletions Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
Expand Up @@ -110,14 +110,15 @@ void Jit64::stfXXX(UGeckoInstruction inst)
RCOpArg Rs = fpr.Use(s, RCMode::Read);
RegCache::Realize(Rs);
CVTSD2SS(XMM0, Rs);
MOVD_xmm(R(RSCRATCH), XMM0);
}
else
{
RCX64Reg Rs = fpr.Bind(s, RCMode::Read);
RegCache::Realize(Rs);
ConvertDoubleToSingle(XMM0, Rs);
MOVAPD(XMM0, Rs);
CALL(asm_routines.cdts);
}
MOVD_xmm(R(RSCRATCH), XMM0);
}
else
{
Expand Down
80 changes: 0 additions & 80 deletions Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
Expand Up @@ -868,89 +868,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
}
}

// Since the following float conversion functions are used in non-arithmetic PPC float
// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
// flush-to-zero so we can use FLD+FSTP even on denormals.
// If the number is a NaN, make sure to set the QNaN bit back to its original value.

// Another problem is that officially, converting doubles to single format results in undefined
// behavior. Relying on undefined behavior is a bug so no software should ever do this.
// Super Mario 64 (on Wii VC) accidentally relies on this behavior. See issue #11173

// Bit masks for picking apart an IEEE-754 double held in the low 64-bit lane of an XMM register.
// _mm_set_epi64x takes the high lane first, so all masks except double_qnan_bit have a zero
// high lane.

// Exponent field: bits 52..62.
alignas(16) static const __m128i double_exponent = _mm_set_epi64x(0, 0x7ff0000000000000);
// Fraction (mantissa) field: bits 0..51.
alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
// Sign: bit 63.
alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
// Bit 52, i.e. the position just above the fraction field (the implicit leading one made explicit).
alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
// Bits 62..63 (PowerPC big-endian bit numbers 0..1).
alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
// Bits 29..58 (PowerPC big-endian bit numbers 5..34).
alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);
// Every bit set except bit 51 of the low lane (the topmost fraction bit).
// NOTE(review): not referenced by the double->single code visible here; presumably used to
// restore the quiet-NaN bit elsewhere - confirm against the users of this constant.
alignas(16) static const __m128i double_qnan_bit = _mm_set_epi64x(0xffffffffffffffff,
0xfff7ffffffffffff);

// This is the same algorithm used in the interpreter (and actual hardware)
// The documentation states that the conversion of a double with an exponent outside the
// valid range for a single (or a single denormal) is undefined.
// But testing on actual hardware shows it always picks bits 0..1 and 5..34
// unless the exponent is in the range of 874 to 896.
// Emits inline code that converts the double in the low lane of `src` into single-precision
// bits using integer/bitwise operations only (no CVTSD2SS), and stores the result in `dst`.
// Temporaries used by the emitted code: RSCRATCH, XMM0 and XMM1.
void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
{
MOVAPD(XMM1, R(src));

// Grab Exponent
PAND(XMM1, MConst(double_exponent));
PSRLQ(XMM1, 52);
MOVD_xmm(R(RSCRATCH), XMM1);

// Check if the double is in the range of valid single subnormal
// Subtracting 874 first lets one unsigned compare test 874 <= exponent <= 896: exponents below
// 874 wrap around to large values and, like exponents above 896, take the CC_A (above) branch.
SUB(16, R(RSCRATCH), Imm16(874));
CMP(16, R(RSCRATCH), Imm16(896 - 874));
FixupBranch NoDenormalize = J_CC(CC_A);

// Denormalise

// shift = (905 - Exponent) plus the 21 bit double to single shift
// XMM1 still holds the raw exponent, so XMM0 ends up as 926 - exponent.
MOV(16, R(RSCRATCH), Imm16(905 + 21));
MOVD_xmm(XMM0, R(RSCRATCH));
PSUBQ(XMM0, R(XMM1));

// xmm1 = fraction | 0x0010000000000000
MOVAPD(XMM1, R(src));
PAND(XMM1, MConst(double_fraction));
POR(XMM1, MConst(double_explicit_top_bit));

// fraction >> shift
PSRLQ(XMM1, R(XMM0));

// OR the sign bit in.
// Shifting right by 32 moves the sign from bit 63 down to bit 31, the single's sign position.
MOVAPD(XMM0, R(src));
PAND(XMM0, MConst(double_sign_bit));
PSRLQ(XMM0, 32);
POR(XMM1, R(XMM0));

FixupBranch end = J(false); // Goto end

SetJumpTarget(NoDenormalize);

// Don't Denormalize

// We want bits 0, 1
// (PowerPC bit numbering: sign and top exponent bit; they land in bits 31..30.)
MOVAPD(XMM1, R(src));
PAND(XMM1, MConst(double_top_two_bits));
PSRLQ(XMM1, 32);

// And 5 through to 34
// (These land in bits 29..0 of the result.)
MOVAPD(XMM0, R(src));
PAND(XMM0, MConst(double_bottom_bits));
PSRLQ(XMM0, 29);

// OR them together
POR(XMM1, R(XMM0));

// End
// Both paths leave the single's bits in the low dword of XMM1; MOVDDUP copies the low qword of
// XMM1 into both qword lanes of dst.
SetJumpTarget(end);
MOVDDUP(dst, R(XMM1));
}

// Converting single->double is a bit easier because all single denormals are double normals.
void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
{
Expand Down
92 changes: 92 additions & 0 deletions Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
Expand Up @@ -9,6 +9,7 @@
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/FloatUtils.h"
#include "Common/Intrinsics.h"
#include "Common/JitRegister.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
Expand All @@ -25,6 +26,97 @@

using namespace Gen;

// Bit masks for an IEEE-754 double held in the low 64-bit lane of an XMM register.
// _mm_set_epi64x takes the high lane first, so the high lane of every mask is zero.

// Fraction (mantissa) field: bits 0..51.
alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
// Sign: bit 63.
alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
// Bit 52, i.e. the position just above the fraction field (the implicit leading one made explicit).
alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
// Bits 62..63 (PowerPC big-endian bit numbers 0..1).
alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
// Bits 29..58 (PowerPC big-endian bit numbers 5..34).
alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);

// Since the following float conversion functions are used in non-arithmetic PPC float
// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
// flush-to-zero so we can use FLD+FSTP even on denormals.
// If the number is a NaN, make sure to set the QNaN bit back to its original value.

// Another problem is that officially, converting doubles to single format results in undefined
// behavior. Relying on undefined behavior is a bug so no software should ever do this.
// Super Mario 64 (on Wii VC) accidentally relies on this behavior. See issue #11173

// This is the same algorithm used in the interpreter (and actual hardware)
// The documentation states that the conversion of a double with an exponent outside the
// valid range for a single (or a single denormal) is undefined.
// But testing on actual hardware shows it always picks bits 0..1 and 5..34
// unless the exponent is in the range of 874 to 896.

// Emits the shared double->single conversion routine at the current code pointer.
// The routine converts with integer/bitwise operations only (no CVTSD2SS). Each path ends in
// its own RET, so the emitted code is meant to be reached with a CALL.
void CommonAsmRoutines::GenConvertDoubleToSingle()
{
// Input in XMM0, output to RSCRATCH
// Clobbers RSCRATCH/RSCRATCH2/XMM0/XMM1

const void* start = GetCodePtr();

// Grab Exponent
// RSCRATCH2 keeps an untouched copy of the raw 64 bits for the PEXT path and for the sign.
// After the shift, bit 11 of RSCRATCH is the sign bit; the AND with 0x7ff removes it.
MOVQ_xmm(R(RSCRATCH), XMM0);
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHR(64, R(RSCRATCH), Imm8(52));
AND(16, R(RSCRATCH), Imm16(0x7ff));

// Check if the double is in the range of valid single subnormal
// Subtracting 874 first lets one unsigned compare test 874 <= exponent <= 896: exponents below
// 874 wrap around to large values. CC_NA (not above) selects the in-range, denormal case.
SUB(16, R(RSCRATCH), Imm16(874));
CMP(16, R(RSCRATCH), Imm16(896 - 874));
FixupBranch Denormalize = J_CC(CC_NA);

// Don't Denormalize

if (cpu_info.bBMI2)
{
// Extract bits 0-1 and 5-34
// The mask has exactly 32 bits set; PEXT packs them contiguously into the low 32 bits.
MOV(64, R(RSCRATCH), Imm64(0xc7ffffffe0000000));
PEXT(64, RSCRATCH, RSCRATCH2, R(RSCRATCH));
}
else
{
// We want bits 0, 1
// (PowerPC bit numbering: sign and top exponent bit; they land in bits 31..30.)
MOVAPD(XMM1, R(XMM0));
PAND(XMM1, MConst(double_top_two_bits));
PSRLQ(XMM1, 32);

// And 5 through to 34
// (These land in bits 29..0 of the result.)
PAND(XMM0, MConst(double_bottom_bits));
PSRLQ(XMM0, 29);

// OR them together
POR(XMM0, R(XMM1));
MOVD_xmm(R(RSCRATCH), XMM0);
}
RET();

// Denormalise
SetJumpTarget(Denormalize);

// shift = (905 - Exponent) plus the 21 bit double to single shift
// RSCRATCH still holds (exponent - 874) from the range check, so negating it and adding
// (926 - 874) yields 926 - exponent.
NEG(16, R(RSCRATCH));
ADD(16, R(RSCRATCH), Imm16((905 + 21) - 874));
MOVQ_xmm(XMM1, R(RSCRATCH));

// XMM0 = fraction | 0x0010000000000000
PAND(XMM0, MConst(double_fraction));
POR(XMM0, MConst(double_explicit_top_bit));

// fraction >> shift
PSRLQ(XMM0, R(XMM1));
MOVD_xmm(R(RSCRATCH), XMM0);

// OR the sign bit in.
// Shifting the saved raw bits right by 32 moves the sign from bit 63 to bit 31; the AND keeps
// only that bit.
SHR(64, R(RSCRATCH2), Imm8(32));
AND(32, R(RSCRATCH2), Imm32(0x80000000));

OR(32, R(RSCRATCH), R(RSCRATCH2));
RET();

// Register the emitted code range under the name "JIT_cdts".
JitRegister::Register(start, GetCodePtr(), "JIT_cdts");
}

void CommonAsmRoutines::GenFrsqrte()
{
const void* start = GetCodePtr();
Expand Down
1 change: 1 addition & 0 deletions Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h
Expand Up @@ -31,6 +31,7 @@ class CommonAsmRoutines : public CommonAsmRoutinesBase, public QuantizedMemoryRo
void GenMfcr();

protected:
void GenConvertDoubleToSingle();
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
void GenQuantizedLoads();
Expand Down
1 change: 1 addition & 0 deletions Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
Expand Up @@ -25,6 +25,7 @@ struct CommonAsmRoutinesBase
const u8* frsqrte;
const u8* fres;
const u8* mfcr;
const u8* cdts;

// In: array index: GQR to use.
// In: ECX: Address to read from.
Expand Down
5 changes: 4 additions & 1 deletion Source/UnitTests/Core/CMakeLists.txt
Expand Up @@ -15,5 +15,8 @@ add_dolphin_test(ESFormatsTest IOS/ES/FormatsTest.cpp IOS/ES/TestBinaryData.cpp)
add_dolphin_test(FileSystemTest IOS/FS/FileSystemTest.cpp)

if(_M_X86)
add_dolphin_test(PowerPCTest PowerPC/Jit64Common/Frsqrte.cpp)
add_dolphin_test(PowerPCTest
PowerPC/Jit64Common/ConvertDoubleToSingle.cpp
PowerPC/Jit64Common/Frsqrte.cpp
)
endif()
116 changes: 116 additions & 0 deletions Source/UnitTests/Core/PowerPC/Jit64Common/ConvertDoubleToSingle.cpp
@@ -0,0 +1,116 @@
// Copyright 2019 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include <cstring>
#include <tuple>
#include <vector>

#include "Common/CommonTypes.h"
#include "Common/x64ABI.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"

#include <gtest/gtest.h>

namespace
{
// Test fixture: emits the double->single conversion routine into its own code space together
// with a small wrapper (wrapped_cdts) that the test body can call like a normal function.
class TestCommonAsmRoutines : public CommonAsmRoutines
{
public:
// NOTE(review): `jit` is handed to the base-class constructor before the member itself has been
// constructed (bases are initialised before members); presumably the base only stores the
// reference - confirm.
TestCommonAsmRoutines() : CommonAsmRoutines(jit)
{
using namespace Gen;

AllocCodeSpace(4096);
m_const_pool.Init(AllocChildCodeSpace(1024), 1024);

// The function-pointer type is only a handle for ABI_CallFunction below; the wrapper passes the
// input in XMM0 and reads the result from RSCRATCH.
const auto raw_cdts = reinterpret_cast<double (*)(double)>(AlignCode4());
GenConvertDoubleToSingle();

// Wrapper: takes the raw double bits as its first integer argument and returns the raw single
// bits in the ABI return register.
wrapped_cdts = reinterpret_cast<u32 (*)(u64)>(AlignCode4());
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);

// Call
MOVQ_xmm(XMM0, R(ABI_PARAM1));
ABI_CallFunction(raw_cdts);
MOV(32, R(ABI_RETURN), R(RSCRATCH));

ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
RET();
}

// Entry point of the emitted wrapper: raw double bits in, raw single bits out.
u32 (*wrapped_cdts)(u64);
Jit64 jit;
};
} // namespace

// Checks that the JIT's double->single routine produces exactly the same bits as the
// interpreter's ConvertToSingle for special values, both range-check boundaries (exponent 874
// and exponent 896) and a few ordinary numbers.
TEST(Jit64, ConvertDoubleToSingle)
{
  TestCommonAsmRoutines routines;

  const std::vector<u64> input_values{
      // Special values
      0x0000'0000'0000'0000,  // positive zero
      0x0000'0000'0000'0001,  // smallest positive denormal
      0x0000'0000'0100'0000,
      0x000F'FFFF'FFFF'FFFF,  // largest positive denormal
      0x0010'0000'0000'0000,  // smallest positive normal
      0x0010'0000'0000'0002,
      0x3FF0'0000'0000'0000,  // 1.0
      0x7FEF'FFFF'FFFF'FFFF,  // largest positive normal
      0x7FF0'0000'0000'0000,  // positive infinity
      0x7FF0'0000'0000'0001,  // first positive SNaN
      0x7FF7'FFFF'FFFF'FFFF,  // last positive SNaN
      0x7FF8'0000'0000'0000,  // first positive QNaN
      0x7FFF'FFFF'FFFF'FFFF,  // last positive QNaN
      0x8000'0000'0000'0000,  // negative zero
      0x8000'0000'0000'0001,  // smallest negative denormal
      0x8000'0000'0100'0000,
      0x800F'FFFF'FFFF'FFFF,  // largest negative denormal
      0x8010'0000'0000'0000,  // smallest negative normal
      0x8010'0000'0000'0002,
      0xBFF0'0000'0000'0000,  // -1.0
      0xFFEF'FFFF'FFFF'FFFF,  // largest negative normal
      0xFFF0'0000'0000'0000,  // negative infinity
      0xFFF0'0000'0000'0001,  // first negative SNaN
      0xFFF7'FFFF'FFFF'FFFF,  // last negative SNaN
      0xFFF8'0000'0000'0000,  // first negative QNaN
      0xFFFF'FFFF'FFFF'FFFF,  // last negative QNaN

      // (exp > 896) Boundary Case
      0x3800'0000'0000'0000,  // 2^(-127) = Denormal in single-prec
      0x3810'0000'0000'0000,  // 2^(-126) = Smallest single-prec normal
      0xB800'0000'0000'0000,  // -2^(-127) = Denormal in single-prec
      0xB810'0000'0000'0000,  // -2^(-126) = Smallest single-prec normal
      0x3800'1234'5678'9ABC, 0x3810'1234'5678'9ABC, 0xB800'1234'5678'9ABC, 0xB810'1234'5678'9ABC,

      // (exp >= 874) Boundary Case
      // Exponent field 0x368 = 872, i.e. 2^(872 - 1023) = 2^(-151), which lies below the 874
      // threshold.
      0x3680'0000'0000'0000,  // 2^(-151) = Unrepresentable in single-prec
      0x36A0'0000'0000'0000,  // 2^(-149) = Smallest single-prec denormal
      0x36B0'0000'0000'0000,  // 2^(-148) = Single-prec denormal
      0xB680'0000'0000'0000,  // -2^(-151) = Unrepresentable in single-prec
      0xB6A0'0000'0000'0000,  // -2^(-149) = Smallest single-prec denormal
      0xB6B0'0000'0000'0000,  // -2^(-148) = Single-prec denormal
      0x3680'1234'5678'9ABC, 0x36A0'1234'5678'9ABC, 0x36B0'1234'5678'9ABC, 0xB680'1234'5678'9ABC,
      0xB6A0'1234'5678'9ABC, 0xB6B0'1234'5678'9ABC,

      // Some typical numbers
      0x3FF8'0000'0000'0000,  // 1.5
      0x408F'4000'0000'0000,  // 1000
      0xC008'0000'0000'0000,  // -3
  };

  for (const u64 input : input_values)
  {
    const u32 expected = ConvertToSingle(input);
    const u32 actual = routines.wrapped_cdts(input);

    // u64 is not `unsigned long long` on every platform (it is `unsigned long` on LP64 Linux),
    // so cast explicitly to match the %llx conversion.
    printf("%016llx -> %08x == %08x\n", static_cast<unsigned long long>(input), actual, expected);

    EXPECT_EQ(expected, actual);
  }
}
3 changes: 3 additions & 0 deletions Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp
Expand Up @@ -16,6 +16,8 @@

#include <gtest/gtest.h>

namespace
{
class TestCommonAsmRoutines : public CommonAsmRoutines
{
public:
Expand Down Expand Up @@ -51,6 +53,7 @@ class TestCommonAsmRoutines : public CommonAsmRoutines
u64 (*wrapped_frsqrte)(u64, UReg_FPSCR&);
Jit64 jit;
};
} // namespace

TEST(Jit64, Frsqrte)
{
Expand Down

0 comments on commit 709862b

Please sign in to comment.