Commit

Merge pull request #9458 from JosJuice/arm-fpu-round
JitArm64: Set flush-to-zero/rounding mode and improve float/double conversion accuracy
JMC47 committed Apr 25, 2021
2 parents aa3a96f + 69c14d6 commit 5da85f3
Showing 25 changed files with 819 additions and 120 deletions.
8 changes: 8 additions & 0 deletions Source/Core/Common/Arm64Emitter.cpp
@@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
}
void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, size >> 6, 0x1D, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, 2 | (size >> 6), 0x1D, Rd, Rn, Rm);
}

void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
{
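The two new emitter methods above encode the AArch64 FACGE (floating-point absolute compare greater than or equal) and FACGT (floating-point absolute compare greater than) instructions, which compare the magnitudes of their operands and write an all-ones or all-zero mask into each result lane. The usage sketch below is illustrative only and not taken from this commit; it assumes m_float_emit is the JIT's ARM64FloatEmitter instance, as used in the JitArm64 code later in this diff.

// Illustrative only: per-lane |a| >= |b| and |a| > |b| on two packed doubles.
// Each result lane becomes all ones when the comparison holds and all zeros
// otherwise, so it can drive a bitwise select or be tested on the integer side.
m_float_emit.FACGE(64, ARM64Reg::Q2, ARM64Reg::Q0, ARM64Reg::Q1);  // Q2.d[i] = (|Q0.d[i]| >= |Q1.d[i]|) ? ~0 : 0
m_float_emit.FACGT(64, ARM64Reg::Q3, ARM64Reg::Q0, ARM64Reg::Q1);  // Q3.d[i] = (|Q0.d[i]| >  |Q1.d[i]|) ? ~0 : 0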
2 changes: 2 additions & 0 deletions Source/Core/Common/Arm64Emitter.h
@@ -1094,6 +1094,8 @@ class ARM64FloatEmitter
void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

// Conditional select
void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
1 change: 1 addition & 0 deletions Source/Core/Common/ArmCPUDetect.cpp
@@ -69,6 +69,7 @@ void CPUInfo::Detect()
CPU64bit = true;
Mode64bit = true;
vendor = CPUVendor::ARM;
bFlushToZero = true;

#ifdef _WIN32
num_cores = std::thread::hardware_concurrency();
78 changes: 78 additions & 0 deletions Source/Core/Common/ArmFPURoundMode.cpp
@@ -0,0 +1,78 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include "Common/CommonTypes.h"
#include "Common/FPURoundMode.h"

#ifdef _MSC_VER
#include <intrin.h>
#endif

static u64 GetFPCR()
{
#ifdef _MSC_VER
return _ReadStatusReg(ARM64_FPCR);
#else
u64 fpcr;
__asm__ __volatile__("mrs %0, fpcr" : "=r"(fpcr));
return fpcr;
#endif
}

static void SetFPCR(u64 fpcr)
{
#ifdef _MSC_VER
_WriteStatusReg(ARM64_FPCR, fpcr);
#else
__asm__ __volatile__("msr fpcr, %0" : : "ri"(fpcr));
#endif
}

namespace FPURoundMode
{
static const u64 default_fpcr = GetFPCR();
static u64 saved_fpcr = default_fpcr;

void SetRoundMode(int mode)
{
// We don't need to do anything here since SetSIMDMode is always called after calling this
}

void SetPrecisionMode(PrecisionMode mode)
{
}

void SetSIMDMode(int rounding_mode, bool non_ieee_mode)
{
// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
constexpr u32 FZ = 1 << 24;

// lookup table for FPSCR.RN-to-FPCR.RMode translation
constexpr u32 rounding_mode_table[] = {
(0 << 22), // nearest
(3 << 22), // zero
(1 << 22), // +inf
(2 << 22), // -inf
};

const u64 base = default_fpcr & ~(0b111 << 22);
SetFPCR(base | rounding_mode_table[rounding_mode] | (non_ieee_mode ? FZ : 0));
}

void SaveSIMDState()
{
saved_fpcr = GetFPCR();
}

void LoadSIMDState()
{
SetFPCR(saved_fpcr);
}

void LoadDefaultSIMDState()
{
SetFPCR(default_fpcr);
}

} // namespace FPURoundMode
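For reference, the AArch64 FPCR keeps the rounding mode (RMode) in bits 23:22 and the flush-to-zero flag (FZ) in bit 24, which is why SetSIMDMode clears a three-bit field at bit 22 before OR-ing in the translated mode. Below is a minimal, hypothetical helper (not part of this commit) that computes the same value as a pure function so the table can be checked at compile time.

// Hypothetical sketch for illustration; it mirrors the masking and lookup table
// in SetSIMDMode above, but as a constexpr function with a compile-time check.
#include <cstdint>

constexpr uint64_t ExpectedFPCR(uint64_t base_fpcr, int guest_rn, bool non_ieee_mode)
{
  constexpr uint64_t FZ = UINT64_C(1) << 24;  // flush-to-zero
  constexpr uint64_t rmode_table[] = {
      UINT64_C(0) << 22,  // FPSCR.RN = 0: round to nearest
      UINT64_C(3) << 22,  // FPSCR.RN = 1: round toward zero
      UINT64_C(1) << 22,  // FPSCR.RN = 2: round toward +infinity
      UINT64_C(2) << 22,  // FPSCR.RN = 3: round toward -infinity
  };
  const uint64_t base = base_fpcr & ~(UINT64_C(0b111) << 22);  // clear RMode and FZ
  return base | rmode_table[guest_rn] | (non_ieee_mode ? FZ : 0);
}

// Round toward zero with non-IEEE mode enabled: RMode = 0b11, FZ = 1.
static_assert(ExpectedFPCR(0, 1, true) == ((UINT64_C(3) << 22) | (UINT64_C(1) << 24)));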
2 changes: 1 addition & 1 deletion Source/Core/Common/CMakeLists.txt
@@ -199,7 +199,7 @@ if(_M_ARM_64)
Arm64Emitter.h
ArmCommon.h
ArmCPUDetect.cpp
GenericFPURoundMode.cpp
ArmFPURoundMode.cpp
)
else()
if(_M_X86) #X86
3 changes: 3 additions & 0 deletions Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -982,6 +982,7 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

js.compilerPC = op.address;
js.op = &op;
js.fpr_is_store_safe = op.fprIsStoreSafeBeforeInst;
js.instructionNumber = i;
js.instructionsLeft = (code_block.m_num_instructions - 1) - i;
const GekkoOPInfo* opinfo = op.opinfo;
@@ -1118,6 +1119,8 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

CompileInstruction(op);

js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst;

if (jo.memcheck && (opinfo->flags & FL_LOADSTORE))
{
// If we have a fastmem loadstore, we can omit the exception check and let fastmem handle
2 changes: 1 addition & 1 deletion Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@@ -105,7 +105,7 @@ void Jit64::stfXXX(UGeckoInstruction inst)

if (single)
{
if (js.op->fprIsStoreSafe[s])
if (js.fpr_is_store_safe[s])
{
RCOpArg Rs = fpr.Use(s, RCMode::Read);
RegCache::Realize(Rs);
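The change in stfXXX above reads the store-safety information from js.fpr_is_store_safe instead of js.op->fprIsStoreSafe, matching the Jit.cpp change that refreshes this set before and after each compiled instruction (fprIsStoreSafeBeforeInst / fprIsStoreSafeAfterInst). A simplified sketch of that bookkeeping pattern follows, with illustrative types rather than Dolphin's real ones.

// Simplified illustration of the per-instruction tracking added in this commit.
#include <bitset>

struct OpAnalysis
{
  std::bitset<32> fpr_store_safe_before;  // FPRs already single-rounded before the op
  std::bitset<32> fpr_store_safe_after;   // FPRs already single-rounded after the op
};

struct JitState
{
  std::bitset<32> fpr_is_store_safe;  // live view consulted while emitting code
};

void CompileOne(JitState& js, const OpAnalysis& op)
{
  js.fpr_is_store_safe = op.fpr_store_safe_before;  // valid for code emitted for this op
  // ... emit the instruction; a single-precision store (stfs) may skip the
  //     double-to-single conversion when the source register's bit is set ...
  js.fpr_is_store_safe = op.fpr_store_safe_after;   // valid for any merged follow-up code
}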
4 changes: 4 additions & 0 deletions Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -695,6 +695,7 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

js.compilerPC = op.address;
js.op = &op;
js.fpr_is_store_safe = op.fprIsStoreSafeBeforeInst;
js.instructionNumber = i;
js.instructionsLeft = (code_block.m_num_instructions - 1) - i;
const GekkoOPInfo* opinfo = op.opinfo;
@@ -830,6 +831,9 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
}

CompileInstruction(op);

js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst;

if (!CanMergeNextInstructions(1) || js.op[1].opinfo->type != ::OpType::Integer)
FlushCarry();

23 changes: 22 additions & 1 deletion Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -152,7 +152,20 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
void psq_l(UGeckoInstruction inst);
void psq_st(UGeckoInstruction inst);

private:
void ConvertDoubleToSingleLower(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
Arm64Gen::ARM64Reg src_reg);
void ConvertDoubleToSinglePair(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
Arm64Gen::ARM64Reg src_reg);
void ConvertSingleToDoubleLower(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
Arm64Gen::ARM64Reg src_reg,
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
void ConvertSingleToDoublePair(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
Arm64Gen::ARM64Reg src_reg,
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);

bool IsFPRStoreSafe(size_t guest_reg) const;

protected:
struct SlowmemHandler
{
Arm64Gen::ARM64Reg dest_reg;
@@ -184,14 +197,18 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
nearcode = GetWritableCodePtr();
SetCodePtrUnsafe(farcode.GetWritableCodePtr());
AlignCode16();
m_in_farcode = true;
}

void SwitchToNearCode()
{
farcode.SetCodePtrUnsafe(GetWritableCodePtr());
SetCodePtrUnsafe(nearcode);
m_in_farcode = false;
}

bool IsInFarCode() const { return m_in_farcode; }

// Dump a memory range of code
void DumpCode(const u8* start, const u8* end);

@@ -215,6 +232,9 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
// AsmRoutines
void GenerateAsm();
void GenerateCommonAsm();
void GenerateConvertDoubleToSingle();
void GenerateConvertSingleToDouble();
void GenerateQuantizedLoadStores();

// Profiling
void BeginTimeProfile(JitBlock* b);
@@ -254,6 +274,7 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA

Arm64Gen::ARM64CodeBlock farcode;
u8* nearcode; // Backed up when we switch to far code.
bool m_in_farcode = false;

bool m_enable_blr_optimization;
bool m_cleanup_after_stackfault = false;
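The class now also tracks whether the emitter is currently writing to the far-code region (m_in_farcode, queried through IsInFarCode()), which lets helpers such as the new conversion routines avoid switching to far code when the caller has already done so. A hedged sketch of that idiom follows; the member function is hypothetical, and it assumes using namespace Arm64Gen, as the JitArm64 source files do.

// Hypothetical helper, for illustration only: emit a fast path in near code and
// a rarely-taken slow path in far code, skipping the switch when already there.
void JitArm64::EmitWithSlowPath()
{
  // Condition flags are assumed to be set by a preceding compare. Hop over the
  // far-code jump on the common case; the unconditional branch below has enough
  // range to reach the far-code region.
  FixupBranch fast = B(CC_NEQ);
  FixupBranch slow = B();
  SetJumpTarget(fast);
  // ... fast path ...
  FixupBranch done = B();

  const bool switch_needed = !IsInFarCode();
  if (switch_needed)
    SwitchToFarCode();
  SetJumpTarget(slow);
  // ... rarely-taken slow path ...
  FixupBranch exit = B();
  if (switch_needed)
    SwitchToNearCode();

  SetJumpTarget(done);
  SetJumpTarget(exit);
}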
31 changes: 2 additions & 29 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@@ -61,23 +61,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT)
{
if (flags & BackPatchInfo::FLAG_SIZE_F32)
{
m_float_emit.FCVT(32, 64, ARM64Reg::D0, RS);
m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0);
m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
{
m_float_emit.REV32(8, ARM64Reg::D0, RS);
m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
{
m_float_emit.FCVTN(32, ARM64Reg::D0, RS);
m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0);
m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
{
m_float_emit.REV32(8, ARM64Reg::D0, RS);
m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr);
@@ -184,37 +172,22 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT)
{
if (flags & BackPatchInfo::FLAG_SIZE_F32)
{
m_float_emit.FCVT(32, 64, ARM64Reg::D0, RS);
m_float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32);
BLR(ARM64Reg::X8);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
{
m_float_emit.UMOV(32, ARM64Reg::W0, RS, 0);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32);
BLR(ARM64Reg::X8);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
{
m_float_emit.FCVTN(32, ARM64Reg::D0, RS);
m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);
ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
BLR(ARM64Reg::X8);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
{
m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0);
ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
BLR(ARM64Reg::X8);
}
else
{
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
BLR(ARM64Reg::X8);
}
}
