@@ -2,7 +2,10 @@
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include <limits>

#include "Common/Arm64Emitter.h"
#include "Common/BitUtils.h"
#include "Common/CommonTypes.h"
#include "Common/FloatUtils.h"
#include "Common/JitRegister.h"
@@ -198,6 +201,14 @@ void JitArm64::GenerateAsm()

void JitArm64::GenerateCommonAsm()
{
GetAsmRoutines()->fres = GetCodePtr();
GenerateFres();
JitRegister::Register(GetAsmRoutines()->fres, GetCodePtr(), "JIT_fres");

GetAsmRoutines()->frsqrte = GetCodePtr();
GenerateFrsqrte();
JitRegister::Register(GetAsmRoutines()->frsqrte, GetCodePtr(), "JIT_frsqrte");

GetAsmRoutines()->cdts = GetCodePtr();
GenerateConvertDoubleToSingle();
JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cdts");
@@ -215,6 +226,125 @@ void JitArm64::GenerateCommonAsm()
GenerateQuantizedLoadStores();
}

// Input: X1 contains input, and D0 contains result of running the input through AArch64 FRECPE.
// Output in X0 and memory (PPCState). Clobbers X0-X4 and flags.
//
// Emits a routine (every path ends in RET) that turns the FRECPE estimate into the value the
// interpreter's table-based fres algorithm would produce. The only PPCState field the emitted
// code stores to is fpscr, and only on the zero-input path.
void JitArm64::GenerateFres()
{
// The idea behind this implementation: AArch64's frecpe instruction calculates the exponent and
// sign the same way as PowerPC's fresx does. For the special inputs zero, NaN and infinity,
// even the mantissa matches. But the mantissa does not match for most other inputs, so in the
// normal case we calculate the mantissa using the table-based algorithm from the interpreter.

// X2 = biased exponent of the input, X0 = raw bits of the FRECPE estimate,
// X3 = the input's sign bit alone (i.e. a zero carrying the input's sign).
// The branch below consumes the flags set by CMP, so the ANDI2R emitted in between is expected
// not to modify flags.
UBFX(ARM64Reg::X2, ARM64Reg::X1, 52, 11); // Grab the exponent
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
CMP(ARM64Reg::X2, 895);
ANDI2R(ARM64Reg::X3, ARM64Reg::X1, Common::DOUBLE_SIGN);
// Unsigned "lower": biased exponent < 895 is handled out of line.
FixupBranch small_exponent = B(CCFlags::CC_LO);

// Unsigned "higher": biased exponent > 1148 is handled out of line.
MOVI2R(ARM64Reg::X4, 1148LL);
CMP(ARM64Reg::X2, ARM64Reg::X4);
FixupBranch large_exponent = B(CCFlags::CC_HI);

// Normal case. The top 5 mantissa bits select an 8-byte entry of Common::fres_expected, whose two
// 32-bit words are loaded into W2 and W3 (presumably the table's base and decrement fields --
// confirm against the element type of Common::fres_expected).
UBFX(ARM64Reg::X2, ARM64Reg::X1, 47, 5); // Grab upper part of mantissa
MOVP2R(ARM64Reg::X3, &Common::fres_expected);
ADD(ARM64Reg::X2, ARM64Reg::X3, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3));
LDP(IndexType::Signed, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::X2, 0);
UBFX(ARM64Reg::X1, ARM64Reg::X1, 37, 10); // Grab lower part of mantissa
MOVI2R(ARM64Reg::W4, 1);
// Keep only sign and exponent of the FRECPE estimate; the mantissa is rebuilt below.
ANDI2R(ARM64Reg::X0, ARM64Reg::X0, Common::DOUBLE_SIGN | Common::DOUBLE_EXP);
// W1 = W2 - ((W3 * lower_mantissa + 1) >> 1)
MADD(ARM64Reg::W1, ARM64Reg::W3, ARM64Reg::W1, ARM64Reg::W4);
SUB(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W1, ArithOption(ARM64Reg::W1, ShiftType::LSR, 1));
// Insert the computed value into the result, shifted left by 29 bits.
ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1, ArithOption(ARM64Reg::X1, ShiftType::LSL, 29));
RET();

// Biased exponent < 895. If the exponent and mantissa bits are all zero the input is +/-0;
// otherwise the result is the largest finite single-precision value (widened to double)
// combined with the input's sign from X3.
SetJumpTarget(small_exponent);
TSTI2R(ARM64Reg::X1, Common::DOUBLE_EXP | Common::DOUBLE_FRAC);
FixupBranch zero = B(CCFlags::CC_EQ);
MOVI2R(ARM64Reg::X4,
Common::BitCast<u64>(static_cast<double>(std::numeric_limits<float>::max())));
ORR(ARM64Reg::X0, ARM64Reg::X3, ARM64Reg::X4);
RET();

// Zero input: X0 still holds the unmodified FRECPE estimate and is returned as is.
// fpscr is only rewritten when bit 26 is clear (expected to be the FPSCR_ZX bit that is ORed in
// below -- confirm against the FPSCR_ZX definition). W2 is handed to ORRI2R as an additional
// register, presumably as scratch for materialising the mask.
SetJumpTarget(zero);
LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
FixupBranch skip_set_zx = TBNZ(ARM64Reg::W4, 26);
ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2);
STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
SetJumpTarget(skip_set_zx);
RET();

// Biased exponent > 1148. X2 still holds the exponent and X3 the sign-only value here, because
// this branch is taken before either register is reused. An exponent of 0x7FF (infinity or NaN)
// keeps the FRECPE estimate; anything else returns the zero carrying the input's sign.
SetJumpTarget(large_exponent);
MOVI2R(ARM64Reg::X4, 0x7FF);
CMP(ARM64Reg::X2, ARM64Reg::X4);
CSEL(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X3, CCFlags::CC_EQ);
RET();
}

// Input: X1 contains input, and D0 contains result of running the input through AArch64 FRSQRTE.
// Output in X0 and memory (PPCState). Clobbers X0-X4 and flags.
//
// Emits a routine (every path ends in RET) that turns the FRSQRTE estimate into the value the
// interpreter's table-based frsqrte algorithm would produce. The only PPCState field the emitted
// code stores to is fpscr, on the zero-input path and on the negative-input path.
void JitArm64::GenerateFrsqrte()
{
// The idea behind this implementation: AArch64's frsqrte instruction calculates the exponent and
// sign the same way as PowerPC's frsqrtex does. For the special inputs zero, negative, NaN and
// inf, even the mantissa matches. But the mantissa does not match for most other inputs, so in
// the normal case we calculate the mantissa using the table-based algorithm from the interpreter.

// Classify the input. X0 = raw bits of the FRSQRTE estimate. The FMOV sits between the flag-
// setting test and the branch that consumes the flags, so it is expected not to modify flags.
TSTI2R(ARM64Reg::X1, Common::DOUBLE_EXP | Common::DOUBLE_FRAC);
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
// Exponent and mantissa bits all zero: input is +/-0.
FixupBranch zero = B(CCFlags::CC_EQ);
// X2 = exponent field in place; all exponent bits set means NaN or infinity.
ANDI2R(ARM64Reg::X2, ARM64Reg::X1, Common::DOUBLE_EXP);
MOVI2R(ARM64Reg::X3, Common::DOUBLE_EXP);
CMP(ARM64Reg::X2, ARM64Reg::X3);
FixupBranch nan_or_inf = B(CCFlags::CC_EQ);
// Sign bit set on a finite, non-zero input.
FixupBranch negative = TBNZ(ARM64Reg::X1, 63);
// X3 = mantissa field; a non-zero exponent field means no normalisation is needed.
ANDI2R(ARM64Reg::X3, ARM64Reg::X1, Common::DOUBLE_FRAC);
FixupBranch normal = CBNZ(ARM64Reg::X2);

// "Normalize" denormal values
// X3 = count of leading zero bits of the mantissa field.
// X4 = input shifted left by (X3 - 11).
// X2 = 0x00C0'0000'0000'0000 - (X3 << 52), replacing the (zero) exponent field value.
// X3 = the shifted input masked with DOUBLE_FRAC - 1, replacing the mantissa.
CLZ(ARM64Reg::X3, ARM64Reg::X3);
SUB(ARM64Reg::X4, ARM64Reg::X3, 11);
MOVI2R(ARM64Reg::X2, 0x00C0'0000'0000'0000);
LSLV(ARM64Reg::X4, ARM64Reg::X1, ARM64Reg::X4);
SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52));
ANDI2R(ARM64Reg::X3, ARM64Reg::X4, Common::DOUBLE_FRAC - 1);

// Build the table index: bit 52 of X2 (lowest exponent bit) moved down to bit 4, combined with
// the top 4 mantissa bits (X3 >> 48), then bit 4 is inverted. The index selects an 8-byte entry
// of Common::frsqrte_expected, whose two 32-bit words are loaded into W1 and W2.
SetJumpTarget(normal);
LSR(ARM64Reg::X2, ARM64Reg::X2, 48);
ANDI2R(ARM64Reg::X2, ARM64Reg::X2, 0x10);
MOVP2R(ARM64Reg::X1, &Common::frsqrte_expected);
// NOTE(review): the ArithOption below names X8, which is not used anywhere else in this routine,
// while the shifted operand is X3. Presumably the register passed to ArithOption only selects
// the operand width, which would make this equivalent to passing X3 -- confirm.
ORR(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X8, ShiftType::LSR, 48));
EORI2R(ARM64Reg::X2, ARM64Reg::X2, 0x10);
ADD(ARM64Reg::X2, ARM64Reg::X1, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3));
LDP(IndexType::Signed, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::X2, 0);
// X3 = 11 mantissa bits starting at bit 37.
UBFX(ARM64Reg::X3, ARM64Reg::X3, 37, 11);
// Keep only sign and exponent of the FRSQRTE estimate; the mantissa is rebuilt below.
ANDI2R(ARM64Reg::X0, ARM64Reg::X0, Common::DOUBLE_SIGN | Common::DOUBLE_EXP);
// W3 = W1 - W3 * W2
MSUB(ARM64Reg::W3, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::W1);
// Insert the computed value into the result, shifted left by 26 bits.
ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 26));
RET();

// Zero input: X0 still holds the unmodified FRSQRTE estimate and is returned as is.
// fpscr is only rewritten when bit 26 is clear (expected to be the FPSCR_ZX bit that is ORed in
// below -- confirm against the FPSCR_ZX definition). W2 is handed to ORRI2R as an additional
// register, presumably as scratch for materialising the mask.
SetJumpTarget(zero);
LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
FixupBranch skip_set_zx = TBNZ(ARM64Reg::W4, 26);
ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2);
STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
SetJumpTarget(skip_set_zx);
RET();

// NaN or infinity: only an input whose bits equal negative infinity falls through to the
// negative-input handling; NaNs and positive infinity return the FRSQRTE estimate untouched.
SetJumpTarget(nan_or_inf);
MOVI2R(ARM64Reg::X3, Common::BitCast<u64>(-std::numeric_limits<double>::infinity()));
CMP(ARM64Reg::X1, ARM64Reg::X3);
FixupBranch nan_or_positive_inf = B(CCFlags::CC_NEQ);

// Negative input (including negative infinity): X0 keeps the FRSQRTE estimate. fpscr is only
// rewritten when bit 9 is clear (expected to be the FPSCR_VXSQRT bit that is ORed in below --
// confirm against the FPSCR_VXSQRT definition).
SetJumpTarget(negative);
LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W4, 9);
ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2);
STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
SetJumpTarget(skip_set_vxsqrt);
SetJumpTarget(nan_or_positive_inf);
RET();
}

// Input in X0, output in W1, clobbers X0-X3 and flags.
void JitArm64::GenerateConvertDoubleToSingle()
{
@@ -25,6 +25,8 @@ elseif(_M_ARM_64)
PowerPC/DivUtilsTest.cpp
PowerPC/JitArm64/ConvertSingleDouble.cpp
PowerPC/JitArm64/FPRF.cpp
PowerPC/JitArm64/Fres.cpp
PowerPC/JitArm64/Frsqrte.cpp
PowerPC/JitArm64/MovI2R.cpp
)
else()
@@ -0,0 +1,66 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include <functional>

#include "Common/Arm64Emitter.h"
#include "Common/BitUtils.h"
#include "Common/CommonTypes.h"
#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
#include "Core/PowerPC/JitArm64/Jit.h"
#include "Core/PowerPC/PowerPC.h"

#include "../TestValues.h"

#include <gtest/gtest.h>

namespace
{
using namespace Arm64Gen;

// Test harness: emits the fres routine followed by a small wrapper that makes it callable from
// C++ with a raw 64-bit pattern as argument and result.
class TestFres : public JitArm64
{
public:
// Allocates 4096 bytes of code space, emits GenerateFres(), then emits the wrapper and stores the
// wrapper's entry point in `fres`.
TestFres()
{
AllocCodeSpace(4096);

// Start address of the raw routine; the wrapper reaches it with BL.
const u8* raw_fres = GetCodePtr();
GenerateFres();

// The wrapper begins directly after the routine and is treated as a u64(u64) function.
fres = Common::BitCast<u64 (*)(u64)>(GetCodePtr());
// BL overwrites X30 and the wrapper repoints PPC_REG, so both are stashed in X15/X14 first.
MOV(ARM64Reg::X15, ARM64Reg::X30);
MOV(ARM64Reg::X14, PPC_REG);
MOVP2R(PPC_REG, &PowerPC::ppcState);
// Set up the routine's inputs: X1 = argument bits, D0 = FRECPE of the argument.
MOV(ARM64Reg::X1, ARM64Reg::X0);
m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0);
m_float_emit.FRECPE(ARM64Reg::D0, ARM64Reg::D0);
BL(raw_fres);
// Restore the stashed registers; X0 is left as the routine produced it and becomes the result.
MOV(ARM64Reg::X30, ARM64Reg::X15);
MOV(PPC_REG, ARM64Reg::X14);
RET();
}

// Entry point of the emitted wrapper; takes and returns raw double bit patterns.
std::function<u64(u64)> fres;
};

} // namespace

// Runs every entry of double_test_values through both Common::ApproximateReciprocal and the
// JIT-emitted fres wrapper and expects bit-identical results. Mismatches are also printed as
// hex bit patterns so the failing input is easy to spot.
TEST(JitArm64, Fres)
{
  TestFres jit;

  for (const u64 input_bits : double_test_values)
  {
    // Reference value first, then the emitted code, both compared as raw bit patterns.
    const u64 expected =
        Common::BitCast<u64>(Common::ApproximateReciprocal(Common::BitCast<double>(input_bits)));
    const u64 actual = jit.fres(input_bits);

    // Printed order: input -> actual == expected.
    if (actual != expected)
      fmt::print("{:016x} -> {:016x} == {:016x}\n", input_bits, actual, expected);

    EXPECT_EQ(expected, actual);
  }
}
@@ -0,0 +1,66 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include <functional>

#include "Common/Arm64Emitter.h"
#include "Common/BitUtils.h"
#include "Common/CommonTypes.h"
#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
#include "Core/PowerPC/JitArm64/Jit.h"
#include "Core/PowerPC/PowerPC.h"

#include "../TestValues.h"

#include <gtest/gtest.h>

namespace
{
using namespace Arm64Gen;

// Test harness: emits the frsqrte routine followed by a small wrapper that makes it callable from
// C++ with a raw 64-bit pattern as argument and result.
class TestFrsqrte : public JitArm64
{
public:
// Allocates 4096 bytes of code space, emits GenerateFrsqrte(), then emits the wrapper and stores
// the wrapper's entry point in `frsqrte`.
TestFrsqrte()
{
AllocCodeSpace(4096);

// Start address of the raw routine; the wrapper reaches it with BL.
const u8* raw_frsqrte = GetCodePtr();
GenerateFrsqrte();

// The wrapper begins directly after the routine and is treated as a u64(u64) function.
frsqrte = Common::BitCast<u64 (*)(u64)>(GetCodePtr());
// BL overwrites X30 and the wrapper repoints PPC_REG, so both are stashed in X15/X14 first.
MOV(ARM64Reg::X15, ARM64Reg::X30);
MOV(ARM64Reg::X14, PPC_REG);
MOVP2R(PPC_REG, &PowerPC::ppcState);
// Set up the routine's inputs: X1 = argument bits, D0 = FRSQRTE of the argument.
MOV(ARM64Reg::X1, ARM64Reg::X0);
m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0);
m_float_emit.FRSQRTE(ARM64Reg::D0, ARM64Reg::D0);
BL(raw_frsqrte);
// Restore the stashed registers; X0 is left as the routine produced it and becomes the result.
MOV(ARM64Reg::X30, ARM64Reg::X15);
MOV(PPC_REG, ARM64Reg::X14);
RET();
}

// Entry point of the emitted wrapper; takes and returns raw double bit patterns.
std::function<u64(u64)> frsqrte;
};

} // namespace

// Runs every entry of double_test_values through both Common::ApproximateReciprocalSquareRoot and
// the JIT-emitted frsqrte wrapper and expects bit-identical results. Mismatches are also printed
// as hex bit patterns so the failing input is easy to spot.
TEST(JitArm64, Frsqrte)
{
  TestFrsqrte jit;

  for (const u64 input_bits : double_test_values)
  {
    // Reference value first, then the emitted code, both compared as raw bit patterns.
    const u64 expected = Common::BitCast<u64>(
        Common::ApproximateReciprocalSquareRoot(Common::BitCast<double>(input_bits)));
    const u64 actual = jit.frsqrte(input_bits);

    // Printed order: input -> actual == expected.
    if (actual != expected)
      fmt::print("{:016x} -> {:016x} == {:016x}\n", input_bits, actual, expected);

    EXPECT_EQ(expected, actual);
  }
}
@@ -8,7 +8,7 @@

#include "Common/CommonTypes.h"

constexpr std::array<u64, 49> double_test_values{
constexpr std::array<u64, 57> double_test_values{
// Special values
0x0000'0000'0000'0000, // positive zero
0x0000'0000'0000'0001, // smallest positive denormal
@@ -54,13 +54,25 @@ constexpr std::array<u64, 49> double_test_values{
0x3680'1234'5678'9ABC, 0x36A0'1234'5678'9ABC, 0x36B0'1234'5678'9ABC, 0xB680'1234'5678'9ABC,
0xB6A0'1234'5678'9ABC, 0xB6B0'1234'5678'9ABC,

// (exp > 1148) Boundary case for fres
0x47C0'0000'0000'0000, // 2^125 = fres result is non-zero
0x47D0'0000'0000'0000, // 2^126 = fres result is zero
0xC7C0'0000'0000'0000, // -2^125 = fres result is non-zero
0xC7D0'0000'0000'0000, // -2^126 = fres result is zero

// (exp < 895) Boundary case for fres
0x37F0'0000'0000'0000, // 2^(-128) = fres result is non-max
0x37E0'0000'0000'0000, // 2^(-129) = fres result is max
0xB7F0'0000'0000'0000, // -2^(-128) = fres result is non-max
0xB7E0'0000'0000'0000, // -2^(-129) = fres result is max

// Some typical numbers
0x3FF8'0000'0000'0000, // 1.5
0x408F'4000'0000'0000, // 1000
0xC008'0000'0000'0000, // -3
};

constexpr std::array<u32, 29> single_test_values{
constexpr std::array<u32, 33> single_test_values{
// Special values
0x0000'0000, // positive zero
0x0000'0001, // smallest positive denormal
@@ -89,6 +101,12 @@ constexpr std::array<u32, 29> single_test_values{
0xFFC0'0000, // first negative QNaN
0xFFFF'FFFF, // last negative QNaN

// (exp > 252) Boundary case for fres
// Single-precision layout: sign in bit 31, biased exponent (bias 127) in bits 30-23, so each
// negative entry is the matching positive entry with bit 31 set.
0x7E00'0000, // 2^125 = fres result is non-zero
0x7E80'0000, // 2^126 = fres result is zero
0xFE00'0000, // -2^125 = fres result is non-zero
0xFE80'0000, // -2^126 = fres result is zero

// Some typical numbers
0x3FC0'0000, // 1.5
0x447A'0000, // 1000
@@ -84,6 +84,8 @@
<ItemGroup Condition="'$(Platform)'=='ARM64'">
<ClCompile Include="Core\PowerPC\JitArm64\ConvertSingleDouble.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\FPRF.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\Fres.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\Frsqrte.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\MovI2R.cpp" />
</ItemGroup>
<ItemGroup>