Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #9695 from JosJuice/jitarm64-fres
JitArm64: Implement fres and frsqrte
  • Loading branch information
lioncash committed May 20, 2021
2 parents 539c2cb + e5f2dcd commit 6958df5
Show file tree
Hide file tree
Showing 12 changed files with 474 additions and 40 deletions.
15 changes: 15 additions & 0 deletions Source/Core/Common/Arm64Emitter.cpp
Expand Up @@ -2310,6 +2310,12 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
}

void ARM64FloatEmitter::EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
Write32((1 << 30) | (U << 29) | (0b11110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) |
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
}

void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__);
Expand Down Expand Up @@ -3102,6 +3108,15 @@ void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn);
}

// Scalar FRECPE (floating-point reciprocal estimate): Rd = estimate of 1 / Rn.
// The operand width is derived from Rd via IsDouble().
void ARM64FloatEmitter::FRECPE(ARM64Reg Rd, ARM64Reg Rn)
{
  constexpr u32 frecpe_opcode = 0x1D;
  // The high bit of the size field is always set; the low bit is set for doubles.
  const u32 size_field = 2 | IsDouble(Rd);
  EmitScalar2RegMisc(false, size_field, frecpe_opcode, Rd, Rn);
}
// Scalar FRSQRTE (floating-point reciprocal square root estimate).
// Shares the opcode used by scalar FRECPE; only the U bit differs.
void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn)
{
  constexpr u32 frsqrte_opcode = 0x1D;
  // The high bit of the size field is always set; the low bit is set for doubles.
  const u32 size_field = 2 | IsDouble(Rd);
  EmitScalar2RegMisc(true, size_field, frsqrte_opcode, Rd, Rn);
}

// Scalar - 2 Source
void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
Expand Down
3 changes: 3 additions & 0 deletions Source/Core/Common/Arm64Emitter.h
Expand Up @@ -996,6 +996,8 @@ class ARM64FloatEmitter
void FNEG(ARM64Reg Rd, ARM64Reg Rn);
void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP
// Scalar reciprocal estimate (FRECPE) and reciprocal square root estimate (FRSQRTE).
// The operand width (single/double) is taken from Rd.
void FRECPE(ARM64Reg Rd, ARM64Reg Rn);
void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);

// Scalar - 2 Source
void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
Expand Down Expand Up @@ -1145,6 +1147,7 @@ class ARM64FloatEmitter
ARM64Reg Rm);
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
// Writes one scalar two-register miscellaneous instruction word; U, size and opcode select
// the operation (used by the scalar FRECPE/FRSQRTE emitters).
void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
ARM64Reg Rn);
Expand Down
6 changes: 6 additions & 0 deletions Source/Core/Core/PowerPC/JitArm64/Jit.h
Expand Up @@ -140,13 +140,17 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
void fcmpX(UGeckoInstruction inst);
void frspx(UGeckoInstruction inst);
void fctiwzx(UGeckoInstruction inst);
// Reciprocal estimate (fresx) and reciprocal square root estimate (frsqrtex).
void fresx(UGeckoInstruction inst);
void frsqrtex(UGeckoInstruction inst);

// Paired
void ps_maddXX(UGeckoInstruction inst);
void ps_mergeXX(UGeckoInstruction inst);
void ps_mulsX(UGeckoInstruction inst);
void ps_sel(UGeckoInstruction inst);
void ps_sumX(UGeckoInstruction inst);
// Paired reciprocal estimate (ps_res) and paired reciprocal square root estimate (ps_rsqrte).
void ps_res(UGeckoInstruction inst);
void ps_rsqrte(UGeckoInstruction inst);

// Loadstore paired
void psq_l(UGeckoInstruction inst);
Expand Down Expand Up @@ -232,6 +236,8 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
// AsmRoutines
void GenerateAsm();
void GenerateCommonAsm();
// NOTE(review): presumably these generate the shared routines that the JIT reaches through
// GetAsmRoutines()->fres and GetAsmRoutines()->frsqrte - confirm against their definitions.
void GenerateFres();
void GenerateFrsqrte();
void GenerateConvertDoubleToSingle();
void GenerateConvertSingleToDouble();
void GenerateFPRF(bool single);
Expand Down
66 changes: 64 additions & 2 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
Expand Up @@ -24,8 +24,16 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)

gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);

reg = single ? EncodeRegToSingle(reg) : EncodeRegToDouble(reg);
m_float_emit.FMOV(single ? ARM64Reg::W0 : ARM64Reg::X0, reg);
const ARM64Reg routine_input_reg = single ? ARM64Reg::W0 : ARM64Reg::X0;
if (IsVector(reg))
{
m_float_emit.FMOV(routine_input_reg, single ? EncodeRegToSingle(reg) : EncodeRegToDouble(reg));
}
else if (reg != routine_input_reg)
{
MOV(routine_input_reg, reg);
}

BL(single ? GetAsmRoutines()->fprf_single : GetAsmRoutines()->fprf_double);

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
Expand Down Expand Up @@ -430,6 +438,60 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
"Register allocation turned singles into doubles in the middle of fctiwzx");
}

// JIT handler for fresx (reciprocal estimate): emits host code that reads FPR[FB], calls the
// shared fres routine and stores the result in FPR[FD].
//
// As set up here, the routine is given the raw 64-bit input in X1 and the host FRECPE
// estimate in D0, and its raw 64-bit result is read back from X0.
void JitArm64::fresx(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITFloatingPointOff);
// The record form (Rc set) is not handled by this implementation.
FALLBACK_IF(inst.Rc);

const u32 b = inst.FB;
const u32 d = inst.FD;

// Reserve the registers involved in the call: BL overwrites X30, X1/Q0 carry the inputs and
// X0 the result. NOTE(review): W2-W4 are presumably scratch registers of the routine - confirm.
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0);

const ARM64Reg VB = fpr.R(b, RegType::LowerPair);
// X1 = raw bits of the input double, D0 = host reciprocal estimate of the same value.
m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB));
m_float_emit.FRECPE(ARM64Reg::D0, EncodeRegToDouble(VB));

BL(GetAsmRoutines()->fres);

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0);

// The destination is requested as RegType::Duplicated, unlike frsqrtex which uses LowerPair.
// NOTE(review): presumably because fres is a single-precision operation whose result is
// mirrored into both pair slots - confirm.
const ARM64Reg VD = fpr.RW(d, RegType::Duplicated);
m_float_emit.FMOV(EncodeRegToDouble(VD), ARM64Reg::X0);

// X0 still holds the raw result bits; pass the GPR directly (single = false).
SetFPRFIfNeeded(false, ARM64Reg::X0);
}

// JIT handler for frsqrtex (reciprocal square root estimate): emits host code that reads
// FPR[FB], calls the shared frsqrte routine and stores the result in FPR[FD].
//
// As set up here, the routine is given the raw 64-bit input in X1 and the host FRSQRTE
// estimate in D0, and its raw 64-bit result is read back from X0.
void JitArm64::frsqrtex(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITFloatingPointOff);
// The record form (Rc set) is not handled by this implementation.
FALLBACK_IF(inst.Rc);

const u32 b = inst.FB;
const u32 d = inst.FD;

// Reserve the registers involved in the call: BL overwrites X30, X1/Q0 carry the inputs and
// X0 the result. NOTE(review): W2-W4 are presumably scratch registers of the routine - confirm.
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0);

const ARM64Reg VB = fpr.R(b, RegType::LowerPair);
// X1 = raw bits of the input double, D0 = host reciprocal square root estimate of it.
m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB));
m_float_emit.FRSQRTE(ARM64Reg::D0, EncodeRegToDouble(VB));

BL(GetAsmRoutines()->frsqrte);

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0);

// Only the lower pair slot of the destination is written here.
const ARM64Reg VD = fpr.RW(d, RegType::LowerPair);
m_float_emit.FMOV(EncodeRegToDouble(VD), ARM64Reg::X0);

// X0 still holds the raw result bits; pass the GPR directly (single = false).
SetFPRFIfNeeded(false, ARM64Reg::X0);
}

// Since the following float conversion functions are used in non-arithmetic PPC float
// instructions, they must convert floats bit-exactly and never flush denormals to zero or turn
// SNaNs into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN.
Expand Down
64 changes: 64 additions & 0 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
Expand Up @@ -353,3 +353,67 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)

SetFPRFIfNeeded(true, VD);
}

// JIT handler for ps_res (paired reciprocal estimate): runs the shared fres routine once per
// 64-bit lane of FPR[FB] and assembles the two results in FPR[FD].
//
// As set up here, each call is given the raw lane bits in X1 and the host estimate for that
// lane in D0, and the raw result is read back from X0.
void JitArm64::ps_res(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITPairedOff);
// The record form (Rc set) is not handled by this implementation.
FALLBACK_IF(inst.Rc);

const u32 b = inst.FB;
const u32 d = inst.FD;

// Reserve the registers involved in the calls: BL overwrites X30, X1/Q0 carry the inputs and
// X0 the result. NOTE(review): W2-W4 are presumably scratch registers of the routine - confirm.
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0);

const ARM64Reg VB = fpr.R(b, RegType::Register);
const ARM64Reg VD = fpr.RW(d, RegType::Register);

// Lane 0: X1 = raw input bits; the vector FRECPE fills Q0 with the estimates of both lanes.
m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB));
m_float_emit.FRECPE(64, ARM64Reg::Q0, EncodeRegToQuad(VB));
BL(GetAsmRoutines()->fres);
// Lane 1: fetch the input from VB and move its estimate down into D0 *before* VD is written,
// so the sequence stays correct when VD and VB are the same host register.
m_float_emit.UMOV(64, ARM64Reg::X1, EncodeRegToQuad(VB), 1);
m_float_emit.DUP(64, ARM64Reg::Q0, ARM64Reg::Q0, 1);
// Store the lane 0 result, then compute lane 1 and insert it into element 1 of VD.
m_float_emit.FMOV(EncodeRegToDouble(VD), ARM64Reg::X0);
BL(GetAsmRoutines()->fres);
m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0);

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0);

// NOTE(review): FixSinglePrecision is defined elsewhere; presumably it lets the register cache
// treat the result as single precision - confirm.
fpr.FixSinglePrecision(d);

SetFPRFIfNeeded(true, VD);
}

// JIT handler for ps_rsqrte (paired reciprocal square root estimate): runs the shared frsqrte
// routine once per 64-bit lane of FPR[FB] and assembles the two results in FPR[FD].
//
// As set up here, each call is given the raw lane bits in X1 and the host estimate for that
// lane in D0, and the raw result is read back from X0.
void JitArm64::ps_rsqrte(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITPairedOff);
// The record form (Rc set) is not handled by this implementation.
FALLBACK_IF(inst.Rc);

const u32 b = inst.FB;
const u32 d = inst.FD;

// Reserve the registers involved in the calls: BL overwrites X30, X1/Q0 carry the inputs and
// X0 the result. NOTE(review): W2-W4 are presumably scratch registers of the routine - confirm.
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0);

const ARM64Reg VB = fpr.R(b, RegType::Register);
const ARM64Reg VD = fpr.RW(d, RegType::Register);

// Lane 0: X1 = raw input bits; the vector FRSQRTE fills Q0 with the estimates of both lanes.
m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB));
m_float_emit.FRSQRTE(64, ARM64Reg::Q0, EncodeRegToQuad(VB));
BL(GetAsmRoutines()->frsqrte);
// Lane 1: fetch the input from VB and move its estimate down into D0 *before* VD is written,
// so the sequence stays correct when VD and VB are the same host register.
m_float_emit.UMOV(64, ARM64Reg::X1, EncodeRegToQuad(VB), 1);
m_float_emit.DUP(64, ARM64Reg::Q0, ARM64Reg::Q0, 1);
// Store the lane 0 result, then compute lane 1 and insert it into element 1 of VD.
m_float_emit.FMOV(EncodeRegToDouble(VD), ARM64Reg::X0);
BL(GetAsmRoutines()->frsqrte);
m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0);

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0);

// NOTE(review): FixSinglePrecision is defined elsewhere; presumably it lets the register cache
// treat the result as single precision - confirm.
fpr.FixSinglePrecision(d);

SetFPRFIfNeeded(true, VD);
}
72 changes: 36 additions & 36 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
Expand Up @@ -106,23 +106,23 @@ constexpr std::array<GekkoOPTemplate, 13> table4{{
}};

constexpr std::array<GekkoOPTemplate, 17> table4_2{{
{10, &JitArm64::ps_sumX}, // ps_sum0
{11, &JitArm64::ps_sumX}, // ps_sum1
{12, &JitArm64::ps_mulsX}, // ps_muls0
{13, &JitArm64::ps_mulsX}, // ps_muls1
{14, &JitArm64::ps_maddXX}, // ps_madds0
{15, &JitArm64::ps_maddXX}, // ps_madds1
{18, &JitArm64::fp_arith}, // ps_div
{20, &JitArm64::fp_arith}, // ps_sub
{21, &JitArm64::fp_arith}, // ps_add
{23, &JitArm64::ps_sel}, // ps_sel
{24, &JitArm64::FallBackToInterpreter}, // ps_res
{25, &JitArm64::fp_arith}, // ps_mul
{26, &JitArm64::FallBackToInterpreter}, // ps_rsqrte
{28, &JitArm64::ps_maddXX}, // ps_msub
{29, &JitArm64::ps_maddXX}, // ps_madd
{30, &JitArm64::ps_maddXX}, // ps_nmsub
{31, &JitArm64::ps_maddXX}, // ps_nmadd
{10, &JitArm64::ps_sumX}, // ps_sum0
{11, &JitArm64::ps_sumX}, // ps_sum1
{12, &JitArm64::ps_mulsX}, // ps_muls0
{13, &JitArm64::ps_mulsX}, // ps_muls1
{14, &JitArm64::ps_maddXX}, // ps_madds0
{15, &JitArm64::ps_maddXX}, // ps_madds1
{18, &JitArm64::fp_arith}, // ps_div
{20, &JitArm64::fp_arith}, // ps_sub
{21, &JitArm64::fp_arith}, // ps_add
{23, &JitArm64::ps_sel}, // ps_sel
{24, &JitArm64::ps_res}, // ps_res
{25, &JitArm64::fp_arith}, // ps_mul
{26, &JitArm64::ps_rsqrte}, // ps_rsqrte
{28, &JitArm64::ps_maddXX}, // ps_msub
{29, &JitArm64::ps_maddXX}, // ps_madd
{30, &JitArm64::ps_maddXX}, // ps_nmsub
{31, &JitArm64::ps_maddXX}, // ps_nmadd
}};

constexpr std::array<GekkoOPTemplate, 4> table4_3{{
Expand Down Expand Up @@ -293,15 +293,15 @@ constexpr std::array<GekkoOPTemplate, 107> table31{{
}};

constexpr std::array<GekkoOPTemplate, 9> table59{{
{18, &JitArm64::fp_arith}, // fdivsx
{20, &JitArm64::fp_arith}, // fsubsx
{21, &JitArm64::fp_arith}, // faddsx
{24, &JitArm64::FallBackToInterpreter}, // fresx
{25, &JitArm64::fp_arith}, // fmulsx
{28, &JitArm64::fp_arith}, // fmsubsx
{29, &JitArm64::fp_arith}, // fmaddsx
{30, &JitArm64::fp_arith}, // fnmsubsx
{31, &JitArm64::fp_arith}, // fnmaddsx
{18, &JitArm64::fp_arith}, // fdivsx
{20, &JitArm64::fp_arith}, // fsubsx
{21, &JitArm64::fp_arith}, // faddsx
{24, &JitArm64::fresx}, // fresx
{25, &JitArm64::fp_arith}, // fmulsx
{28, &JitArm64::fp_arith}, // fmsubsx
{29, &JitArm64::fp_arith}, // fmaddsx
{30, &JitArm64::fp_arith}, // fnmsubsx
{31, &JitArm64::fp_arith}, // fnmaddsx
}};

constexpr std::array<GekkoOPTemplate, 15> table63{{
Expand All @@ -324,16 +324,16 @@ constexpr std::array<GekkoOPTemplate, 15> table63{{
}};

constexpr std::array<GekkoOPTemplate, 10> table63_2{{
{18, &JitArm64::fp_arith}, // fdivx
{20, &JitArm64::fp_arith}, // fsubx
{21, &JitArm64::fp_arith}, // faddx
{23, &JitArm64::fselx}, // fselx
{25, &JitArm64::fp_arith}, // fmulx
{26, &JitArm64::FallBackToInterpreter}, // frsqrtex
{28, &JitArm64::fp_arith}, // fmsubx
{29, &JitArm64::fp_arith}, // fmaddx
{30, &JitArm64::fp_arith}, // fnmsubx
{31, &JitArm64::fp_arith}, // fnmaddx
{18, &JitArm64::fp_arith}, // fdivx
{20, &JitArm64::fp_arith}, // fsubx
{21, &JitArm64::fp_arith}, // faddx
{23, &JitArm64::fselx}, // fselx
{25, &JitArm64::fp_arith}, // fmulx
{26, &JitArm64::frsqrtex}, // frsqrtex
{28, &JitArm64::fp_arith}, // fmsubx
{29, &JitArm64::fp_arith}, // fmaddx
{30, &JitArm64::fp_arith}, // fnmsubx
{31, &JitArm64::fp_arith}, // fnmaddx
}};

constexpr std::array<JitArm64::Instruction, 64> dynaOpTable = [] {
Expand Down

0 comments on commit 6958df5

Please sign in to comment.