// Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp

// Copyright 2015 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "Core/PowerPC/JitArm64/Jit.h"

#include <optional>

#include "Common/Arm64Emitter.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/Config/Config.h"
#include "Common/StringUtil.h"

#include "Core/Config/SessionSettings.h"
#include "Core/ConfigManager.h"
#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
#include "Core/PowerPC/PPCTables.h"
#include "Core/PowerPC/PowerPC.h"

using namespace Arm64Gen;

// Emits a call to one of the shared FPRF helper routines for the value held in `reg`. Nothing is
// emitted unless m_fprf is set and the current instruction is flagged as wanting FPRF.
//
// single: true selects fprf_single (argument passed in W0), false selects fprf_double (X0).
// reg:    host register holding the value; may be a vector register or a general-purpose one.
void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)
{
  const bool fprf_wanted = m_fprf && js.op->wantsFPRF;
  if (!fprf_wanted)
    return;

  // Reserve these registers for the duration of the call (BL itself writes the link register)
  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);

  if (single)
  {
    // Place the 32-bit value in W0, skipping the move if it already lives there
    if (IsVector(reg))
      m_float_emit.FMOV(ARM64Reg::W0, EncodeRegToSingle(reg));
    else if (reg != ARM64Reg::W0)
      MOV(ARM64Reg::W0, reg);

    BL(GetAsmRoutines()->fprf_single);
  }
  else
  {
    // Place the 64-bit value in X0, skipping the move if it already lives there
    if (IsVector(reg))
      m_float_emit.FMOV(ARM64Reg::X0, EncodeRegToDouble(reg));
    else if (reg != ARM64Reg::X0)
      MOV(ARM64Reg::X0, reg);

    BL(GetAsmRoutines()->fprf_double);
  }

  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
}

// PowerPC truncates the right-hand operand of a single precision multiply in an unusual way:
// the low 28 bits of the mantissa are dropped, with round-to-nearest applied while doing so.
// This emits a rounding right shift followed by a left shift by the same amount to reproduce
// that. Quad inputs are processed as 64-bit vector elements, everything else as a scalar.
void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input)
{
  constexpr int dropped_mantissa_bits = 28;

  if (!IsQuad(input))
  {
    m_float_emit.URSHR(output, input, dropped_mantissa_bits);
    m_float_emit.SHL(output, output, dropped_mantissa_bits);
    return;
  }

  m_float_emit.URSHR(64, output, input, dropped_mantissa_bits);
  m_float_emit.SHL(64, output, output, dropped_mantissa_bits);
}

// JIT handler for the scalar floating-point arithmetic instructions. SUBOP5 selects the
// operation: 18 divides, 20 subtracts, 21 adds, 25 multiplies, 28/30 compute A*C - B and 29/31
// compute A*C + B (30 and 31 additionally negate the result). OPCD 59 marks the forms that
// produce a single-precision result.
//
// The handler bails out through FALLBACK_IF for Rc forms, when FP exceptions are enabled, and
// for division when divide-by-zero exceptions are enabled.
//
// When m_accurate_nans is set, extra code is emitted (mostly in far code) so that a NaN result
// is taken from the first NaN input with its quiet bit set, and so that NaN results are not
// passed through the final negation.
void JitArm64::fp_arith(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);
  FALLBACK_IF(jo.fp_exceptions || (jo.div_by_zero_exceptions && inst.SUBOP5 == 18));

  const u32 a = inst.FA;
  const u32 b = inst.FB;
  const u32 c = inst.FC;
  const u32 d = inst.FD;
  const u32 op5 = inst.SUBOP5;

  const bool use_c = op5 >= 25;  // fmul and all kind of fmaddXX
  const bool use_b = op5 != 25;  // fmul uses no B
  const bool fma = use_b && use_c;
  const bool negate_result = (op5 & ~0x1) == 30;

  // Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
  const bool can_generate_nan = (op5 & ~0x1) != 20;

  const bool output_is_single = inst.OPCD == 59;
  // If the session setting disables fused multiply-add, emit a separate multiply and add/sub
  const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
  // C only needs the 25-bit rounding when the result is single and C isn't known to be single
  const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];

  // Evaluated again at the end to check that register allocation didn't change the answer
  const auto inputs_are_singles_func = [&] {
    return fpr.IsSingle(a, true) && (!use_b || fpr.IsSingle(b, true)) &&
           (!use_c || fpr.IsSingle(c, true));
  };
  const bool inputs_are_singles = inputs_are_singles_func();

  // Single-precision host instructions are only used when every input is already held as a
  // single and the result is single as well
  const bool single = inputs_are_singles && output_is_single;
  const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
  const RegType type_out =
      output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
                         RegType::LowerPair;
  const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;

  const ARM64Reg VA = reg_encoder(fpr.R(a, type));
  const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
  const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
  const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out));

  // Temporary host FPRs, allocated on demand below and released near the end of the function
  ARM64Reg V0Q = ARM64Reg::INVALID_REG;
  ARM64Reg V1Q = ARM64Reg::INVALID_REG;

  ARM64Reg rounded_c_reg = VC;
  if (round_c)
  {
    ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");

    V0Q = fpr.GetReg();
    rounded_c_reg = reg_encoder(V0Q);
    Force25BitPrecision(rounded_c_reg, VC);
  }

  // In the two-instruction multiply-add sequence, the product is written before B is read.
  // If D and B share a host register, the product has to go to a temporary instead.
  ARM64Reg inaccurate_fma_reg = VD;
  if (fma && inaccurate_fma && VD == VB)
  {
    if (V0Q == ARM64Reg::INVALID_REG)
      V0Q = fpr.GetReg();
    inaccurate_fma_reg = reg_encoder(V0Q);
  }

  // The NaN handling below reads the inputs again after the arithmetic instruction has run,
  // so if D shares a host register with an input, the result is computed into a temporary
  ARM64Reg result_reg = VD;
  const bool preserve_d =
      m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
  if (preserve_d)
  {
    V1Q = fpr.GetReg();
    result_reg = reg_encoder(V1Q);
  }

  // EmitQuietNaNBitConstant requires a GPR to build the constant in the non-single case
  const ARM64Reg temp_gpr = m_accurate_nans && !single ? gpr.GetReg() : ARM64Reg::INVALID_REG;

  // V0Q will hold the quiet bit constant in the NaN handling code
  if (m_accurate_nans)
  {
    if (V0Q == ARM64Reg::INVALID_REG)
      V0Q = fpr.GetReg();
  }

  switch (op5)
  {
  case 18:
    m_float_emit.FDIV(result_reg, VA, VB);
    break;
  case 20:
    m_float_emit.FSUB(result_reg, VA, VB);
    break;
  case 21:
    m_float_emit.FADD(result_reg, VA, VB);
    break;
  case 25:
    m_float_emit.FMUL(result_reg, VA, rounded_c_reg);
    break;
  // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
  // the subtly different definitions affect how signed zeroes are handled.
  // Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
  // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
  case 28:  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
  case 30:  // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
    if (inaccurate_fma)
    {
      m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
      m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB);
    }
    else
    {
      m_float_emit.FNMSUB(result_reg, VA, rounded_c_reg, VB);
    }
    break;
  case 29:  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
  case 31:  // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
    if (inaccurate_fma)
    {
      m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
      m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB);
    }
    else
    {
      m_float_emit.FMADD(result_reg, VA, rounded_c_reg, VB);
    }
    break;
  default:
    ASSERT_MSG(DYNA_REC, 0, "fp_arith");
    break;
  }

  // Branches emitted in the NaN handling code that must land after the final FNEG/MOV
  std::vector<FixupBranch> nan_fixups;
  if (m_accurate_nans)
  {
    // Check if we need to handle NaNs
    m_float_emit.FCMP(result_reg);
    FixupBranch no_nan = B(CCFlags::CC_VC);
    FixupBranch nan = B();
    SetJumpTarget(no_nan);

    // Everything up to SwitchToNearCode is emitted in far code and only runs for NaN results
    SwitchToFarCode();
    SetJumpTarget(nan);

    const ARM64Reg quiet_bit_reg = reg_encoder(V0Q);

    EmitQuietNaNBitConstant(quiet_bit_reg, inputs_are_singles && output_is_single, temp_gpr);

    // Collect the distinct input host registers, in A, B, C order
    std::vector<ARM64Reg> inputs;
    inputs.push_back(VA);
    if (use_b && VA != VB)
      inputs.push_back(VB);
    if (use_c && VA != VC && (!use_b || VB != VC))
      inputs.push_back(VC);

    // If any inputs are NaNs, pick the first NaN of them and OR it with the quiet bit
    for (size_t i = 0; i < inputs.size(); ++i)
    {
      // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
      // least one NaN input
      const bool check_input = can_generate_nan || i != inputs.size() - 1;

      const ARM64Reg input = inputs[i];
      FixupBranch skip;
      if (check_input)
      {
        m_float_emit.FCMP(input);
        skip = B(CCFlags::CC_VC);
      }

      // D = input with the quiet bit set; then jump past the final FNEG/MOV
      m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(input),
                       EncodeRegToDouble(quiet_bit_reg));
      nan_fixups.push_back(B());

      if (check_input)
        SetJumpTarget(skip);
    }

    std::optional<FixupBranch> nan_early_fixup;
    if (can_generate_nan)
    {
      // There was no NaN in any of the inputs, so the NaN must have been generated by the
      // arithmetic instruction. In this case, the result is already correct.
      if (negate_result)
      {
        // Copy the result to D here and skip the FNEG that would otherwise be applied to it
        if (result_reg != VD)
          m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
        nan_fixups.push_back(B());
      }
      else
      {
        // Without a negation, rejoin the near code just before the final MOV
        nan_early_fixup = B();
      }
    }

    SwitchToNearCode();

    if (nan_early_fixup)
      SetJumpTarget(*nan_early_fixup);
  }

  // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
  // for any of AArch64's FMA instructions, so we negate using a separate instruction.
  if (negate_result)
    m_float_emit.FNEG(VD, result_reg);
  else if (result_reg != VD)
    m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));

  for (FixupBranch fixup : nan_fixups)
    SetJumpTarget(fixup);

  if (V0Q != ARM64Reg::INVALID_REG)
    fpr.Unlock(V0Q);
  if (V1Q != ARM64Reg::INVALID_REG)
    fpr.Unlock(V1Q);
  if (temp_gpr != ARM64Reg::INVALID_REG)
    gpr.Unlock(temp_gpr);

  if (output_is_single)
  {
    ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(),
               "Register allocation turned singles into doubles in the middle of fp_arith");

    fpr.FixSinglePrecision(d);
  }

  SetFPRFIfNeeded(output_is_single, VD);
}

// JIT handler for the register-to-register move and sign manipulation instructions, in both
// their scalar and paired (OPCD 4) forms. SUBOP10 selects the operation:
//   40  -> D = -B
//   72  -> D = B
//   136 -> D = -|B|
//   264 -> D = |B|
// Scalar forms request only the lower element from the register cache, while paired forms
// request the whole register. Rc forms bail out through FALLBACK_IF.
void JitArm64::fp_logic(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);

  const u32 src = inst.FB;
  const u32 dst = inst.FD;
  const u32 subop = inst.SUBOP10;
  const bool is_paired = inst.OPCD == 4;

  // Moving a register onto itself requires no code at all
  if (subop == 72 && src == dst)
    return;

  const bool single = fpr.IsSingle(src, !is_paired);
  const u8 element_bits = single ? 32 : 64;

  if (!is_paired)
  {
    const RegType scalar_type = single ? RegType::LowerPairSingle : RegType::LowerPair;
    const auto encode = single ? EncodeRegToSingle : EncodeRegToDouble;

    const ARM64Reg VB = fpr.R(src, scalar_type);
    const ARM64Reg VD = fpr.RW(dst, scalar_type);

    const ARM64Reg in = encode(VB);
    const ARM64Reg out = encode(VD);

    switch (subop)
    {
    case 40:
      m_float_emit.FNEG(out, in);
      break;
    case 72:
      // Element insert rather than a scalar move, using the registers as handed out by the cache
      m_float_emit.INS(element_bits, VD, 0, VB, 0);
      break;
    case 136:
      m_float_emit.FABS(out, in);
      m_float_emit.FNEG(out, out);
      break;
    case 264:
      m_float_emit.FABS(out, in);
      break;
    default:
      ASSERT_MSG(DYNA_REC, 0, "fp_logic");
      break;
    }
  }
  else
  {
    // A pair of singles fits in a 64-bit vector register, a pair of doubles needs all 128 bits
    const RegType paired_type = single ? RegType::Single : RegType::Register;
    const auto encode = single ? EncodeRegToDouble : EncodeRegToQuad;

    const ARM64Reg in = encode(fpr.R(src, paired_type));
    const ARM64Reg out = encode(fpr.RW(dst, paired_type));

    switch (subop)
    {
    case 40:
      m_float_emit.FNEG(element_bits, out, in);
      break;
    case 72:
      m_float_emit.ORR(out, in, in);
      break;
    case 136:
      m_float_emit.FABS(element_bits, out, in);
      m_float_emit.FNEG(element_bits, out, out);
      break;
    case 264:
      m_float_emit.FABS(element_bits, out, in);
      break;
    default:
      ASSERT_MSG(DYNA_REC, 0, "fp_logic");
      break;
    }
  }

  ASSERT_MSG(DYNA_REC, single == fpr.IsSingle(src, !is_paired),
             "Register allocation turned singles into doubles in the middle of fp_logic");
}

// JIT handler for the floating-point select instruction: an FCMPE on A followed by an FCSEL
// that picks C when the GE condition holds and B otherwise. B, C and D always share one
// precision; A may be handled in a different precision because it is only compared.
// Rc forms bail out through FALLBACK_IF.
void JitArm64::fselx(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);

  const u32 ra = inst.FA;
  const u32 rb = inst.FB;
  const u32 rc = inst.FC;
  const u32 rd = inst.FD;

  const bool operands_single = fpr.IsSingle(rb, true) && fpr.IsSingle(rc, true);
  const RegType operand_type = operands_single ? RegType::LowerPairSingle : RegType::LowerPair;
  const auto encode_operand = operands_single ? EncodeRegToSingle : EncodeRegToDouble;

  // A can stay single if the operands are single too, or if A is a different guest register
  // from both operands (so that loading the operands as doubles cannot affect it)
  const bool cond_single =
      fpr.IsSingle(ra, true) && (operands_single || !(ra == rb || ra == rc));
  const RegType cond_type = cond_single ? RegType::LowerPairSingle : RegType::LowerPair;
  const auto encode_cond = cond_single ? EncodeRegToSingle : EncodeRegToDouble;

  const ARM64Reg VA = fpr.R(ra, cond_type);
  const ARM64Reg VB = fpr.R(rb, operand_type);
  const ARM64Reg VC = fpr.R(rc, operand_type);

  // The RW call below is allowed to change the type of A to double when A and D are the same
  // guest register, since RW leaves the actual register contents alone. That is why this
  // assertion has to run before RW rather than after it.
  ASSERT_MSG(DYNA_REC, cond_single == fpr.IsSingle(ra, true),
             "Register allocation turned singles into doubles in the middle of fselx");

  const ARM64Reg VD = fpr.RW(rd, operand_type);

  m_float_emit.FCMPE(encode_cond(VA));
  m_float_emit.FCSEL(encode_operand(VD), encode_operand(VC), encode_operand(VB), CC_GE);

  ASSERT_MSG(DYNA_REC, operands_single == (fpr.IsSingle(rb, true) && fpr.IsSingle(rc, true)),
             "Register allocation turned singles into doubles in the middle of fselx");
}

// JIT handler for the round-to-single-precision instruction. The result is written to D as a
// duplicated single, and FPRF is updated through SetFPRFIfNeeded. Rc forms and enabled FP
// exceptions bail out through FALLBACK_IF.
void JitArm64::frspx(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);
  FALLBACK_IF(jo.fp_exceptions);

  const u32 src = inst.FB;
  const u32 dst = inst.FD;

  // When the source is already held as a single and is flagged store safe, no conversion is
  // needed; the value only has to end up in the destination register
  const bool already_single = fpr.IsSingle(src, true) && js.fpr_is_store_safe[src];

  const ARM64Reg VB = fpr.R(src, already_single ? RegType::LowerPairSingle : RegType::LowerPair);
  const ARM64Reg VD = fpr.RW(dst, RegType::DuplicatedSingle);

  if (already_single)
  {
    if (src != dst)
      m_float_emit.FMOV(EncodeRegToSingle(VD), EncodeRegToSingle(VB));

    ASSERT_MSG(DYNA_REC, fpr.IsSingle(src, true),
               "Register allocation turned singles into doubles in the middle of frspx");
  }
  else
  {
    // Narrow the double in VB to a single in VD
    m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
  }

  SetFPRFIfNeeded(true, VD);
}

// Emits a floating-point comparison of guest registers FA and FB. The outcome is written to
// the host register bound to CR field CRFD and, when FPRF emulation is enabled and wanted for
// this instruction, also into the FPCC bits of FPSCR.
//
// inst:  the instruction being compiled; supplies FA, FB and CRFD.
// upper: if true, the second element of each paired register is compared instead of the
//        first, unless the register is flagged as duplicated, in which case the first
//        element is used.
void JitArm64::FloatCompare(UGeckoInstruction inst, bool upper)
{
  const bool fprf = m_fprf && js.op->wantsFPRF;

  const u32 a = inst.FA;
  const u32 b = inst.FB;
  const int crf = inst.CRFD;

  // On the GC/Wii CPU, outputs are flushed to zero if FPSCR.NI is set, and inputs are never
  // flushed to zero. Ideally we would emulate FPSCR.NI by setting FPCR.FZ and FPCR.AH, but
  // unfortunately FPCR.AH is a very new feature that we can't rely on (as of 2021). For CPUs
  // without FPCR.AH, the best we can do (without killing the performance by explicitly flushing
  // outputs using bitwise operations) is to only set FPCR.FZ, which flushes both inputs and
  // outputs. This may cause problems in some cases, and one such case is Pokémon Battle Revolution,
  // which does not progress past the title screen if a denormal single compares equal to zero.
  // Workaround: Perform the comparison using a double operation instead. This ensures that denormal
  // singles behave correctly in comparisons, but we still have a problem with denormal doubles.
  const bool input_ftz_workaround =
      !cpu_info.bAFP && (!js.fpr_is_store_safe[a] || !js.fpr_is_store_safe[b]);

  const bool singles = fpr.IsSingle(a, !upper) && fpr.IsSingle(b, !upper) && !input_ftz_workaround;
  const RegType lower_type = singles ? RegType::LowerPairSingle : RegType::LowerPair;
  const RegType upper_type = singles ? RegType::Single : RegType::Register;
  const auto reg_encoder = singles ? EncodeRegToSingle : EncodeRegToDouble;
  const auto paired_reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;

  // A duplicated register can be compared through its lower element even for upper compares
  const bool upper_a = upper && !js.op->fprIsDuplicated[a];
  const bool upper_b = upper && !js.op->fprIsDuplicated[b];
  ARM64Reg VA = reg_encoder(fpr.R(a, upper_a ? upper_type : lower_type));
  ARM64Reg VB = reg_encoder(fpr.R(b, upper_b ? upper_type : lower_type));

  gpr.BindCRToRegister(crf, false);
  const ARM64Reg XA = gpr.CR(crf);

  ARM64Reg fpscr_reg = ARM64Reg::INVALID_REG;
  if (fprf)
  {
    // Load FPSCR and clear the bits covered by FPCC_MASK; the new value is ORed in below
    fpscr_reg = gpr.GetReg();
    LDR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr));
    AND(fpscr_reg, fpscr_reg, LogicalImm(~FPCC_MASK, 32));
  }

  // For upper compares, copy element 1 into a temporary so that the scalar FCMP can read it
  ARM64Reg V0Q = ARM64Reg::INVALID_REG;
  ARM64Reg V1Q = ARM64Reg::INVALID_REG;
  if (upper_a)
  {
    V0Q = fpr.GetReg();
    m_float_emit.DUP(singles ? 32 : 64, paired_reg_encoder(V0Q), paired_reg_encoder(VA), 1);
    VA = reg_encoder(V0Q);
  }
  if (upper_b)
  {
    if (a == b)
    {
      // Same guest register on both sides, so reuse the element extracted for A
      VB = VA;
    }
    else
    {
      V1Q = fpr.GetReg();
      m_float_emit.DUP(singles ? 32 : 64, paired_reg_encoder(V1Q), paired_reg_encoder(VB), 1);
      VB = reg_encoder(V1Q);
    }
  }

  m_float_emit.FCMP(VA, VB);

  if (V0Q != ARM64Reg::INVALID_REG)
    fpr.Unlock(V0Q);
  if (V1Q != ARM64Reg::INVALID_REG)
    fpr.Unlock(V1Q);

  FixupBranch pNaN, pLesser, pGreater;
  FixupBranch continue1, continue2, continue3;

  // A register compared with itself can only be equal or unordered, so the greater/lesser
  // paths are omitted when a == b
  if (a != b)
  {
    // if A > B, goto Greater's jump target
    pGreater = B(CC_GT);
    // if A < B, goto Lesser's jump target
    pLesser = B(CC_MI);
  }

  // if the comparison was unordered, goto NaN's jump target
  pNaN = B(CC_VS);

  // A == B
  MOVI2R(XA, 0);
  if (fprf)
    ORR(fpscr_reg, fpscr_reg, LogicalImm(PowerPC::CR_EQ << FPRF_SHIFT, 32));

  continue1 = B();

  // Unordered (at least one operand is a NaN)
  SetJumpTarget(pNaN);
  MOVI2R(XA, ~(1ULL << PowerPC::CR_EMU_LT_BIT));
  if (fprf)
    ORR(fpscr_reg, fpscr_reg, LogicalImm(PowerPC::CR_SO << FPRF_SHIFT, 32));

  if (a != b)
  {
    continue2 = B();

    // A > B
    SetJumpTarget(pGreater);
    MOVI2R(XA, 1);
    if (fprf)
      ORR(fpscr_reg, fpscr_reg, LogicalImm(PowerPC::CR_GT << FPRF_SHIFT, 32));

    continue3 = B();

    // A < B
    SetJumpTarget(pLesser);
    MOVI2R(XA, ~(1ULL << PowerPC::CR_EMU_SO_BIT));
    if (fprf)
      ORR(fpscr_reg, fpscr_reg, LogicalImm(PowerPC::CR_LT << FPRF_SHIFT, 32));

    SetJumpTarget(continue2);
    SetJumpTarget(continue3);
  }
  SetJumpTarget(continue1);

  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a, true) && fpr.IsSingle(b, true)),
             "Register allocation turned singles into doubles in the middle of fcmpX");

  if (fprf)
  {
    // Write the updated FPSCR back and release the temporary GPR
    STR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr));
    gpr.Unlock(fpscr_reg);
  }
}

// JIT handler for the scalar floating-point compare instructions. All code generation is
// delegated to FloatCompare, which is called here without an explicit `upper` argument.
// Bails out through FALLBACK_IF when FP exceptions are enabled.
void JitArm64::fcmpX(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(jo.fp_exceptions);

  FloatCompare(inst);
}

// JIT handler for the convert-to-integer-word instructions. SUBOP10 == 15 selects the variant
// that always rounds toward zero; otherwise the value is first rounded with FRINTI and then
// converted. The 32-bit integer ends up in the low half of D, with 0xFFF8'0000 in the high half.
// Rc forms and enabled FP exceptions bail out through FALLBACK_IF.
void JitArm64::fctiwx(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);
  FALLBACK_IF(jo.fp_exceptions);

  const u32 src = inst.FB;
  const u32 dst = inst.FD;

  const bool src_is_single = fpr.IsSingle(src, true);
  const bool round_toward_zero = inst.SUBOP10 == 15;

  const ARM64Reg VB = fpr.R(src, src_is_single ? RegType::LowerPairSingle : RegType::LowerPair);
  const ARM64Reg VD = fpr.RW(dst, RegType::LowerPair);

  // TODO: The upper 32 bits of the result are always set to 0xfff80000 here, but for an input
  // of -0.0 they should be set to 0xfff80001 instead.

  if (!src_is_single)
  {
    // Double source: convert into a GPR, OR in the upper bits there, then move the result to D
    const ARM64Reg int_reg = gpr.GetReg();

    if (round_toward_zero)
    {
      m_float_emit.FCVTS(int_reg, EncodeRegToDouble(VB), RoundingMode::Z);
    }
    else
    {
      m_float_emit.FRINTI(EncodeRegToDouble(VD), EncodeRegToDouble(VB));
      m_float_emit.FCVTS(int_reg, EncodeRegToDouble(VD), RoundingMode::Z);
    }

    ORR(EncodeRegTo64(int_reg), EncodeRegTo64(int_reg), LogicalImm(0xFFF8'0000'0000'0000ULL, 64));
    m_float_emit.FMOV(EncodeRegToDouble(VD), EncodeRegTo64(int_reg));

    gpr.Unlock(int_reg);
  }
  else
  {
    // Single source: convert in place within the vector register file
    const ARM64Reg upper_bits_reg = fpr.GetReg();

    if (round_toward_zero)
    {
      m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VB), RoundingMode::Z);
    }
    else
    {
      m_float_emit.FRINTI(EncodeRegToSingle(VD), EncodeRegToSingle(VB));
      m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), RoundingMode::Z);
    }

    // Build 0xFFF8'0000'0000'0000ULL: start from 0xFFFF in the top 16 bits,
    // then clear the low three bits of every 16-bit element
    m_float_emit.MOVI(64, EncodeRegToDouble(upper_bits_reg), 0xFFFF'0000'0000'0000ULL);
    m_float_emit.BIC(16, EncodeRegToDouble(upper_bits_reg), 0x7);

    m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD),
                     EncodeRegToDouble(upper_bits_reg));

    fpr.Unlock(upper_bits_reg);
  }

  ASSERT_MSG(DYNA_REC, src == dst || src_is_single == fpr.IsSingle(src, true),
             "Register allocation turned singles into doubles in the middle of fctiwzx");
}

// JIT handler for the reciprocal estimate instruction. The raw bits of B are staged in X1 and
// the host FRECPE estimate in D0 before calling the shared fres routine; the result is read
// back from X0, written to D as a duplicated value and also used for the FPRF update.
// Rc forms and enabled FP or divide-by-zero exceptions bail out through FALLBACK_IF.
void JitArm64::fresx(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);
  FALLBACK_IF(jo.fp_exceptions || jo.div_by_zero_exceptions);

  const u32 src = inst.FB;
  const u32 dst = inst.FD;

  // Q0 is written explicitly below, so keep the register cache away from it
  fpr.Lock(ARM64Reg::Q0);

  const ARM64Reg src_host = fpr.R(src, RegType::LowerPair);
  const ARM64Reg dst_host = fpr.RW(dst, RegType::Duplicated);

  // Reserve the GPRs used around the routine call
  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);

  // Stage the inputs: the raw bits of B in X1 and the hardware estimate in D0
  m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(src_host));
  m_float_emit.FRECPE(ARM64Reg::D0, EncodeRegToDouble(src_host));

  BL(GetAsmRoutines()->fres);

  // The result is taken from X0
  m_float_emit.FMOV(EncodeRegToDouble(dst_host), ARM64Reg::X0);

  SetFPRFIfNeeded(false, ARM64Reg::X0);

  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
  fpr.Unlock(ARM64Reg::Q0);
}

// JIT handler for the reciprocal square root estimate instruction. The raw bits of B are staged
// in X1 and the host FRSQRTE estimate in D0 before calling the shared frsqrte routine; the
// result is read back from X0, written to the lower element of D and also used for the FPRF
// update. Rc forms and enabled FP or divide-by-zero exceptions bail out through FALLBACK_IF.
void JitArm64::frsqrtex(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);
  FALLBACK_IF(jo.fp_exceptions || jo.div_by_zero_exceptions);

  const u32 src = inst.FB;
  const u32 dst = inst.FD;

  // Q0 is written explicitly below, so keep the register cache away from it
  fpr.Lock(ARM64Reg::Q0);

  const ARM64Reg src_host = fpr.R(src, RegType::LowerPair);
  const ARM64Reg dst_host = fpr.RW(dst, RegType::LowerPair);

  // Reserve the GPRs used around the routine call
  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);

  // Stage the inputs: the raw bits of B in X1 and the hardware estimate in D0
  m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(src_host));
  m_float_emit.FRSQRTE(ARM64Reg::D0, EncodeRegToDouble(src_host));

  BL(GetAsmRoutines()->frsqrte);

  // The result is taken from X0
  m_float_emit.FMOV(EncodeRegToDouble(dst_host), ARM64Reg::X0);

  SetFPRFIfNeeded(false, ARM64Reg::X0);

  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
  fpr.Unlock(ARM64Reg::Q0);
}

// Since the following float conversion functions are used in non-arithmetic PPC float
// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
// into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN.

// Converts the lower element of src_reg from double to single and stores it in dest_reg.
//
// If the guest register is flagged both as store safe and as holding a single, a plain FCVT is
// emitted. Otherwise the cdts routine is called, with the double's bits passed in X0 and the
// single's bits read back from W1.
void JitArm64::ConvertDoubleToSingleLower(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg)
{
  const bool can_use_fcvt = js.fpr_is_store_safe[guest_reg] && js.op->fprIsSingle[guest_reg];
  if (!can_use_fcvt)
  {
    FlushCarry();

    // Only save the listed registers that are actually in use
    const BitSet32 saved_gprs = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30};
    ABI_PushRegisters(saved_gprs);

    m_float_emit.FMOV(ARM64Reg::X0, EncodeRegToDouble(src_reg));
    BL(cdts);
    m_float_emit.FMOV(EncodeRegToSingle(dest_reg), ARM64Reg::W1);

    ABI_PopRegisters(saved_gprs);
    return;
  }

  m_float_emit.FCVT(32, 64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
}

// Converts both elements of src_reg from double to single and stores the pair in dest_reg.
//
// If the guest register is flagged both as store safe and as holding a single, a plain FCVTN
// is emitted. Otherwise the cdts routine is called once per element, with the double's bits
// passed in X0 and the single's bits read back from W1.
void JitArm64::ConvertDoubleToSinglePair(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg)
{
  const bool can_use_fcvtn = js.fpr_is_store_safe[guest_reg] && js.op->fprIsSingle[guest_reg];
  if (!can_use_fcvtn)
  {
    FlushCarry();

    // Only save the listed registers that are actually in use
    const BitSet32 saved_gprs = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30};
    ABI_PushRegisters(saved_gprs);

    // First element
    m_float_emit.FMOV(ARM64Reg::X0, EncodeRegToDouble(src_reg));
    BL(cdts);

    // The second element is fetched from src_reg before the first result is written to dest_reg
    m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 1);
    m_float_emit.FMOV(EncodeRegToSingle(dest_reg), ARM64Reg::W1);

    // Second element
    BL(cdts);
    m_float_emit.INS(32, dest_reg, 1, ARM64Reg::W1);

    ABI_PopRegisters(saved_gprs);
    return;
  }

  m_float_emit.FCVTN(32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
}

// Converts the lower element of src_reg from single to double and stores it in dest_reg.
//
// If the guest register is flagged as store safe, a plain FCVT is emitted. Otherwise the cstd
// routine (single's bits in W0, double's bits returned in X1) is used, and if a scratch register
// was supplied, a runtime check lets inputs that FCVT handles correctly skip the routine.
//
// scratch_reg: optional temporary (ARM64Reg::INVALID_REG if none); must differ from src_reg.
void JitArm64::ConvertSingleToDoubleLower(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg,
                                          ARM64Reg scratch_reg)
{
  ASSERT(scratch_reg != src_reg);

  if (js.fpr_is_store_safe[guest_reg])
  {
    m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
    return;
  }

  const bool has_scratch = scratch_reg != ARM64Reg::INVALID_REG;
  // The slow path goes to far code, unless the code being emitted is in far code already
  const bool slow_path_in_farcode = !IsInFarCode();

  FlushCarry();

  // Runtime check: is the input known not to be a NaN, and is it either not a denormal or is
  // FPCR.FZ not set? (Zeroes unfortunately fail this check as well.)

  FixupBranch to_fast_path;
  if (has_scratch)
  {
    m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg));
    m_float_emit.FCMP(EncodeRegToSingle(scratch_reg));
    to_fast_path = B(CCFlags::CC_GT);

    if (slow_path_in_farcode)
    {
      FixupBranch to_slow_path = B();

      SwitchToFarCode();
      SetJumpTarget(to_slow_path);
    }
  }

  // Slow path (check failed, or no scratch register to do the check with): bit-exact routine

  const BitSet32 saved_gprs = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
  ABI_PushRegisters(saved_gprs);

  m_float_emit.FMOV(ARM64Reg::W0, EncodeRegToSingle(src_reg));
  BL(cstd);
  m_float_emit.FMOV(EncodeRegToDouble(dest_reg), ARM64Reg::X1);

  ABI_PopRegisters(saved_gprs);

  // Fast path (check passed): plain FCVT

  if (has_scratch)
  {
    FixupBranch done = B();

    if (slow_path_in_farcode)
      SwitchToNearCode();

    SetJumpTarget(to_fast_path);

    m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));

    SetJumpTarget(done);
  }
}

// Converts both elements of src_reg from single to double and stores the pair in dest_reg.
//
// If the guest register is flagged as store safe, a plain FCVTL is emitted. Otherwise the cstd
// routine (single's bits in W0, double's bits returned in X1) is called once per element, and if
// a scratch register was supplied, a runtime check lets pairs that FCVTL handles correctly skip
// the routine.
//
// scratch_reg: optional temporary (ARM64Reg::INVALID_REG if none); must differ from src_reg.
void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg,
                                         ARM64Reg scratch_reg)
{
  ASSERT(scratch_reg != src_reg);

  if (js.fpr_is_store_safe[guest_reg])
  {
    m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
    return;
  }

  const bool has_scratch = scratch_reg != ARM64Reg::INVALID_REG;
  // The slow path goes to far code, unless the code being emitted is in far code already
  const bool slow_path_in_farcode = !IsInFarCode();

  FlushCarry();

  // Runtime check: are both inputs known not to be NaNs, and is each of them either not a
  // denormal or is FPCR.FZ not set? (Zeroes unfortunately fail this check as well.)

  FixupBranch to_fast_path;
  if (has_scratch)
  {
    // Each 32-bit element of scratch_reg becomes 0xFFFF'FFFF if the absolute value of the
    // matching element of src_reg compares greater than 0, and 0x0000'0000 otherwise
    m_float_emit.MOVI(64, EncodeRegToDouble(scratch_reg), 0);
    m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg),
                       EncodeRegToDouble(scratch_reg));

    // Copying byte 0 over byte 7 leaves a 64-bit NaN pattern only if both elements passed:
    //   0x0000'0000'0000'0000 (zero)     -> 0x0000'0000'0000'0000 (zero)
    //   0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal)
    //   0xFFFF'FFFF'0000'0000 (NaN)      -> 0x00FF'FFFF'0000'0000 (normal)
    //   0xFFFF'FFFF'FFFF'FFFF (NaN)      -> 0xFFFF'FFFF'FFFF'FFFF (NaN)
    m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0);

    // scratch_reg is a NaN (0xFFFF'FFFF'FFFF'FFFF) exactly when the fast path may be taken
    m_float_emit.FCMP(EncodeRegToDouble(scratch_reg));
    to_fast_path = B(CCFlags::CC_VS);

    if (slow_path_in_farcode)
    {
      FixupBranch to_slow_path = B();

      SwitchToFarCode();
      SetJumpTarget(to_slow_path);
    }
  }

  // Slow path (check failed, or no scratch register to do the check with): bit-exact routine

  const BitSet32 saved_gprs = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
  ABI_PushRegisters(saved_gprs);

  // First element
  m_float_emit.FMOV(ARM64Reg::W0, EncodeRegToSingle(src_reg));
  BL(cstd);

  // The second element is fetched from src_reg before the first result is written to dest_reg
  m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 1);
  m_float_emit.FMOV(EncodeRegToDouble(dest_reg), ARM64Reg::X1);

  // Second element
  BL(cstd);
  m_float_emit.INS(64, dest_reg, 1, ARM64Reg::X1);

  ABI_PopRegisters(saved_gprs);

  // Fast path (check passed): plain FCVTL

  if (has_scratch)
  {
    FixupBranch done = B();

    if (slow_path_in_farcode)
      SwitchToNearCode();

    SetJumpTarget(to_fast_path);
    m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));

    SetJumpTarget(done);
  }
}

// Loads dest_reg with a mask that has only the quiet bit of a NaN set, i.e. QNaN & ~SNaN.
//
// (Loading the full QNaN pattern would work too, but even for singles that would need two
// instructions.)
//
// single:   emit the 32-bit-element constant; otherwise the 64-bit one.
// temp_gpr: GPR used to build the 64-bit constant; must be valid when single is false.
void JitArm64::EmitQuietNaNBitConstant(ARM64Reg dest_reg, bool single, ARM64Reg temp_gpr)
{
  if (single)
  {
    // 0x40 shifted left by 16 in every 32-bit element
    m_float_emit.MOVI(32, dest_reg, 0x40, 16);
    return;
  }

  ASSERT(temp_gpr != ARM64Reg::INVALID_REG);

  const ARM64Reg temp_gpr_64 = EncodeRegTo64(temp_gpr);
  MOVI2R(temp_gpr_64, 0x0008'0000'0000'0000);

  // Quad destinations get the constant in both 64-bit elements
  if (!IsQuad(dest_reg))
    m_float_emit.FMOV(dest_reg, temp_gpr_64);
  else
    m_float_emit.DUP(64, dest_reg, temp_gpr_64);
}

// Returns the "store safe" flag that the JIT state tracks for the given guest FPR. Within this
// file, the same flag decides whether the float conversion helpers may use a plain
// FCVT/FCVTL/FCVTN instead of the bit-exact conversion routines.
bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
{
  return js.fpr_is_store_safe[guest_reg];
}