@@ -26,7 +26,7 @@ void JitArm64::lfXX(UGeckoInstruction inst)
   u32 a = inst.RA, b = inst.RB;
 
   s32 offset = inst.SIMM_16;
-  u32 flags = BackPatchInfo::FLAG_LOAD;
+  u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT;
   bool update = false;
   s32 offset_reg = -1;

@@ -36,46 +36,46 @@ void JitArm64::lfXX(UGeckoInstruction inst)
     switch (inst.SUBOP10)
     {
     case 567:  // lfsux
-      flags |= BackPatchInfo::FLAG_SIZE_F32;
+      flags |= BackPatchInfo::FLAG_SIZE_32;
       update = true;
       offset_reg = b;
       break;
     case 535:  // lfsx
-      flags |= BackPatchInfo::FLAG_SIZE_F32;
+      flags |= BackPatchInfo::FLAG_SIZE_32;
       offset_reg = b;
       break;
     case 631:  // lfdux
-      flags |= BackPatchInfo::FLAG_SIZE_F64;
+      flags |= BackPatchInfo::FLAG_SIZE_64;
       update = true;
       offset_reg = b;
       break;
     case 599:  // lfdx
-      flags |= BackPatchInfo::FLAG_SIZE_F64;
+      flags |= BackPatchInfo::FLAG_SIZE_64;
       offset_reg = b;
       break;
     }
     break;
   case 49:  // lfsu
-    flags |= BackPatchInfo::FLAG_SIZE_F32;
+    flags |= BackPatchInfo::FLAG_SIZE_32;
     update = true;
     break;
   case 48:  // lfs
-    flags |= BackPatchInfo::FLAG_SIZE_F32;
+    flags |= BackPatchInfo::FLAG_SIZE_32;
     break;
   case 51:  // lfdu
-    flags |= BackPatchInfo::FLAG_SIZE_F64;
+    flags |= BackPatchInfo::FLAG_SIZE_64;
     update = true;
     break;
   case 50:  // lfd
-    flags |= BackPatchInfo::FLAG_SIZE_F64;
+    flags |= BackPatchInfo::FLAG_SIZE_64;
     break;
   }
 
   u32 imm_addr = 0;
   bool is_immediate = false;
 
   const RegType type =
-      (flags & BackPatchInfo::FLAG_SIZE_F64) != 0 ? RegType::LowerPair : RegType::DuplicatedSingle;
+      (flags & BackPatchInfo::FLAG_SIZE_64) != 0 ? RegType::LowerPair : RegType::DuplicatedSingle;
 
   gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
   fpr.Lock(ARM64Reg::Q0);
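
Taken together, the lfXX changes stop encoding float sizes as dedicated flags and instead compose FLAG_FLOAT with the plain size bits. A minimal sketch of the resulting flag words, with the constants restated from the BackPatchInfo hunk at the end of this diff (illustration only):

```cpp
#include <cstdint>

// Values restated from the BackPatchInfo hunk below.
constexpr uint32_t FLAG_LOAD = 1u << 1;
constexpr uint32_t FLAG_SIZE_32 = 1u << 4;
constexpr uint32_t FLAG_SIZE_64 = 1u << 5;
constexpr uint32_t FLAG_FLOAT = 1u << 6;

// lfs/lfsu/lfsx/lfsux now compose to "32-bit float load"...
constexpr uint32_t kLfsFlags = FLAG_LOAD | FLAG_FLOAT | FLAG_SIZE_32;
// ...and lfd/lfdu/lfdx/lfdux to "64-bit float load".
constexpr uint32_t kLfdFlags = FLAG_LOAD | FLAG_FLOAT | FLAG_SIZE_64;

static_assert(kLfsFlags == 0x52 && kLfdFlags == 0x62);
```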
@@ -190,7 +190,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)

   bool want_single = false;
   s32 offset = inst.SIMM_16;
-  u32 flags = BackPatchInfo::FLAG_STORE;
+  u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT;
   bool update = false;
   s32 offset_reg = -1;

@@ -201,46 +201,46 @@ void JitArm64::stfXX(UGeckoInstruction inst)
     {
     case 663:  // stfsx
       want_single = true;
-      flags |= BackPatchInfo::FLAG_SIZE_F32;
+      flags |= BackPatchInfo::FLAG_SIZE_32;
       offset_reg = b;
       break;
     case 695:  // stfsux
       want_single = true;
-      flags |= BackPatchInfo::FLAG_SIZE_F32;
+      flags |= BackPatchInfo::FLAG_SIZE_32;
       update = true;
       offset_reg = b;
       break;
     case 727:  // stfdx
-      flags |= BackPatchInfo::FLAG_SIZE_F64;
+      flags |= BackPatchInfo::FLAG_SIZE_64;
       offset_reg = b;
       break;
     case 759:  // stfdux
-      flags |= BackPatchInfo::FLAG_SIZE_F64;
+      flags |= BackPatchInfo::FLAG_SIZE_64;
       update = true;
       offset_reg = b;
       break;
     case 983:  // stfiwx
       // This instruction writes the lower 32 bits of a double. want_single must be false
-      flags |= BackPatchInfo::FLAG_SIZE_F32;
+      flags |= BackPatchInfo::FLAG_SIZE_32;
       offset_reg = b;
       break;
     }
     break;
   case 53:  // stfsu
     want_single = true;
-    flags |= BackPatchInfo::FLAG_SIZE_F32;
+    flags |= BackPatchInfo::FLAG_SIZE_32;
     update = true;
     break;
   case 52:  // stfs
     want_single = true;
-    flags |= BackPatchInfo::FLAG_SIZE_F32;
+    flags |= BackPatchInfo::FLAG_SIZE_32;
     break;
   case 55:  // stfdu
-    flags |= BackPatchInfo::FLAG_SIZE_F64;
+    flags |= BackPatchInfo::FLAG_SIZE_64;
     update = true;
     break;
   case 54:  // stfd
-    flags |= BackPatchInfo::FLAG_SIZE_F64;
+    flags |= BackPatchInfo::FLAG_SIZE_64;
     break;
   }

@@ -361,16 +361,16 @@ void JitArm64::stfXX(UGeckoInstruction inst)
     if (jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(imm_addr))
     {
       int accessSize;
-      if (flags & BackPatchInfo::FLAG_SIZE_F64)
+      if (flags & BackPatchInfo::FLAG_SIZE_64)
         accessSize = 64;
       else
         accessSize = 32;
 
       LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
 
-      if (flags & BackPatchInfo::FLAG_SIZE_F64)
+      if (flags & BackPatchInfo::FLAG_SIZE_64)
         m_float_emit.REV64(8, ARM64Reg::Q0, V0);
-      else if (flags & BackPatchInfo::FLAG_SIZE_F32)
+      else if (flags & BackPatchInfo::FLAG_SIZE_32)
         m_float_emit.REV32(8, ARM64Reg::D0, V0);
 
       m_float_emit.STR(accessSize, IndexType::Post, accessSize == 64 ? ARM64Reg::Q0 : ARM64Reg::D0,
@@ -19,14 +19,14 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStorePairedOff);
-  FALLBACK_IF(jo.memcheck || !jo.fastmem);
+  FALLBACK_IF(jo.memcheck);
 
-  // The asm routines assume address translation is on.
-  FALLBACK_IF(!MSR.DR);
+  // If we have a fastmem arena, the asm routines assume address translation is on.
+  FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
 
   // X30 is LR
-  // X0 contains the scale
-  // X1 is the address
+  // X0 is the address
+  // X1 contains the scale
   // X2 is a temporary
   // Q0 is the return register
   // Q1 is a temporary
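
The new fallback condition is narrower than the old `jo.memcheck || !jo.fastmem` bail-out. A hedged restatement of the two checks as a plain predicate, with the operands spelled out (semantics inferred from the comment in the hunk above):

```cpp
// Illustrative only - mirrors the two FALLBACK_IF lines above.
bool PsqMustFallBack(bool memcheck, bool assume_no_paired_quantize, bool fastmem_arena, bool msr_dr)
{
  if (memcheck)
    return true;  // memory-check configurations still take the fallback
  // The quantized asm routines only run when paired quantization can't be
  // assumed away; with a fastmem arena they require address translation (DR).
  return !assume_no_paired_quantize && fastmem_arena && !msr_dr;
}
```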
@@ -36,13 +36,18 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
   const int i = indexed ? inst.Ix : inst.I;
   const int w = indexed ? inst.Wx : inst.W;
 
-  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
-  fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
+  gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+  fpr.Lock(ARM64Reg::Q0);
+  if (!js.assumeNoPairedQuantize)
+  {
+    gpr.Lock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3);
+    fpr.Lock(ARM64Reg::Q1);
+  }
 
-  constexpr ARM64Reg scale_reg = ARM64Reg::W0;
-  constexpr ARM64Reg addr_reg = ARM64Reg::W1;
+  constexpr ARM64Reg addr_reg = ARM64Reg::W0;
+  constexpr ARM64Reg scale_reg = ARM64Reg::W1;
   constexpr ARM64Reg type_reg = ARM64Reg::W2;
-  ARM64Reg VS;
+  ARM64Reg VS = fpr.RW(inst.RS, RegType::Single);
 
   if (inst.RA || update)  // Always uses the register on update
   {
@@ -69,17 +74,20 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)

   if (js.assumeNoPairedQuantize)
   {
-    VS = fpr.RW(inst.RS, RegType::Single);
+    BitSet32 gprs_in_use = gpr.GetCallerSavedUsed();
+    BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+
+    // Wipe the registers we are using as temporaries
+    gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
+    fprs_in_use[DecodeReg(ARM64Reg::Q0)] = false;
+    fprs_in_use[DecodeReg(VS)] = 0;
+
+    u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
     if (!w)
-    {
-      ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), MEM_REG);
-      m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
-    }
-    else
-    {
-      m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), MEM_REG);
-    }
-    m_float_emit.REV32(8, EncodeRegToDouble(VS), EncodeRegToDouble(VS));
+      flags |= BackPatchInfo::FLAG_PAIR;
+
+    EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use,
+                         fprs_in_use);
   }
   else
   {
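
The gprs_in_use/fprs_in_use bookkeeping in this hunk trims the caller-saved save set before EmitBackpatchRoutine: scratch registers (W0, Q0) and the destination VS are clobbered on purpose, so preserving them around a slowmem call would be wasted work. The same idea over a plain bitmask, as a standalone sketch:

```cpp
#include <cstdint>

// Clearing a bit means "no need to save/restore this register around the
// call": it is scratch (W0, Q0) or the load destination (VS).
constexpr uint32_t WipeTemporary(uint32_t in_use_mask, unsigned reg_index)
{
  return in_use_mask & ~(1u << reg_index);
}

static_assert(WipeTemporary(0b0111, 0) == 0b0110);
```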
@@ -91,7 +99,6 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
     LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
     BLR(EncodeRegTo64(type_reg));
 
-    VS = fpr.RW(inst.RS, RegType::Single);
     m_float_emit.ORR(EncodeRegToDouble(VS), ARM64Reg::D0, ARM64Reg::D0);
   }

@@ -101,18 +108,23 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
     m_float_emit.INS(32, VS, 1, ARM64Reg::Q0, 0);
   }
 
-  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
-  fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
+  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
+  fpr.Unlock(ARM64Reg::Q0);
+  if (!js.assumeNoPairedQuantize)
+  {
+    gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3);
+    fpr.Unlock(ARM64Reg::Q1);
+  }
 }
 
 void JitArm64::psq_stXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStorePairedOff);
-  FALLBACK_IF(jo.memcheck || !jo.fastmem);
+  FALLBACK_IF(jo.memcheck);
 
-  // The asm routines assume address translation is on.
-  FALLBACK_IF(!MSR.DR);
+  // If we have a fastmem arena, the asm routines assume address translation is on.
+  FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
 
   // X30 is LR
   // X0 contains the scale
@@ -125,7 +137,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
   const int i = indexed ? inst.Ix : inst.I;
   const int w = indexed ? inst.Wx : inst.W;
 
-  fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
+  if (!js.assumeNoPairedQuantize)
+    fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
 
   const bool have_single = fpr.IsSingle(inst.RS);

@@ -160,7 +173,9 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
     }
   }
 
-  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
+  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
+  if (!js.assumeNoPairedQuantize)
+    gpr.Lock(ARM64Reg::W2);
 
   constexpr ARM64Reg scale_reg = ARM64Reg::W0;
   constexpr ARM64Reg addr_reg = ARM64Reg::W1;
@@ -189,18 +204,18 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
     MOV(gpr.R(inst.RA), addr_reg);
   }
 
-  BitSet32 gprs_in_use = gpr.GetCallerSavedUsed();
-  BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
-
-  // Wipe the registers we are using as temporaries
-  gprs_in_use &= BitSet32(~7);
-  fprs_in_use &= BitSet32(~3);
-
   if (js.assumeNoPairedQuantize)
   {
-    u32 flags = BackPatchInfo::FLAG_STORE;
+    BitSet32 gprs_in_use = gpr.GetCallerSavedUsed();
+    BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+
+    // Wipe the registers we are using as temporaries
+    gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
+    gprs_in_use[DecodeReg(ARM64Reg::W1)] = false;
 
-    flags |= (w ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
+    u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
+    if (!w)
+      flags |= BackPatchInfo::FLAG_PAIR;
 
     EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use,
                          fprs_in_use);
@@ -211,38 +226,18 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
     UBFM(type_reg, scale_reg, 0, 2);    // Type
     UBFM(scale_reg, scale_reg, 8, 13);  // Scale
 
-    // Inline address check
-    // FIXME: This doesn't correctly account for the BAT configuration.
-    TST(addr_reg, LogicalImm(0x0c000000, 32));
-    FixupBranch pass = B(CC_EQ);
-    FixupBranch fail = B();
-
-    SwitchToFarCode();
-    SetJumpTarget(fail);
-    // Slow
-    MOVP2R(ARM64Reg::X30, &paired_store_quantized[16 + w * 8]);
-    LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
-
-    ABI_PushRegisters(gprs_in_use);
-    m_float_emit.ABI_PushRegisters(fprs_in_use, ARM64Reg::X30);
-    BLR(EncodeRegTo64(type_reg));
-    m_float_emit.ABI_PopRegisters(fprs_in_use, ARM64Reg::X30);
-    ABI_PopRegisters(gprs_in_use);
-    FixupBranch continue1 = B();
-    SwitchToNearCode();
-    SetJumpTarget(pass);
-
-    // Fast
-    MOVP2R(ARM64Reg::X30, &paired_store_quantized[w * 8]);
+    MOVP2R(ARM64Reg::X30, w ? single_store_quantized : paired_store_quantized);
     LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
     BLR(EncodeRegTo64(type_reg));
-
-    SetJumpTarget(continue1);
   }
 
   if (js.assumeNoPairedQuantize && !have_single)
     fpr.Unlock(VS);
 
-  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
-  fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
+  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
+  if (!js.assumeNoPairedQuantize)
+  {
+    gpr.Unlock(ARM64Reg::W2);
+    fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
+  }
 }
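
With the inline address check gone, the store path no longer needs fast and slow blocks inside one flat table; it just picks the single- or paired-store table and indexes it by the 3-bit quantize type extracted with UBFM. A sketch of the table shape this implies (element type and extent are assumptions; in Dolphin these are pointers to generated routines set up elsewhere):

```cpp
#include <cstddef>

using QuantizedStoreRoutine = const void*;  // assumed: pointer to generated code

// Assumed layout: one 8-entry table per form, indexed by the quantize type
// (UBFM extracts bits 0-2, so the index is 0-7).
extern QuantizedStoreRoutine single_store_quantized[8];
extern QuantizedStoreRoutine paired_store_quantized[8];

QuantizedStoreRoutine SelectStoreRoutine(bool w, size_t type)
{
  // Replaces the old flat-table indexing: [w * 8] fast, [16 + w * 8] slow.
  return (w ? single_store_quantized : paired_store_quantized)[type];
}
```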
@@ -149,15 +149,9 @@ void Arm64GPRCache::Start(PPCAnalyst::BlockRegStats& stats)
 {
 }
 
-bool Arm64GPRCache::IsCalleeSaved(ARM64Reg reg) const
+bool Arm64GPRCache::IsCallerSaved(ARM64Reg reg) const
 {
-  static constexpr auto callee_regs = {
-      ARM64Reg::X28, ARM64Reg::X27, ARM64Reg::X26, ARM64Reg::X25,
-      ARM64Reg::X24, ARM64Reg::X23, ARM64Reg::X22, ARM64Reg::X21,
-      ARM64Reg::X20, ARM64Reg::X19, ARM64Reg::INVALID_REG,
-  };
-
-  return std::find(callee_regs.begin(), callee_regs.end(), EncodeRegTo64(reg)) != callee_regs.end();
+  return ARM64XEmitter::CALLER_SAVED_GPRS[DecodeReg(reg)];
 }
 
 const OpArg& Arm64GPRCache::GetGuestGPROpArg(size_t preg) const
@@ -416,7 +410,7 @@ BitSet32 Arm64GPRCache::GetCallerSavedUsed() const
   BitSet32 registers(0);
   for (const auto& it : m_host_registers)
   {
-    if (it.IsLocked() && !IsCalleeSaved(it.GetReg()))
+    if (it.IsLocked() && IsCallerSaved(it.GetReg()))
       registers[DecodeReg(it.GetReg())] = true;
   }
   return registers;
@@ -716,14 +710,9 @@ void Arm64FPRCache::FlushByHost(ARM64Reg host_reg, ARM64Reg tmp_reg)
   }
 }
 
-bool Arm64FPRCache::IsCalleeSaved(ARM64Reg reg) const
+bool Arm64FPRCache::IsCallerSaved(ARM64Reg reg) const
 {
-  static constexpr auto callee_regs = {
-      ARM64Reg::Q8, ARM64Reg::Q9, ARM64Reg::Q10, ARM64Reg::Q11, ARM64Reg::Q12,
-      ARM64Reg::Q13, ARM64Reg::Q14, ARM64Reg::Q15, ARM64Reg::INVALID_REG,
-  };
-
-  return std::find(callee_regs.begin(), callee_regs.end(), reg) != callee_regs.end();
+  return ARM64XEmitter::CALLER_SAVED_FPRS[DecodeReg(reg)];
 }
 
 bool Arm64FPRCache::IsTopHalfUsed(ARM64Reg reg) const
@@ -841,7 +830,7 @@ BitSet32 Arm64FPRCache::GetCallerSavedUsed() const
   BitSet32 registers(0);
   for (const auto& it : m_host_registers)
   {
-    if (it.IsLocked() && (!IsCalleeSaved(it.GetReg()) || IsTopHalfUsed(it.GetReg())))
+    if (it.IsLocked() && (IsCallerSaved(it.GetReg()) || IsTopHalfUsed(it.GetReg())))
       registers[DecodeReg(it.GetReg())] = true;
   }
   return registers;
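
Both register caches now answer "is this host register caller-saved?" with a constant-time table lookup instead of a std::find over a callee-saved list, and GetCallerSavedUsed() asks the question in the positive sense. A self-contained sketch of the idea (the mask below follows AAPCS64; the real constants live in ARM64XEmitter):

```cpp
#include <cstdint>

// AAPCS64: X0-X17 and X30 (LR) are caller-saved; X19-X28 are callee-saved,
// and bit 18 (the platform register) is left out here.
constexpr uint32_t kCallerSavedGprMask = 0x4003FFFF;

constexpr bool IsCallerSavedGpr(unsigned reg_index)
{
  return (kCallerSavedGprMask >> reg_index) & 1;  // O(1), no list scan
}

static_assert(IsCallerSavedGpr(0) && IsCallerSavedGpr(30) && !IsCallerSavedGpr(19));
```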
@@ -291,7 +291,7 @@ class Arm64GPRCache : public Arm64RegCache
   void FlushRegister(size_t index, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg) override;
 
 private:
-  bool IsCalleeSaved(Arm64Gen::ARM64Reg reg) const;
+  bool IsCallerSaved(Arm64Gen::ARM64Reg reg) const;
 
   struct GuestRegInfo
   {
@@ -350,7 +350,7 @@ class Arm64FPRCache : public Arm64RegCache
   void FlushRegister(size_t preg, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg) override;
 
 private:
-  bool IsCalleeSaved(Arm64Gen::ARM64Reg reg) const;
+  bool IsCallerSaved(Arm64Gen::ARM64Reg reg) const;
   bool IsTopHalfUsed(Arm64Gen::ARM64Reg reg) const;
 
   void FlushRegisters(BitSet32 regs, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg);

@@ -191,19 +191,47 @@ class MMIOReadCodeGenerator : public MMIO::ReadHandlingMethodVisitor<T>
   bool m_sign_extend;
 };
 
-void ByteswapAfterLoad(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags,
-                       bool is_reversed, bool is_extended)
+void SwapPairs(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags)
 {
+  if (flags & BackPatchInfo::FLAG_SIZE_32)
+    emit->ROR(dst_reg, src_reg, 32);
+  else if (flags & BackPatchInfo::FLAG_SIZE_16)
+    emit->ROR(dst_reg, src_reg, 16);
+  else
+    emit->REV16(dst_reg, src_reg);
+}
+
+void ByteswapAfterLoad(ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit,
+                       ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags, bool is_reversed,
+                       bool is_extended)
+{
   if (is_reversed == !(flags & BackPatchInfo::FLAG_REVERSE))
   {
-    if (flags & BackPatchInfo::FLAG_SIZE_32)
+    if (flags & BackPatchInfo::FLAG_SIZE_64)
+    {
+      if (flags & BackPatchInfo::FLAG_FLOAT)
+        float_emit->REV64(8, dst_reg, src_reg);
+      else
+        emit->REV64(dst_reg, src_reg);
+
+      src_reg = dst_reg;
+    }
+    else if (flags & BackPatchInfo::FLAG_SIZE_32)
     {
-      emit->REV32(dst_reg, src_reg);
+      if (flags & BackPatchInfo::FLAG_FLOAT)
+        float_emit->REV32(8, dst_reg, src_reg);
+      else
+        emit->REV32(dst_reg, src_reg);
+
       src_reg = dst_reg;
     }
     else if (flags & BackPatchInfo::FLAG_SIZE_16)
     {
-      emit->REV16(dst_reg, src_reg);
+      if (flags & BackPatchInfo::FLAG_FLOAT)
+        float_emit->REV16(8, dst_reg, src_reg);
+      else
+        emit->REV16(dst_reg, src_reg);
+
       src_reg = dst_reg;
     }
   }
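
SwapPairs relies on a rotate by half the register width swapping the two elements of a packed pair: ROR by 32 exchanges the 32-bit halves, ROR by 16 the 16-bit halves, and REV16 swaps adjacent bytes for 8-bit pairs. The 32-bit case as plain arithmetic:

```cpp
#include <cstdint>

// Rotating the packed pair right by half its width swaps the two elements.
// (amount must stay in 1..63 so both shifts are well-defined)
constexpr uint64_t RotateRight64(uint64_t value, unsigned amount)
{
  return (value >> amount) | (value << (64 - amount));
}

static_assert(RotateRight64(0x1111111122222222ull, 32) == 0x2222222211111111ull);
```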
@@ -215,25 +243,47 @@ void ByteswapAfterLoad(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg,
   }
 
   if (dst_reg != src_reg)
-    emit->MOV(dst_reg, src_reg);
+  {
+    if (flags & BackPatchInfo::FLAG_FLOAT)
+      float_emit->ORR(dst_reg, src_reg, src_reg);
+    else
+      emit->MOV(dst_reg, src_reg);
+  }
 }
 
-ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, ARM64Reg tmp_reg, ARM64Reg src_reg, u32 flags,
-                             bool want_reversed)
+ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit,
+                             ARM64Reg tmp_reg, ARM64Reg src_reg, u32 flags, bool want_reversed)
 {
   ARM64Reg dst_reg = src_reg;
 
   if (want_reversed == !(flags & BackPatchInfo::FLAG_REVERSE))
   {
-    if (flags & BackPatchInfo::FLAG_SIZE_32)
+    if (flags & BackPatchInfo::FLAG_SIZE_64)
     {
       dst_reg = tmp_reg;
-      emit->REV32(dst_reg, src_reg);
+
+      if (flags & BackPatchInfo::FLAG_FLOAT)
+        float_emit->REV64(8, dst_reg, src_reg);
+      else
+        emit->REV64(dst_reg, src_reg);
     }
+    else if (flags & BackPatchInfo::FLAG_SIZE_32)
+    {
+      dst_reg = tmp_reg;
+
+      if (flags & BackPatchInfo::FLAG_FLOAT)
+        float_emit->REV32(8, dst_reg, src_reg);
+      else
+        emit->REV32(dst_reg, src_reg);
+    }
     else if (flags & BackPatchInfo::FLAG_SIZE_16)
     {
       dst_reg = tmp_reg;
-      emit->REV16(dst_reg, src_reg);
+
+      if (flags & BackPatchInfo::FLAG_FLOAT)
+        float_emit->REV16(8, dst_reg, src_reg);
+      else
+        emit->REV16(dst_reg, src_reg);
     }
   }
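
On the SIMD side there is no register-to-register MOV spelled like the GPR one, so the float path copies with ORR Vd, Vn, Vn; OR-ing a value with itself reproduces it exactly:

```cpp
#include <cstdint>

// ORR Vd, Vn, Vn acts as a vector move: x | x == x for every bit pattern.
constexpr uint64_t OrrSelf(uint64_t lane) { return lane | lane; }
static_assert(OrrSelf(0x0123456789ABCDEFull) == 0x0123456789ABCDEFull);
```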

@@ -243,6 +293,8 @@ ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, ARM64Reg tmp_reg, ARM64Reg src
 void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use,
                    BitSet32 fprs_in_use, ARM64Reg dst_reg, u32 address, u32 flags)
 {
+  ASSERT(!(flags & BackPatchInfo::FLAG_FLOAT));
+
   if (flags & BackPatchInfo::FLAG_SIZE_8)
   {
     MMIOReadCodeGenerator<u8> gen(emit, gprs_in_use, fprs_in_use, dst_reg, address,
@@ -262,13 +314,15 @@ void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32
     mmio->GetHandlerForRead<u32>(address).Visit(gen);
   }
 
-  ByteswapAfterLoad(emit, dst_reg, dst_reg, flags, false, true);
+  ByteswapAfterLoad(emit, nullptr, dst_reg, dst_reg, flags, false, true);
 }
 
 void MMIOWriteRegToAddr(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use,
                         BitSet32 fprs_in_use, ARM64Reg src_reg, u32 address, u32 flags)
 {
-  src_reg = ByteswapBeforeStore(emit, ARM64Reg::W1, src_reg, flags, false);
+  ASSERT(!(flags & BackPatchInfo::FLAG_FLOAT));
+
+  src_reg = ByteswapBeforeStore(emit, nullptr, ARM64Reg::W1, src_reg, flags, false);
 
   if (flags & BackPatchInfo::FLAG_SIZE_8)
   {
@@ -8,11 +8,17 @@

 #include "Core/HW/MMIO.h"
 
-void ByteswapAfterLoad(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg dst_reg,
-                       Arm64Gen::ARM64Reg src_reg, u32 flags, bool is_reversed, bool is_extended);
+void SwapPairs(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg dst_reg,
+               Arm64Gen::ARM64Reg src_reg, u32 flags);
 
-Arm64Gen::ARM64Reg ByteswapBeforeStore(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg tmp_reg,
-                                       Arm64Gen::ARM64Reg src_reg, u32 flags, bool want_reversed);
+void ByteswapAfterLoad(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit,
+                       Arm64Gen::ARM64Reg dst_reg, Arm64Gen::ARM64Reg src_reg, u32 flags,
+                       bool is_reversed, bool is_extended);
+
+Arm64Gen::ARM64Reg ByteswapBeforeStore(Arm64Gen::ARM64XEmitter* emit,
+                                       Arm64Gen::ARM64FloatEmitter* float_emit,
+                                       Arm64Gen::ARM64Reg tmp_reg, Arm64Gen::ARM64Reg src_reg,
+                                       u32 flags, bool want_reversed);
 
 void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use,
                    BitSet32 fprs_in_use, Arm64Gen::ARM64Reg dst_reg, u32 address, u32 flags);
@@ -13,31 +13,32 @@ struct BackPatchInfo
     FLAG_SIZE_8 = (1 << 2),
     FLAG_SIZE_16 = (1 << 3),
     FLAG_SIZE_32 = (1 << 4),
-    FLAG_SIZE_F32 = (1 << 5),
-    FLAG_SIZE_F32X2 = (1 << 6),
-    FLAG_SIZE_F64 = (1 << 7),
+    FLAG_SIZE_64 = (1 << 5),
+    FLAG_FLOAT = (1 << 6),
+    FLAG_PAIR = (1 << 7),
     FLAG_REVERSE = (1 << 8),
     FLAG_EXTEND = (1 << 9),
     FLAG_ZERO_256 = (1 << 10),
-    FLAG_MASK_FLOAT = FLAG_SIZE_F32 | FLAG_SIZE_F32X2 | FLAG_SIZE_F64,
   };
 
   static u32 GetFlagSize(u32 flags)
   {
+    u32 size = 0;
+
     if (flags & FLAG_SIZE_8)
-      return 8;
+      size = 8;
     if (flags & FLAG_SIZE_16)
-      return 16;
+      size = 16;
     if (flags & FLAG_SIZE_32)
-      return 32;
-    if (flags & FLAG_SIZE_F32)
-      return 32;
-    if (flags & FLAG_SIZE_F32X2)
-      return 64;
-    if (flags & FLAG_SIZE_F64)
-      return 64;
+      size = 32;
+    if (flags & FLAG_SIZE_64)
+      size = 64;
     if (flags & FLAG_ZERO_256)
-      return 256;
-    return 0;
+      size = 256;
+
+    if (flags & FLAG_PAIR)
+      size *= 2;
+
+    return size;
   }
 };
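
With FLAG_PAIR acting as a size multiplier, the rewritten GetFlagSize() covers what the deleted FLAG_SIZE_F32X2 used to encode. A worked example, with the function mirrored from the hunk above:

```cpp
#include <cstdint>

constexpr uint32_t FLAG_SIZE_8 = 1u << 2, FLAG_SIZE_16 = 1u << 3, FLAG_SIZE_32 = 1u << 4,
                   FLAG_SIZE_64 = 1u << 5, FLAG_FLOAT = 1u << 6, FLAG_PAIR = 1u << 7,
                   FLAG_ZERO_256 = 1u << 10;

constexpr uint32_t GetFlagSize(uint32_t flags)
{
  uint32_t size = 0;
  if (flags & FLAG_SIZE_8)
    size = 8;
  if (flags & FLAG_SIZE_16)
    size = 16;
  if (flags & FLAG_SIZE_32)
    size = 32;
  if (flags & FLAG_SIZE_64)
    size = 64;
  if (flags & FLAG_ZERO_256)
    size = 256;
  if (flags & FLAG_PAIR)
    size *= 2;
  return size;
}

// lfd: a plain 64-bit float access; psq_l with W=0: a pair of 32-bit floats,
// i.e. the old FLAG_SIZE_F32X2 case, now expressed as SIZE_32 | PAIR.
static_assert(GetFlagSize(FLAG_FLOAT | FLAG_SIZE_64) == 64);
static_assert(GetFlagSize(FLAG_FLOAT | FLAG_SIZE_32 | FLAG_PAIR) == 64);
```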