@@ -26,7 +26,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o
// We want to make sure to not get LR as a temp register
gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);

gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg);
gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg, false);
ARM64Reg dest_reg = gpr.R(dest);
ARM64Reg up_reg = ARM64Reg::INVALID_REG;
ARM64Reg off_reg = ARM64Reg::INVALID_REG;
@@ -101,19 +101,26 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o

ARM64Reg XA = EncodeRegTo64(addr_reg);

if (is_immediate)
MOVI2R(XA, imm_addr);
bool addr_reg_set = !is_immediate;
const auto set_addr_reg_if_needed = [&] {
if (!addr_reg_set)
MOVI2R(XA, imm_addr);
};

if (update)
const bool early_update = !jo.memcheck && dest != static_cast<u32>(addr);
if (update && early_update)
{
gpr.BindToRegister(addr, false);
set_addr_reg_if_needed();
MOV(gpr.R(addr), addr_reg);
}

BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
regs_in_use[DecodeReg(dest_reg)] = 0;
if (!update || early_update)
regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
if (!jo.memcheck)
regs_in_use[DecodeReg(dest_reg)] = 0;

u32 access_size = BackPatchInfo::GetFlagSize(flags);
u32 mmio_address = 0;
@@ -122,6 +129,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o

if (jo.fastmem_arena && is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr))
{
set_addr_reg_if_needed();
EmitBackpatchRoutine(flags, true, false, dest_reg, XA, BitSet32(0), BitSet32(0));
}
else if (mmio_address)
@@ -131,13 +139,25 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o
}
else
{
set_addr_reg_if_needed();
EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, dest_reg, XA, regs_in_use, fprs_in_use);
}

gpr.BindToRegister(dest, false, true);
ASSERT(dest_reg == gpr.R(dest));

if (update && !early_update)
{
gpr.BindToRegister(addr, false);
set_addr_reg_if_needed();
MOV(gpr.R(addr), addr_reg);
}

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
}

void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset)
void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset,
bool update)
{
// We want to make sure to not get LR as a temp register
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
@@ -152,11 +172,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
if (dest != -1 && !gpr.IsImm(dest))
reg_dest = gpr.R(dest);

BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
regs_in_use[DecodeReg(ARM64Reg::W1)] = 0;

ARM64Reg addr_reg = ARM64Reg::W1;

u32 imm_addr = 0;
@@ -222,6 +237,26 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s

ARM64Reg XA = EncodeRegTo64(addr_reg);

bool addr_reg_set = !is_immediate;
const auto set_addr_reg_if_needed = [&] {
if (!addr_reg_set)
MOVI2R(XA, imm_addr);
};

const bool early_update = !jo.memcheck && value != static_cast<u32>(dest);
if (update && early_update)
{
gpr.BindToRegister(dest, false);
set_addr_reg_if_needed();
MOV(gpr.R(dest), addr_reg);
}

BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
if (!update || early_update)
regs_in_use[DecodeReg(ARM64Reg::W1)] = 0;

u32 access_size = BackPatchInfo::GetFlagSize(flags);
u32 mmio_address = 0;
if (is_immediate)
@@ -255,7 +290,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
}
else if (jo.fastmem_arena && is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr))
{
MOVI2R(XA, imm_addr);
set_addr_reg_if_needed();
EmitBackpatchRoutine(flags, true, false, RS, XA, BitSet32(0), BitSet32(0));
}
else if (mmio_address)
@@ -265,12 +300,17 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
}
else
{
if (is_immediate)
MOVI2R(XA, imm_addr);

set_addr_reg_if_needed();
EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, RS, XA, regs_in_use, fprs_in_use);
}

if (update && !early_update)
{
gpr.BindToRegister(dest, false);
set_addr_reg_if_needed();
MOV(gpr.R(dest), addr_reg);
}

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
}

@@ -306,7 +346,6 @@ void JitArm64::lXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
FALLBACK_IF(jo.memcheck);

u32 a = inst.RA, b = inst.RB, d = inst.RD;
s32 offset = inst.SIMM_16;
@@ -385,7 +424,6 @@ void JitArm64::stX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
FALLBACK_IF(jo.memcheck);

u32 a = inst.RA, b = inst.RB, s = inst.RS;
s32 offset = inst.SIMM_16;
@@ -444,122 +482,104 @@ void JitArm64::stX(UGeckoInstruction inst)
break;
}

SafeStoreFromReg(update ? a : (a ? a : -1), s, regOffset, flags, offset);

if (update)
{
gpr.BindToRegister(a, false);

ARM64Reg WA = gpr.GetReg();
ARM64Reg RB = {};
ARM64Reg RA = gpr.R(a);
if (regOffset != -1)
RB = gpr.R(regOffset);
if (regOffset == -1)
{
ADDI2R(RA, RA, offset, WA);
}
else
{
ADD(RA, RA, RB);
}
gpr.Unlock(WA);
}
SafeStoreFromReg(update ? a : (a ? a : -1), s, regOffset, flags, offset, update);
}

// JIT handler for the PowerPC lmw instruction: computes a start address from rA (or 0 when the
// rA field is 0) plus SIMM_16 and emits 32-bit loads into guest GPRs rD..r31 from addresses
// spaced 4 bytes apart.
// NOTE(review): this text appears to interleave two versions of the body (`a` is declared
// twice, there are two `for` headers and two Unlock calls) - it looks like a diff whose +/-
// markers were stripped. Confirm which lines are live before compiling.
void JitArm64::lmw(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
FALLBACK_IF(!jo.fastmem || jo.memcheck);

u32 a = inst.RA;
u32 a = inst.RA, d = inst.RD;
s32 offset = inst.SIMM_16;

ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);
// W0 is used below as the address register, so it is locked together with W30.
gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);

// MMU games make use of a >= d despite this being invalid according to the PEM.
// Because of this, make sure to not re-read rA after starting doing the loads.
ARM64Reg addr_reg = ARM64Reg::W0;
if (a)
{
ADDI2R(WA, gpr.R(a), inst.SIMM_16, WA);
ADD(XA, XA, MEM_REG);
// When rA holds a known immediate the effective address is folded at JIT time.
if (gpr.IsImm(a))
MOVI2R(addr_reg, gpr.GetImm(a) + offset);
else
ADDI2R(addr_reg, gpr.R(a), offset, addr_reg);
}
else
{
ADDI2R(XA, MEM_REG, (u32)(s32)(s16)inst.SIMM_16, XA);
MOVI2R(addr_reg, offset);
}

for (int i = inst.RD; i < 32; i++)
// TODO: This doesn't handle rollback on DSI correctly
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32;
for (u32 i = d; i < 32; i++)
{
int remaining = 32 - i;
if (remaining >= 4)
{
// Four registers at once: a post-indexed LDP fetches 16 bytes into two 64-bit registers,
// REV32 byte-swaps each 32-bit half, and LSR #32 moves the upper halves into the
// registers for i + 1 and i + 3.
gpr.BindToRegister(i + 3, false);
gpr.BindToRegister(i + 2, false);
gpr.BindToRegister(i + 1, false);
gpr.BindToRegister(i, false);
ARM64Reg RX4 = gpr.R(i + 3);
ARM64Reg RX3 = gpr.R(i + 2);
ARM64Reg RX2 = gpr.R(i + 1);
ARM64Reg RX1 = gpr.R(i);
LDP(IndexType::Post, EncodeRegTo64(RX1), EncodeRegTo64(RX3), XA, 16);
REV32(EncodeRegTo64(RX1), EncodeRegTo64(RX1));
REV32(EncodeRegTo64(RX3), EncodeRegTo64(RX3));
LSR(EncodeRegTo64(RX2), EncodeRegTo64(RX1), 32);
LSR(EncodeRegTo64(RX4), EncodeRegTo64(RX3), 32);
i += 3;
}
else if (remaining >= 2)
{
// Two registers at once: 32-bit LDP (8 bytes, post-indexed) followed by a byte swap of each.
gpr.BindToRegister(i + 1, false);
gpr.BindToRegister(i, false);
ARM64Reg RX2 = gpr.R(i + 1);
ARM64Reg RX1 = gpr.R(i);
LDP(IndexType::Post, RX1, RX2, XA, 8);
REV32(RX1, RX1);
REV32(RX2, RX2);
++i;
}
else
{
// Single remaining register: one post-indexed 4-byte load plus byte swap.
gpr.BindToRegister(i, false);
ARM64Reg RX = gpr.R(i);
LDR(IndexType::Post, RX, XA, 4);
REV32(RX, RX);
}
// Bind the destination without loading its old value and without marking it dirty yet;
// it is marked dirty only after the access has been emitted (see the second bind below).
gpr.BindToRegister(i, false, false);
ARM64Reg dest_reg = gpr.R(i);

// Register sets handed to EmitBackpatchRoutine - presumably the registers it must preserve
// around its slow path; confirm against EmitBackpatchRoutine.
BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
// The address register is not advanced after the last register (i == 31), so it is
// dropped from the set for that final access.
if (i == 31)
regs_in_use[DecodeReg(addr_reg)] = 0;
// The destination is only dropped from the set when memcheck is off.
if (!jo.memcheck)
regs_in_use[DecodeReg(dest_reg)] = 0;

EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, dest_reg, EncodeRegTo64(addr_reg),
regs_in_use, fprs_in_use);

// Now mark the destination dirty; the binding must still be the same host register.
gpr.BindToRegister(i, false, true);
ASSERT(dest_reg == gpr.R(i));

// Step to the next word unless this was the last register.
if (i != 31)
ADD(addr_reg, addr_reg, 4);
}

gpr.Unlock(WA);
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
}

// JIT handler for the PowerPC stmw instruction: computes a start address from rA (or 0 when
// the rA field is 0) plus SIMM_16 and emits 32-bit stores of guest GPRs rS..r31 to addresses
// spaced 4 bytes apart.
// NOTE(review): this text appears to interleave two versions of the body (`a` is declared
// twice, there are two `for` headers and two Unlock calls) - it looks like a diff whose +/-
// markers were stripped. Confirm which lines are live before compiling.
void JitArm64::stmw(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
FALLBACK_IF(!jo.fastmem || jo.memcheck);

u32 a = inst.RA;
u32 a = inst.RA, s = inst.RS;
s32 offset = inst.SIMM_16;

ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);
ARM64Reg WB = gpr.GetReg();
// W1 is used below as the address register; W0 is removed from regs_in_use in the loop,
// so it is presumably a scratch register for the store routine - confirm.
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);

ARM64Reg addr_reg = ARM64Reg::W1;
if (a)
{
ADDI2R(WA, gpr.R(a), inst.SIMM_16, WA);
ADD(XA, XA, MEM_REG);
// When rA holds a known immediate the effective address is folded at JIT time.
if (gpr.IsImm(a))
MOVI2R(addr_reg, gpr.GetImm(a) + offset);
else
ADDI2R(addr_reg, gpr.R(a), offset, addr_reg);
}
else
{
ADDI2R(XA, MEM_REG, (u32)(s32)(s16)inst.SIMM_16, XA);
MOVI2R(addr_reg, offset);
}

for (int i = inst.RD; i < 32; i++)
// TODO: This doesn't handle rollback on DSI correctly
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32;
for (u32 i = s; i < 32; i++)
{
// Byte-swap the guest register into WB and store it at a fixed offset of
// (i - first register) * 4 from XA.
ARM64Reg RX = gpr.R(i);
REV32(WB, RX);
STR(IndexType::Unsigned, WB, XA, (i - inst.RD) * 4);
ARM64Reg src_reg = gpr.R(i);

// Register sets handed to EmitBackpatchRoutine - presumably the registers it must preserve
// around its slow path; confirm against EmitBackpatchRoutine.
BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
// The address register is not advanced after the last register (i == 31), so it is
// dropped from the set for that final access.
if (i == 31)
regs_in_use[DecodeReg(addr_reg)] = 0;

EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, src_reg, EncodeRegTo64(addr_reg),
regs_in_use, fprs_in_use);

// Step to the next word unless this was the last register.
if (i != 31)
ADD(addr_reg, addr_reg, 4);
}

gpr.Unlock(WA, WB);
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
}

void JitArm64::dcbx(UGeckoInstruction inst)
@@ -743,7 +763,6 @@ void JitArm64::dcbz(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
FALLBACK_IF(jo.memcheck || !jo.fastmem_arena);
FALLBACK_IF(SConfig::GetInstance().bLowDCBZHack);

int a = inst.RA, b = inst.RB;
@@ -796,7 +815,7 @@ void JitArm64::dcbz(UGeckoInstruction inst)
BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();
gprs_to_push[DecodeReg(ARM64Reg::W0)] = 0;

EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, true, true, ARM64Reg::W0,
EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, jo.fastmem, jo.fastmem, ARM64Reg::W0,
EncodeRegTo64(addr_reg), gprs_to_push, fprs_to_push);

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
@@ -21,7 +21,6 @@ void JitArm64::lfXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
FALLBACK_IF(jo.memcheck);

u32 a = inst.RA, b = inst.RB;

@@ -80,7 +79,7 @@ void JitArm64::lfXX(UGeckoInstruction inst)
gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0);

const ARM64Reg VD = fpr.RW(inst.FD, type);
const ARM64Reg VD = fpr.RW(inst.FD, type, false);
ARM64Reg addr_reg = ARM64Reg::W0;

if (update)
@@ -155,17 +154,20 @@ void JitArm64::lfXX(UGeckoInstruction inst)
if (is_immediate)
MOVI2R(XA, imm_addr);

if (update)
const bool early_update = !jo.memcheck;
if (update && early_update)
{
gpr.BindToRegister(a, false);
MOV(gpr.R(a), addr_reg);
}

BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
if (!update || early_update)
regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
fprs_in_use[DecodeReg(ARM64Reg::Q0)] = 0;
fprs_in_use[DecodeReg(VD)] = 0;
if (!jo.memcheck)
fprs_in_use[DecodeReg(VD)] = 0;

if (jo.fastmem_arena && is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr))
{
@@ -176,6 +178,15 @@ void JitArm64::lfXX(UGeckoInstruction inst)
EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VD, XA, regs_in_use, fprs_in_use);
}

const ARM64Reg VD_again = fpr.RW(inst.FD, type, true);
ASSERT(VD == VD_again);

if (update && !early_update)
{
gpr.BindToRegister(a, false);
MOV(gpr.R(a), addr_reg);
}

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0);
}
@@ -184,7 +195,6 @@ void JitArm64::stfXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
FALLBACK_IF(jo.memcheck);

u32 a = inst.RA, b = inst.RB;

@@ -334,26 +344,25 @@ void JitArm64::stfXX(UGeckoInstruction inst)

ARM64Reg XA = EncodeRegTo64(addr_reg);

if (is_immediate && !(jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(imm_addr)))
{
MOVI2R(XA, imm_addr);
bool addr_reg_set = !is_immediate;
const auto set_addr_reg_if_needed = [&] {
if (!addr_reg_set)
MOVI2R(XA, imm_addr);
};

if (update)
{
gpr.BindToRegister(a, false);
MOV(gpr.R(a), addr_reg);
}
}
else if (!is_immediate && update)
const bool early_update = !jo.memcheck;
if (update && early_update)
{
gpr.BindToRegister(a, false);
set_addr_reg_if_needed();
MOV(gpr.R(a), addr_reg);
}

BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
regs_in_use[DecodeReg(ARM64Reg::W1)] = 0;
if (!update || early_update)
regs_in_use[DecodeReg(ARM64Reg::W1)] = 0;
fprs_in_use[DecodeReg(ARM64Reg::Q0)] = 0;

if (is_immediate)
@@ -378,28 +387,31 @@ void JitArm64::stfXX(UGeckoInstruction inst)

STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
js.fifoBytesSinceCheck += accessSize >> 3;

if (update)
{
// Chance of this happening is fairly low, but support it
gpr.BindToRegister(a, false);
MOVI2R(gpr.R(a), imm_addr);
}
}
else if (jo.fastmem_arena && PowerPC::IsOptimizableRAMAddress(imm_addr))
{
set_addr_reg_if_needed();
EmitBackpatchRoutine(flags, true, false, V0, XA, BitSet32(0), BitSet32(0));
}
else
{
set_addr_reg_if_needed();
EmitBackpatchRoutine(flags, false, false, V0, XA, regs_in_use, fprs_in_use);
}
}
else
{
set_addr_reg_if_needed();
EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, V0, XA, regs_in_use, fprs_in_use);
}

if (update && !early_update)
{
gpr.BindToRegister(a, false);
set_addr_reg_if_needed();
MOV(gpr.R(a), addr_reg);
}

if (want_single && !have_single)
fpr.Unlock(V0);

@@ -8,6 +8,7 @@

#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/JitArm64/Jit.h"
#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
#include "Core/PowerPC/PPCTables.h"
@@ -19,7 +20,6 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
FALLBACK_IF(jo.memcheck);

// If we have a fastmem arena, the asm routines assume address translation is on.
FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
@@ -47,7 +47,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
constexpr ARM64Reg addr_reg = ARM64Reg::W0;
constexpr ARM64Reg scale_reg = ARM64Reg::W1;
constexpr ARM64Reg type_reg = ARM64Reg::W2;
ARM64Reg VS = fpr.RW(inst.RS, RegType::Single);
ARM64Reg VS = fpr.RW(inst.RS, RegType::Single, false);

if (inst.RA || update) // Always uses the register on update
{
@@ -66,7 +66,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
MOVI2R(addr_reg, (u32)offset);
}

if (update)
const bool early_update = !jo.memcheck;
if (update && early_update)
{
gpr.BindToRegister(inst.RA, false);
MOV(gpr.R(inst.RA), addr_reg);
@@ -78,9 +79,11 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();

// Wipe the registers we are using as temporaries
gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
if (!update || early_update)
gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
fprs_in_use[DecodeReg(ARM64Reg::Q0)] = false;
fprs_in_use[DecodeReg(VS)] = 0;
if (!jo.memcheck)
fprs_in_use[DecodeReg(VS)] = 0;

u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
if (!w)
@@ -99,6 +102,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
BLR(EncodeRegTo64(type_reg));

WriteConditionalExceptionExit(EXCEPTION_DSI, ARM64Reg::X30, ARM64Reg::Q1);

m_float_emit.ORR(EncodeRegToDouble(VS), ARM64Reg::D0, ARM64Reg::D0);
}

@@ -108,6 +113,15 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
m_float_emit.INS(32, VS, 1, ARM64Reg::Q0, 0);
}

const ARM64Reg VS_again = fpr.RW(inst.RS, RegType::Single, true);
ASSERT(VS == VS_again);

if (update && !early_update)
{
gpr.BindToRegister(inst.RA, false);
MOV(gpr.R(inst.RA), addr_reg);
}

gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0);
if (!js.assumeNoPairedQuantize)
@@ -121,7 +135,6 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
FALLBACK_IF(jo.memcheck);

// If we have a fastmem arena, the asm routines assume address translation is on.
FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
@@ -198,7 +211,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
MOVI2R(addr_reg, (u32)offset);
}

if (update)
const bool early_update = !jo.memcheck;
if (update && early_update)
{
gpr.BindToRegister(inst.RA, false);
MOV(gpr.R(inst.RA), addr_reg);
@@ -211,7 +225,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)

// Wipe the registers we are using as temporaries
gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
gprs_in_use[DecodeReg(ARM64Reg::W1)] = false;
if (!update || early_update)
gprs_in_use[DecodeReg(ARM64Reg::W1)] = false;

u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
if (!w)
@@ -229,6 +244,14 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
MOVP2R(ARM64Reg::X30, w ? single_store_quantized : paired_store_quantized);
LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
BLR(EncodeRegTo64(type_reg));

WriteConditionalExceptionExit(EXCEPTION_DSI, ARM64Reg::X30, ARM64Reg::Q1);
}

if (update && !early_update)
{
gpr.BindToRegister(inst.RA, false);
MOV(gpr.R(inst.RA), addr_reg);
}

if (js.assumeNoPairedQuantize && !have_single)
@@ -343,25 +343,29 @@ void Arm64GPRCache::SetImmediate(const GuestRegInfo& guest_reg, u32 imm)
reg.LoadToImm(imm);
}

// Binds the guest register described by guest_reg to a host register.
//   do_load   - when a host register has to be allocated, also emit an LDR of the guest value
//               from PPC_REG + guest_reg.ppc_offset (asserts the register was not Discarded).
//   set_dirty - dirty state given to a newly allocated binding; for a register that is already
//               bound, true marks it dirty and false does not touch its dirty state.
// NOTE(review): two signatures and an unconditional reg.SetDirty(true) appear below, which
// contradicts the set_dirty handling further down - this looks like a diff whose +/- markers
// were stripped. Confirm which lines are live.
void Arm64GPRCache::BindToRegister(const GuestRegInfo& guest_reg, bool do_load)
void Arm64GPRCache::BindToRegister(const GuestRegInfo& guest_reg, bool do_load, bool set_dirty)
{
OpArg& reg = guest_reg.reg;
const size_t bitsize = guest_reg.bitsize;

reg.ResetLastUsed();
reg.SetDirty(true);

const RegType reg_type = reg.GetType();
if (reg_type == RegType::NotLoaded || reg_type == RegType::Discarded)
{
// No host register yet: allocate one (passed through EncodeRegTo64 for 64-bit guest
// registers) and give the new binding the requested dirty state.
const ARM64Reg host_reg = bitsize != 64 ? GetReg() : EncodeRegTo64(GetReg());
reg.Load(host_reg);
reg.SetDirty(set_dirty);
if (do_load)
{
ASSERT_MSG(DYNA_REC, reg_type != RegType::Discarded, "Attempted to load a discarded value");
m_emit->LDR(IndexType::Unsigned, host_reg, PPC_REG, u32(guest_reg.ppc_offset));
}
}
else if (set_dirty)
{
// Already bound: only ever raise the dirty flag here, never clear it.
reg.SetDirty(true);
}
}

void Arm64GPRCache::GetAllocationOrder()
@@ -570,26 +574,15 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return ARM64Reg::INVALID_REG;
}

ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type, bool set_dirty)
{
OpArg& reg = m_guest_registers[preg];

bool was_dirty = reg.IsDirty();

IncrementAllUsed();
reg.ResetLastUsed();

reg.SetDirty(true);

// If not loaded at all, just alloc a new one.
if (reg.GetType() == RegType::NotLoaded || reg.GetType() == RegType::Discarded)
{
reg.Load(GetReg(), type);
return reg.GetReg();
}

// Only the lower value will be overwritten, so we must be extra careful to store PSR1 if dirty.
if ((type == RegType::LowerPair || type == RegType::LowerPairSingle) && was_dirty)
if (reg.IsDirty() && (type == RegType::LowerPair || type == RegType::LowerPairSingle))
{
// We must *not* change host_reg as this register might still be in use. So it's fine to
// store this register, but it's *not* fine to convert it to double. So for double conversion,
@@ -612,13 +605,15 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
m_jit->ConvertSingleToDoubleLower(preg, flush_reg, flush_reg, scratch_reg);
m_float_emit->STR(64, IndexType::Unsigned, flush_reg, PPC_REG, u32(PPCSTATE_OFF_PS1(preg)));
Unlock(scratch_reg);
reg.Load(host_reg, RegType::LowerPairSingle);
break;
}
else
{
m_jit->ConvertSingleToDoublePair(preg, flush_reg, host_reg, flush_reg);
m_float_emit->STR(128, IndexType::Unsigned, flush_reg, PPC_REG,
u32(PPCSTATE_OFF_PS0(preg)));
reg.SetDirty(false);
}
break;
case RegType::Register:
@@ -627,6 +622,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
// It would take longer to do an insert to a temporary and a 64bit store than to just do this.
m_float_emit->STR(128, IndexType::Unsigned, flush_reg, PPC_REG,
static_cast<s32>(PPCSTATE_OFF_PS0(preg)));
reg.SetDirty(false);
break;
case RegType::DuplicatedSingle:
flush_reg = GetReg();
@@ -636,6 +632,8 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
// Store PSR1 (which is equal to PSR0) in memory.
m_float_emit->STR(64, IndexType::Unsigned, flush_reg, PPC_REG,
static_cast<s32>(PPCSTATE_OFF_PS1(preg)));
reg.Load(host_reg, reg.GetType() == RegType::DuplicatedSingle ? RegType::LowerPairSingle :
RegType::LowerPair);
break;
default:
// All other types don't store anything in PSR1.
@@ -646,7 +644,18 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
Unlock(flush_reg);
}

reg.Load(reg.GetReg(), type);
if (reg.GetType() == RegType::NotLoaded || reg.GetType() == RegType::Discarded)
{
// If not loaded at all, just alloc a new one.
reg.Load(GetReg(), type);
reg.SetDirty(set_dirty);
}
else if (set_dirty)
{
reg.Load(reg.GetReg(), type);
reg.SetDirty(true);
}

return reg.GetReg();
}

@@ -266,9 +266,15 @@ class Arm64GPRCache : public Arm64RegCache
// Gets the immediate that a register is set to, only valid for guest GPRs
u32 GetImm(size_t preg) const { return GetGuestGPROpArg(preg).GetImm(); }
// Binds a guest GPR to a host register, optionally loading its value
void BindToRegister(size_t preg, bool do_load) { BindToRegister(GetGuestGPR(preg), do_load); }
void BindToRegister(size_t preg, bool do_load, bool set_dirty = true)
{
BindToRegister(GetGuestGPR(preg), do_load, set_dirty);
}
// Binds a guest CR to a host register, optionally loading its value
void BindCRToRegister(size_t preg, bool do_load) { BindToRegister(GetGuestCR(preg), do_load); }
void BindCRToRegister(size_t preg, bool do_load, bool set_dirty = true)
{
BindToRegister(GetGuestCR(preg), do_load, set_dirty);
}
BitSet32 GetCallerSavedUsed() const override;

void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
@@ -307,7 +313,7 @@ class Arm64GPRCache : public Arm64RegCache

Arm64Gen::ARM64Reg R(const GuestRegInfo& guest_reg);
void SetImmediate(const GuestRegInfo& guest_reg, u32 imm);
void BindToRegister(const GuestRegInfo& guest_reg, bool do_load);
void BindToRegister(const GuestRegInfo& guest_reg, bool do_load, bool set_dirty = true);

void FlushRegisters(BitSet32 regs, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg);
void FlushCRRegisters(BitSet32 regs, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg);
@@ -326,7 +332,7 @@ class Arm64FPRCache : public Arm64RegCache
// Will dump an immediate to the host register as well
Arm64Gen::ARM64Reg R(size_t preg, RegType type);

Arm64Gen::ARM64Reg RW(size_t preg, RegType type);
Arm64Gen::ARM64Reg RW(size_t preg, RegType type, bool set_dirty = true);

BitSet32 GetCallerSavedUsed() const override;

@@ -495,7 +495,9 @@ void JitArm64::GenerateQuantizedLoads()
// Q1 is a temporary
ARM64Reg addr_reg = ARM64Reg::X0;
ARM64Reg scale_reg = ARM64Reg::X1;
BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2, 3};
BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{2, 3};
if (!jo.memcheck)
gprs_to_push &= ~BitSet32{0};
BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1};
ARM64FloatEmitter float_emit(this);

@@ -524,8 +526,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);

MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@@ -542,8 +544,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);

MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@@ -559,8 +561,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);

MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@@ -576,8 +578,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);

MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@@ -605,8 +607,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);

MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@@ -623,8 +625,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);

MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@@ -640,8 +642,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);

MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@@ -657,8 +659,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);

MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@@ -701,7 +703,9 @@ void JitArm64::GenerateQuantizedStores()
// Q1 is a temporary
ARM64Reg scale_reg = ARM64Reg::X0;
ARM64Reg addr_reg = ARM64Reg::X1;
BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 1, 2};
BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2};
if (!jo.memcheck)
gprs_to_push &= ~BitSet32{1};
BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1};
ARM64FloatEmitter float_emit(this);