@@ -36,14 +36,13 @@ class Jit64AsmRoutineManager : public CommonAsmRoutines

explicit Jit64AsmRoutineManager(Jit64& jit);

void Init(u8* stack_top);
void Init();

void ResetStack(Gen::X64CodeBlock& emitter);

private:
void Generate();
void GenerateCommon();

u8* m_stack_top = nullptr;
JitBase& m_jit;
};
@@ -38,11 +38,6 @@ constexpr size_t CODE_SIZE = 1024 * 1024 * 32;
constexpr size_t FARCODE_SIZE = 1024 * 1024 * 64;
constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 64;

constexpr size_t STACK_SIZE = 2 * 1024 * 1024;
constexpr size_t SAFE_STACK_SIZE = 512 * 1024;
constexpr size_t GUARD_SIZE = 64 * 1024; // two guards - bottom (permanent) and middle (see above)
constexpr size_t GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE;

JitArm64::JitArm64() : m_float_emit(this)
{
}
@@ -71,10 +66,6 @@ void JitArm64::Init()
code_block.m_gpa = &js.gpa;
code_block.m_fpa = &js.fpa;

m_enable_blr_optimization = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging;
m_cleanup_after_stackfault = false;

AllocStack();
GenerateAsm();

ResetFreeMemoryRanges();
@@ -117,9 +108,8 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
bool success = false;

// Handle BLR stack faults, may happen in C++ code.
uintptr_t stack = (uintptr_t)m_stack_base;
uintptr_t diff = access_address - stack;
if (diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE)
const uintptr_t stack_guard = reinterpret_cast<uintptr_t>(m_stack_guard);
if (access_address >= stack_guard && access_address < stack_guard + GUARD_SIZE)
success = HandleStackFault();

// If the fault is in JIT code space, look for fastmem areas.
@@ -156,23 +146,6 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
return success;
}

// Permanently disables the BLR return-prediction optimization after the
// emulated program overflowed into the fake stack's guard region.
// Called from the fault handler; returns true if the fault was consumed here.
bool JitArm64::HandleStackFault()
{
  // If the optimization is off, the guard pages aren't ours — let the
  // regular fault handling proceed.
  if (!m_enable_blr_optimization)
    return false;

  ERROR_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");
  m_enable_blr_optimization = false;
#ifndef _WIN32
  // Re-enable access to the middle guard page so execution can continue on
  // the remaining portion of the fake stack until we reach a safe point.
  Common::UnWriteProtectMemory(m_stack_base + GUARD_OFFSET, GUARD_SIZE);
#endif
  // Invalidate all compiled blocks (they may contain bad calls) and force an
  // exception check so cleanup happens at the next dispatch.
  GetBlockCache()->InvalidateICache(0, 0xffffffff, true);
  Core::System::GetInstance().GetCoreTiming().ForceExceptionCheck(0);
  m_cleanup_after_stackfault = true;

  return true;
}

void JitArm64::ClearCache()
{
m_fault_to_handler.clear();
@@ -205,7 +178,6 @@ void JitArm64::Shutdown()
memory.ShutdownFastmemArena();
FreeCodeSpace();
blocks.Shutdown();
FreeStack();
}

void JitArm64::FallBackToInterpreter(UGeckoInstruction inst)
@@ -337,40 +309,6 @@ void JitArm64::ResetStack()
ADD(ARM64Reg::SP, ARM64Reg::X0, 0);
}

// Allocates the fake stack used by the BLR optimization (non-Windows), or
// requests an early stack-overflow guarantee on Windows. On allocation
// failure the optimization is disabled rather than treated as fatal.
void JitArm64::AllocStack()
{
  if (!m_enable_blr_optimization)
    return;

#ifndef _WIN32
  m_stack_base = static_cast<u8*>(Common::AllocateMemoryPages(STACK_SIZE));
  if (!m_stack_base)
  {
    m_enable_blr_optimization = false;
    return;
  }

  // The stack grows downward, so start the stack pointer at the top.
  m_stack_pointer = m_stack_base + STACK_SIZE;
  // Two guards: bottom (permanent) and middle (tripped on overflow).
  Common::ReadProtectMemory(m_stack_base, GUARD_SIZE);
  Common::ReadProtectMemory(m_stack_base + GUARD_OFFSET, GUARD_SIZE);
#else
  // For Windows we just keep using the system stack and reserve a large amount of memory at the end
  // of the stack.
  ULONG reserveSize = SAFE_STACK_SIZE;
  SetThreadStackGuarantee(&reserveSize);
#endif
}

// Releases the fake BLR stack allocated by AllocStack(). No-op on Windows,
// where the system stack is used. Safe to call when allocation failed or
// never happened (m_stack_base is null then).
void JitArm64::FreeStack()
{
#ifndef _WIN32
  if (m_stack_base)
    Common::FreeMemoryPages(m_stack_base, STACK_SIZE);
  m_stack_base = nullptr;
  m_stack_pointer = nullptr;
#endif
}

void JitArm64::IntializeSpeculativeConstants()
{
// If the block depends on an input register which looks like a gather pipe or MMIO related
@@ -696,14 +634,22 @@ void JitArm64::EndTimeProfile(JitBlock* b)

void JitArm64::Run()
{
ProtectStack();

CompiledCode pExecAddr = (CompiledCode)enter_code;
pExecAddr();

UnprotectStack();
}

void JitArm64::SingleStep()
{
ProtectStack();

CompiledCode pExecAddr = (CompiledCode)enter_code;
pExecAddr();

UnprotectStack();
}

void JitArm64::Trace()
@@ -740,15 +686,7 @@ void JitArm64::Jit(u32 em_address)

void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
{
if (m_cleanup_after_stackfault)
{
ClearCache();
m_cleanup_after_stackfault = false;
#ifdef _WIN32
// The stack is in an invalid state with no guard page, reset it.
_resetstkoflw();
#endif
}
CleanUpAfterStackFault();

if (SConfig::GetInstance().bJITNoBlockCache)
ClearCache();
@@ -32,7 +32,6 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
bool IsInCodeSpace(const u8* ptr) const { return IsInSpace(ptr); }
bool HandleFault(uintptr_t access_address, SContext* ctx) override;
void DoBacktrace(uintptr_t access_address, SContext* ctx);
bool HandleStackFault() override;
bool HandleFastmemFault(SContext* ctx);

void ClearCache() override;
@@ -288,8 +287,6 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
void DoDownCount();
void Cleanup();
void ResetStack();
void AllocStack();
void FreeStack();

void ResetFreeMemoryRanges();

@@ -363,12 +360,6 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
u8* m_near_code_end = nullptr;
bool m_near_code_write_failed = false;

bool m_enable_blr_optimization = false;
bool m_cleanup_after_stackfault = false;
u8* m_stack_base = nullptr;
u8* m_stack_pointer = nullptr;
u8* m_saved_stack_pointer = nullptr;

HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far;
};
@@ -45,23 +45,14 @@ void JitArm64::GenerateAsm()

MOVP2R(PPC_REG, &PowerPC::ppcState);

// Swap the stack pointer, so we have proper guard pages.
// Store the stack pointer, so we can reset it if the BLR optimization fails.
ADD(ARM64Reg::X0, ARM64Reg::SP, 0);
STR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1,
MOVPage2R(ARM64Reg::X1, &m_saved_stack_pointer));
LDR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1, MOVPage2R(ARM64Reg::X1, &m_stack_pointer));
FixupBranch no_fake_stack = CBZ(ARM64Reg::X0);
ADD(ARM64Reg::SP, ARM64Reg::X0, 0);
SetJumpTarget(no_fake_stack);
STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));

// Push {nullptr; -1} as invalid destination on the stack.
MOVI2R(ARM64Reg::X0, 0xFFFFFFFF);
STP(IndexType::Pre, ARM64Reg::ZR, ARM64Reg::X0, ARM64Reg::SP, -16);

// Store the stack pointer, so we can reset it if the BLR optimization fails.
ADD(ARM64Reg::X0, ARM64Reg::SP, 0);
STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));

// The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
// Advance() does an exception check so we don't know what PC to use until afterwards.
FixupBranch to_start_of_timing_slice = B();
@@ -204,9 +195,9 @@ void JitArm64::GenerateAsm()
if (enable_debugging)
SetJumpTarget(debug_exit);

// Reset the stack pointer, as the BLR optimization has touched it.
LDR(IndexType::Unsigned, ARM64Reg::X0, ARM64Reg::X1,
MOVPage2R(ARM64Reg::X1, &m_saved_stack_pointer));
// Reset the stack pointer, since the BLR optimization may have pushed things onto the stack
// without popping them.
LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
ADD(ARM64Reg::SP, ARM64Reg::X0, 0);

m_float_emit.ABI_PopRegisters(regs_to_save_fpr, ARM64Reg::X30);
@@ -3,15 +3,53 @@

#include "Core/PowerPC/JitCommon/JitBase.h"

#include "Common/Align.h"
#include "Common/CommonTypes.h"
#include "Common/MemoryUtil.h"
#include "Common/Thread.h"
#include "Core/Config/MainSettings.h"
#include "Core/ConfigManager.h"
#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/HW/CPU.h"
#include "Core/PowerPC/PPCAnalyst.h"
#include "Core/PowerPC/PowerPC.h"
#include "Core/System.h"

#ifdef _WIN32
#include <windows.h>
#include <processthreadsapi.h>
#else
#include <unistd.h>
#endif

// The BLR optimization is nice, but it means that JITted code can overflow the
// native stack by repeatedly running BL. (The chance of this happening in any
// retail game is close to 0, but correctness is correctness...) Also, the
// overflow might not happen directly in the JITted code but in a C++ function
// called from it, so we can't just adjust RSP in the case of a fault.
// Instead, we have to have extra stack space preallocated under the fault
// point which allows the code to continue, after wiping the JIT cache so we
// can reset things at a safe point. Once this condition trips, the
// optimization is permanently disabled, under the assumption this will never
// happen in practice.

// On Unix, we just mark an appropriate region of the stack as PROT_NONE and
// handle it the same way as fastmem faults. It's safe to take a fault with a
// bad RSP, because on Linux we can use sigaltstack and on OS X we're already
// on a separate thread.

// Windows is... under-documented.
// It already puts guard pages so it can automatically grow the stack and it
// doesn't look like there is a way to hook into a guard page fault and implement
// our own logic.
// But when Windows reaches the last guard page, it raises a "Stack Overflow"
// exception which we can hook into, however by default it leaves you with less
// than 4kb of stack. So we use SetThreadStackGuarantee to trigger the Stack
// Overflow early while we still have 256kb of stack remaining.
// After resetting the stack to the top, we call _resetstkoflw() to restore
// the guard page at the 256kb mark.

const u8* JitBase::Dispatch(JitBase& jit)
{
return jit.GetBlockCache()->Dispatch();
@@ -72,6 +110,107 @@ void JitBase::RefreshConfig()
analyzer.SetDivByZeroExceptionsEnabled(m_enable_div_by_zero_exceptions);
}

// Decides whether the BLR optimization can be used for this session: it
// requires block linking and fastmem, and is incompatible with debugging.
// Also clears any pending stack-fault cleanup from a previous session.
void JitBase::InitBLROptimization()
{
  m_enable_blr_optimization = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging;
  m_cleanup_after_stackfault = false;
}

// Arms stack-overflow protection for the BLR optimization on the current
// thread's native stack. On Windows this requests an early "stack overflow"
// exception via SetThreadStackGuarantee; elsewhere it read-protects a guard
// region near the low end of the stack. If the stack layout can't be
// determined or is too small, the optimization is disabled instead.
void JitBase::ProtectStack()
{
  if (!m_enable_blr_optimization)
    return;

#ifdef _WIN32
  ULONG reserveSize = SAFE_STACK_SIZE;
  SetThreadStackGuarantee(&reserveSize);
#else
  auto [stack_addr, stack_size] = Common::GetCurrentThreadStack();

  const uintptr_t stack_base_addr = reinterpret_cast<uintptr_t>(stack_addr);
  // Address of a local — i.e. approximately the current stack pointer.
  const uintptr_t stack_middle_addr = reinterpret_cast<uintptr_t>(&stack_addr);
  // Sanity check: the current stack pointer must lie within the reported
  // stack range, otherwise the reported base can't be trusted.
  if (stack_middle_addr < stack_base_addr || stack_middle_addr >= stack_base_addr + stack_size)
  {
    PanicAlertFmt("Failed to get correct stack base");
    m_enable_blr_optimization = false;
    return;
  }

  const long page_size = sysconf(_SC_PAGESIZE);
  if (page_size <= 0)
  {
    PanicAlertFmt("Failed to get page size");
    m_enable_blr_optimization = false;
    return;
  }

  // Place the guard GUARD_OFFSET bytes above the stack base, rounded up to a
  // page boundary so memory protection can be applied to it.
  const uintptr_t stack_guard_addr = Common::AlignUp(stack_base_addr + GUARD_OFFSET, page_size);
  // There must be room for the guard itself plus a minimum working area
  // between the guard and the current stack pointer.
  if (stack_guard_addr >= stack_middle_addr ||
      stack_middle_addr - stack_guard_addr < GUARD_SIZE + MIN_UNSAFE_STACK_SIZE)
  {
    PanicAlertFmt("Stack is too small for BLR optimization (size {:x}, base {:x}, current stack "
                  "pointer {:x}, alignment {:x})",
                  stack_size, stack_base_addr, stack_middle_addr, page_size);
    m_enable_blr_optimization = false;
    return;
  }

  m_stack_guard = reinterpret_cast<u8*>(stack_guard_addr);
  Common::ReadProtectMemory(m_stack_guard, GUARD_SIZE);
#endif
}

// Removes the guard-page protection installed by ProtectStack(), if any.
// On Windows nothing is needed here; the guard is restored separately via
// _resetstkoflw() after a fault.
void JitBase::UnprotectStack()
{
#ifndef _WIN32
  if (m_stack_guard)
  {
    Common::UnWriteProtectMemory(m_stack_guard, GUARD_SIZE);
    m_stack_guard = nullptr;
  }
#endif
}

// Called from the fault handler when an access hit the stack guard region.
// Permanently disables the BLR optimization and schedules cache cleanup at
// the next safe point. Returns true if the fault was ours and was handled.
bool JitBase::HandleStackFault()
{
  // It's possible the stack fault might have been caused by something other than
  // the BLR optimization. If the fault was triggered from another thread, or
  // when BLR optimization isn't enabled then there is nothing we can do about the fault.
  // Return false so the regular stack overflow handler can trigger (which crashes)
  if (!m_enable_blr_optimization || !Core::IsCPUThread())
    return false;

  WARN_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");

  // Unprotect first so execution can continue on the now-accessible stack.
  UnprotectStack();
  m_enable_blr_optimization = false;

  // We're going to need to clear the whole cache to get rid of the bad
  // CALLs, but we can't yet. Fake the downcount so we're forced to the
  // dispatcher (no block linking), and clear the cache so we're sent to
  // Jit. In the case of Windows, we will also need to call _resetstkoflw()
  // to reset the guard page.
  // Yeah, it's kind of gross.
  GetBlockCache()->InvalidateICache(0, 0xffffffff, true);
  Core::System::GetInstance().GetCoreTiming().ForceExceptionCheck(0);
  m_cleanup_after_stackfault = true;

  return true;
}

// Finishes recovery from a BLR stack fault once execution has reached a safe
// point: flushes all compiled code and, on Windows, restores the guard page.
// Does nothing unless a stack fault was previously recorded.
void JitBase::CleanUpAfterStackFault()
{
  if (!m_cleanup_after_stackfault)
    return;

  ClearCache();
  m_cleanup_after_stackfault = false;
#ifdef _WIN32
  // The stack is in an invalid state with no guard page, reset it.
  _resetstkoflw();
#endif
}

bool JitBase::CanMergeNextInstructions(int count) const
{
if (CPU::IsStepping() || js.instructionsLeft < count)
@@ -54,6 +54,12 @@ class JitBase : public CPUCoreBase
#endif
};

static constexpr size_t SAFE_STACK_SIZE = 256 * 1024;
static constexpr size_t MIN_UNSAFE_STACK_SIZE = 192 * 1024;
static constexpr size_t MIN_STACK_SIZE = SAFE_STACK_SIZE + MIN_UNSAFE_STACK_SIZE;
static constexpr size_t GUARD_SIZE = 64 * 1024;
static constexpr size_t GUARD_OFFSET = SAFE_STACK_SIZE - GUARD_SIZE;

struct JitOptions
{
bool enableBlocklink;
@@ -138,8 +144,17 @@ class JitBase : public CPUCoreBase
bool m_pause_on_panic_enabled = false;
bool m_accurate_cpu_cache_enabled = false;

bool m_enable_blr_optimization = false;
bool m_cleanup_after_stackfault = false;
u8* m_stack_guard = nullptr;

void RefreshConfig();

void InitBLROptimization();
void ProtectStack();
void UnprotectStack();
void CleanUpAfterStackFault();

bool CanMergeNextInstructions(int count) const;

void UpdateMemoryAndExceptionOptions();
@@ -160,7 +175,7 @@ class JitBase : public CPUCoreBase
virtual const CommonAsmRoutinesBase* GetAsmRoutines() = 0;

virtual bool HandleFault(uintptr_t access_address, SContext* ctx) = 0;
virtual bool HandleStackFault() { return false; }
bool HandleStackFault();

static constexpr std::size_t code_buffer_size = 32000;