Skip to content
Permalink
Browse files
Merge pull request #11399 from JosJuice/jit-one-stack
Jit: Don't use a second stack
  • Loading branch information
JosJuice committed Mar 3, 2023
2 parents a93b5a4 + b6256a5 commit 95ce41a
Show file tree
Hide file tree
Showing 12 changed files with 248 additions and 258 deletions.
@@ -195,26 +195,18 @@ std::unique_ptr<GBAHostInterface> Host_CreateGBAHost(std::weak_ptr<HW::GBA::Core

static bool MsgAlert(const char* caption, const char* text, bool yes_no, Common::MsgType style)
{
// If a panic alert happens very early in the execution of a game, we can crash here with
// the error "JNI NewString called with pending exception java.lang.StackOverflowError".
// As a workaround, let's put the call on a new thread with a brand new stack.

jboolean result;

std::thread([&] {
JNIEnv* env = IDCache::GetEnvForThread();
JNIEnv* env = IDCache::GetEnvForThread();

jstring j_caption = ToJString(env, caption);
jstring j_text = ToJString(env, text);
jstring j_caption = ToJString(env, caption);
jstring j_text = ToJString(env, text);

// Execute the Java method.
result = env->CallStaticBooleanMethod(
IDCache::GetNativeLibraryClass(), IDCache::GetDisplayAlertMsg(), j_caption, j_text, yes_no,
style == Common::MsgType::Warning, s_need_nonblocking_alert_msg);
// Execute the Java method.
jboolean result = env->CallStaticBooleanMethod(
IDCache::GetNativeLibraryClass(), IDCache::GetDisplayAlertMsg(), j_caption, j_text, yes_no,
style == Common::MsgType::Warning, s_need_nonblocking_alert_msg);

env->DeleteLocalRef(j_caption);
env->DeleteLocalRef(j_text);
}).join();
env->DeleteLocalRef(j_caption);
env->DeleteLocalRef(j_text);

return result != JNI_FALSE;
}
@@ -7,6 +7,7 @@
#include <Windows.h>
#include <processthreadsapi.h>
#else
#include <pthread.h>
#include <unistd.h>
#endif

@@ -185,6 +186,41 @@ void SetCurrentThreadName(const char* name)
#endif
}

// Returns a (base, size) tuple describing the calling thread's stack: the lowest address of the
// stack and the size of the stack.
//
// There is no portable pthread call for this, so each platform goes through its own
// non-standard extension:
//  - macOS:            pthread_get_stackaddr_np() / pthread_get_stacksize_np()
//  - OpenBSD:          pthread_stackseg_np()
//  - FreeBSD:          pthread_attr_get_np() + pthread_attr_getstack()
//  - Linux and NetBSD: pthread_getattr_np() + pthread_attr_getstack()
std::tuple<void*, size_t> GetCurrentThreadStack()
{
  // Initialized so that the result is well-defined even if the platform query below leaves
  // the outputs untouched.
  void* stack_addr = nullptr;
  size_t stack_size = 0;

  pthread_t self = pthread_self();

#ifdef __APPLE__
  stack_size = pthread_get_stacksize_np(self);
  // The size is subtracted from the reported address to obtain the lowest address of the stack.
  stack_addr = reinterpret_cast<u8*>(pthread_get_stackaddr_np(self)) - stack_size;
#elif defined __OpenBSD__
  // `stack` is a stack_t object (not a pointer), so its fields are accessed with `.`.
  stack_t stack;
  pthread_stackseg_np(self, &stack);

  // As on macOS, the size is subtracted from the reported address to get the lowest address.
  stack_addr = reinterpret_cast<u8*>(stack.ss_sp) - stack.ss_size;
  stack_size = stack.ss_size;
#else
  pthread_attr_t attr;

#ifdef __FreeBSD__
  // FreeBSD's variant fills in an attribute object that the caller has already initialized.
  pthread_attr_init(&attr);
  pthread_attr_get_np(self, &attr);
#else
  // Linux and NetBSD
  pthread_getattr_np(self, &attr);
#endif

  pthread_attr_getstack(&attr, &stack_addr, &stack_size);

  // The attribute object is only needed for the query above; release it again.
  pthread_attr_destroy(&attr);
#endif

  return std::make_tuple(stack_addr, stack_size);
}

#endif

} // namespace Common
@@ -5,6 +5,10 @@

#include <thread>

#ifndef _WIN32
#include <tuple>
#endif

// Don't include Common.h here as it will break LogManager
#include "Common/CommonTypes.h"

@@ -35,4 +39,9 @@ inline void YieldCPU()

void SetCurrentThreadName(const char* name);

#ifndef _WIN32
// Returns a (base, size) tuple for the calling thread's stack: the lowest address of the stack
// and the size of the stack. This declaration is compiled out on Windows builds.
std::tuple<void*, size_t> GetCurrentThreadStack();
#endif

} // namespace Common
@@ -19,7 +19,6 @@
#include "Common/GekkoDisassembler.h"
#include "Common/IOFile.h"
#include "Common/Logging/Log.h"
#include "Common/MemoryUtil.h"
#include "Common/PerformanceCounter.h"
#include "Common/StringUtil.h"
#include "Common/Swap.h"
@@ -117,108 +116,21 @@ using namespace PowerPC;
and such, but it's currently limited to integer ops only. This can definitely be made better.
*/

// The BLR optimization is nice, but it means that JITted code can overflow the
// native stack by repeatedly running BL. (The chance of this happening in any
// retail game is close to 0, but correctness is correctness...) Also, the
// overflow might not happen directly in the JITted code but in a C++ function
// called from it, so we can't just adjust RSP in the case of a fault.
// Instead, we have to have extra stack space preallocated under the fault
// point which allows the code to continue, after wiping the JIT cache so we
// can reset things at a safe point. Once this condition trips, the
// optimization is permanently disabled, under the assumption this will never
// happen in practice.

// On Unix, we just mark an appropriate region of the stack as PROT_NONE and
// handle it the same way as fastmem faults. It's safe to take a fault with a
// bad RSP, because on Linux we can use sigaltstack and on OS X we're already
// on a separate thread.

// Windows is... under-documented.
// It already puts guard pages so it can automatically grow the stack and it
// doesn't look like there is a way to hook into a guard page fault and implement
// our own logic.
// But when Windows reaches the last guard page, it raises a "Stack Overflow"
// exception which we can hook into. By default, however, that leaves you with less
// than 4 KiB of stack, so we use SetThreadStackGuarantee to trigger the Stack
// Overflow early while we still have 512 KiB of stack remaining.
// After resetting the stack to the top, we call _resetstkoflw() to restore
// the guard page at the 512 KiB mark.

// Layout constants for the separately allocated JIT stack (all values are byte counts).
enum
{
  // Total size of the allocation made for the stack: 2 MiB.
  STACK_SIZE = 2 * 1024 * 1024,
  // Space kept in reserve for continuing after a stack fault: 512 KiB.
  SAFE_STACK_SIZE = 512 * 1024,
  GUARD_SIZE = 0x10000,  // two guards - bottom (permanent) and middle (see above)
  // Offset of the middle guard from the start of the allocation; it sits directly below the
  // SAFE_STACK_SIZE bytes at the end of the allocation.
  GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE,
};

// Constructs the JIT; the QuantizedMemoryRoutines base is handed a reference to this object.
Jit64::Jit64() : QuantizedMemoryRoutines(*this)
{
}

// Defaulted destructor, defined out of line.
Jit64::~Jit64() = default;

void Jit64::AllocStack()
{
#ifndef _WIN32
m_stack = static_cast<u8*>(Common::AllocateMemoryPages(STACK_SIZE));
Common::ReadProtectMemory(m_stack, GUARD_SIZE);
Common::ReadProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
#else
// For windows we just keep using the system stack and reserve a large amount of memory at the end
// of the stack.
ULONG reserveSize = SAFE_STACK_SIZE;
SetThreadStackGuarantee(&reserveSize);
#endif
}

// Releases the stack allocated by AllocStack(), if there is one. Does nothing on Windows,
// where no separate stack is allocated.
void Jit64::FreeStack()
{
#ifndef _WIN32
  // Nothing was allocated (or it has already been freed).
  if (m_stack == nullptr)
    return;

  Common::FreeMemoryPages(m_stack, STACK_SIZE);
  m_stack = nullptr;
#endif
}

// Reacts to a fault in the stack guard region by permanently turning off the BLR optimization
// and scheduling a full JIT cache clear.
// Returns true if the fault was handled here, false if it must be left to the regular stack
// overflow handler.
bool Jit64::HandleStackFault()
{
  // The fault may have a cause other than the BLR optimization. If it came from a thread other
  // than the CPU thread, or the optimization is not enabled, there is nothing to be done here:
  // report it as unhandled so the regular stack overflow handler can trigger (which crashes).
  const bool recoverable = m_enable_blr_optimization && Core::IsCPUThread();
  if (!recoverable)
    return false;

  WARN_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");

  m_enable_blr_optimization = false;
#ifndef _WIN32
  // Lift the protection on the middle guard region. (Windows does this automatically.)
  Common::UnWriteProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
#endif

  // The whole cache has to be cleared to get rid of the bad CALLs, but that can't be done
  // yet. For now, clear the cache so we're sent to Jit, and fake the downcount so we're
  // forced to the dispatcher (no block linking). On Windows, _resetstkoflw() will also have
  // to be called to reset the guard page.
  // Yeah, it's kind of gross.
  GetBlockCache()->InvalidateICache(0, 0xffffffff, true);
  Core::System::GetInstance().GetCoreTiming().ForceExceptionCheck(0);

  // Flag the deferred cleanup as pending.
  m_cleanup_after_stackfault = true;

  return true;
}

bool Jit64::HandleFault(uintptr_t access_address, SContext* ctx)
{
uintptr_t stack = (uintptr_t)m_stack;
uintptr_t diff = access_address - stack;
const uintptr_t stack_guard = reinterpret_cast<uintptr_t>(m_stack_guard);
// In the trap region?
if (m_enable_blr_optimization && diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE)
if (m_enable_blr_optimization && access_address >= stack_guard &&
access_address < stack_guard + GUARD_SIZE)
{
return HandleStackFault();
}

// This generates some fairly heavy trampolines, but it doesn't really hurt.
// Only instructions that access I/O will get these, and there won't be that
@@ -365,17 +277,10 @@ void Jit64::Init()
m_const_pool.Init(AllocChildCodeSpace(constpool_size), constpool_size);
ResetCodePtr();

// BLR optimization has the same consequences as block linking, as well as
// depending on the fault handler to be safe in the event of excessive BL.
m_enable_blr_optimization = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging;
m_cleanup_after_stackfault = false;

m_stack = nullptr;
if (m_enable_blr_optimization)
AllocStack();
m_stack_guard = nullptr;

blocks.Init();
asm_routines.Init(m_stack ? (m_stack + STACK_SIZE) : nullptr);
asm_routines.Init();

// important: do this *after* generating the global asm routines, because we can't use farcode in
// them.
@@ -415,7 +320,6 @@ void Jit64::ResetFreeMemoryRanges()

void Jit64::Shutdown()
{
FreeStack();
FreeCodeSpace();

auto& system = Core::System::GetInstance();
@@ -735,14 +639,22 @@ void Jit64::WriteExternalExceptionExit()

void Jit64::Run()
{
ProtectStack();

CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code;
pExecAddr();

UnprotectStack();
}

void Jit64::SingleStep()
{
ProtectStack();

CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code;
pExecAddr();

UnprotectStack();
}

void Jit64::Trace()
@@ -779,15 +691,7 @@ void Jit64::Jit(u32 em_address)

void Jit64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
{
if (m_cleanup_after_stackfault)
{
ClearCache();
m_cleanup_after_stackfault = false;
#ifdef _WIN32
// The stack is in an invalid state with no guard page, reset it.
_resetstkoflw();
#endif
}
CleanUpAfterStackFault();

if (trampolines.IsAlmostFull() || SConfig::GetInstance().bJITNoBlockCache)
{
@@ -50,7 +50,6 @@ class Jit64 : public JitBase, public QuantizedMemoryRoutines
void Shutdown() override;

bool HandleFault(uintptr_t access_address, SContext* ctx) override;
bool HandleStackFault() override;
bool BackPatch(SContext* ctx);

void EnableOptimization();
@@ -255,9 +254,6 @@ class Jit64 : public JitBase, public QuantizedMemoryRoutines

bool HandleFunctionHooking(u32 address);

void AllocStack();
void FreeStack();

void ResetFreeMemoryRanges();

JitBlockCache blocks{*this};
@@ -268,10 +264,6 @@ class Jit64 : public JitBase, public QuantizedMemoryRoutines

Jit64AsmRoutineManager asm_routines{*this};

bool m_enable_blr_optimization = false;
bool m_cleanup_after_stackfault = false;
u8* m_stack = nullptr;

HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far;
};
@@ -24,10 +24,9 @@ Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(j
{
}

void Jit64AsmRoutineManager::Init(u8* stack_top)
void Jit64AsmRoutineManager::Init()
{
m_const_pool.Init(AllocChildCodeSpace(4096), 4096);
m_stack_top = stack_top;
Generate();
WriteProtect();
}
@@ -50,17 +49,8 @@ void Jit64AsmRoutineManager::Generate()
// MOV(64, R(RMEM), Imm64((u64)Memory::physical_base));
MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80));

if (m_stack_top)
{
// Pivot the stack to our custom one.
MOV(64, R(RSCRATCH), R(RSP));
MOV(64, R(RSP), ImmPtr(m_stack_top - 0x20));
MOV(64, MDisp(RSP, 0x18), R(RSCRATCH));
}
else
{
MOV(64, PPCSTATE(stored_stack_pointer), R(RSP));
}
MOV(64, PPCSTATE(stored_stack_pointer), R(RSP));

// something that can't pass the BLR test
MOV(64, MDisp(RSP, 8), Imm32((u32)-1));

@@ -209,12 +199,9 @@ void Jit64AsmRoutineManager::Generate()
if (enable_debugging)
SetJumpTarget(dbg_exit);

// Reset the stack pointer, since the BLR optimization may have pushed things onto the stack
// without popping them.
ResetStack(*this);
if (m_stack_top)
{
ADD(64, R(RSP), Imm8(0x18));
POP(RSP);
}

ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
RET();
@@ -226,10 +213,7 @@ void Jit64AsmRoutineManager::Generate()

void Jit64AsmRoutineManager::ResetStack(X64CodeBlock& emitter)
{
if (m_stack_top)
emitter.MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x20));
else
emitter.MOV(64, R(RSP), PPCSTATE(stored_stack_pointer));
emitter.MOV(64, R(RSP), PPCSTATE(stored_stack_pointer));
}

void Jit64AsmRoutineManager::GenerateCommon()

0 comments on commit 95ce41a

Please sign in to comment.