Skip to content

Commit

Permalink
Jit: Don't use a second stack
Browse files Browse the repository at this point in the history
This second stack leads to JNI problems on Android, because ART fetches
the address and size of the original stack using pthread functions
(see GetThreadStack in art/runtime/thread.cc), and (presumably) treats
stack addresses outside of the original stack as invalid. (What I don't
understand is why some JNI operations on the CPU thread work fine
despite this but others don't.)

Instead of creating a second stack, let's borrow the approach ART uses:
Use pthread functions to find out the stack's address and size, then
install guard pages at an appropriate location. This lets us get rid
of a workaround we had in the MsgAlert function.

Because we're no longer choosing the stack size ourselves, I've made some
tweaks to where the put the guard pages. Previously we had a stack of
2 MiB and a safe zone of 512 KiB. We now accept stacks as small as 512 KiB
(used on macOS) and use a safe zone of 256 KiB. I feel like this should
be fine, but haven't done much testing beyond "it seems to work".

By the way, on Windows it was already the case that we didn't create
a second stack... But there was a bug in the implementation!
The code for protecting the stack has to run on the CPU thread, since
it's the CPU thread's stack we want to protect, but it was actually
running on EmuThread. This commit fixes that, since now this bug
matters on other operating systems too.
  • Loading branch information
JosJuice committed Feb 28, 2023
1 parent 0cdae98 commit 86c1f6e
Show file tree
Hide file tree
Showing 10 changed files with 203 additions and 121 deletions.
26 changes: 9 additions & 17 deletions Source/Android/jni/MainAndroid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,26 +195,18 @@ std::unique_ptr<GBAHostInterface> Host_CreateGBAHost(std::weak_ptr<HW::GBA::Core

static bool MsgAlert(const char* caption, const char* text, bool yes_no, Common::MsgType style)
{
// If a panic alert happens very early in the execution of a game, we can crash here with
// the error "JNI NewString called with pending exception java.lang.StackOverflowError".
// As a workaround, let's put the call on a new thread with a brand new stack.

jboolean result;

std::thread([&] {
JNIEnv* env = IDCache::GetEnvForThread();
JNIEnv* env = IDCache::GetEnvForThread();

jstring j_caption = ToJString(env, caption);
jstring j_text = ToJString(env, text);
jstring j_caption = ToJString(env, caption);
jstring j_text = ToJString(env, text);

// Execute the Java method.
result = env->CallStaticBooleanMethod(
IDCache::GetNativeLibraryClass(), IDCache::GetDisplayAlertMsg(), j_caption, j_text, yes_no,
style == Common::MsgType::Warning, s_need_nonblocking_alert_msg);
// Execute the Java method.
jboolean result = env->CallStaticBooleanMethod(
IDCache::GetNativeLibraryClass(), IDCache::GetDisplayAlertMsg(), j_caption, j_text, yes_no,
style == Common::MsgType::Warning, s_need_nonblocking_alert_msg);

env->DeleteLocalRef(j_caption);
env->DeleteLocalRef(j_text);
}).join();
env->DeleteLocalRef(j_caption);
env->DeleteLocalRef(j_text);

return result != JNI_FALSE;
}
Expand Down
36 changes: 36 additions & 0 deletions Source/Core/Common/Thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <Windows.h>
#include <processthreadsapi.h>
#else
#include <pthread.h>
#include <unistd.h>
#endif

Expand Down Expand Up @@ -185,6 +186,41 @@ void SetCurrentThreadName(const char* name)
#endif
}

std::tuple<void*, size_t> GetCurrentThreadStack()
{
void* stack_addr;
size_t stack_size;

pthread_t self = pthread_self();

#ifdef __APPLE__
stack_size = pthread_get_stacksize_np(self);
stack_addr = reinterpret_cast<u8*>(pthread_get_stackaddr_np(self)) - stack_size;
#elif defined __OpenBSD__
stack_t stack;
pthread_stackseg_np(self, &stack);

stack_addr = reinterpret_cast<u8*>(stack->ss_sp) - stack->ss_size;
stack_size = stack->ss_size;
#else
pthread_attr_t attr;

#ifdef __FreeBSD__
pthread_attr_init(&attr);
pthread_attr_get_np(self, &attr);
#else
// Linux and NetBSD
pthread_getattr_np(self, &attr);
#endif

pthread_attr_getstack(&attr, &stack_addr, &stack_size);

pthread_attr_destroy(&attr);
#endif

return std::make_tuple(stack_addr, stack_size);
}

#endif

} // namespace Common
9 changes: 9 additions & 0 deletions Source/Core/Common/Thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@

#include <thread>

#ifndef _WIN32
#include <tuple>
#endif

// Don't include Common.h here as it will break LogManager
#include "Common/CommonTypes.h"

Expand Down Expand Up @@ -35,4 +39,9 @@ inline void YieldCPU()

void SetCurrentThreadName(const char* name);

#ifndef _WIN32
// Returns the lowest address of the stack and the size of the stack
std::tuple<void*, size_t> GetCurrentThreadStack();
#endif

} // namespace Common
100 changes: 70 additions & 30 deletions Source/Core/Core/PowerPC/Jit64/Jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
#include <disasm.h>
#include <fmt/format.h>

// for the PROFILER stuff
#ifdef _WIN32
#include <windows.h>
#include <processthreadsapi.h>
#else
#include <unistd.h>
#endif

#include "Common/Align.h"
#include "Common/CommonTypes.h"
#include "Common/GekkoDisassembler.h"
#include "Common/IOFile.h"
Expand All @@ -23,6 +26,7 @@
#include "Common/PerformanceCounter.h"
#include "Common/StringUtil.h"
#include "Common/Swap.h"
#include "Common/Thread.h"
#include "Common/x64ABI.h"
#include "Core/Core.h"
#include "Core/CoreTiming.h"
Expand Down Expand Up @@ -140,15 +144,16 @@ using namespace PowerPC;
// But when windows reaches the last guard page, it raises a "Stack Overflow"
// exception which we can hook into, however by default it leaves you with less
// than 4kb of stack. So we use SetThreadStackGuarantee to trigger the Stack
// Overflow early while we still have 512kb of stack remaining.
// Overflow early while we still have 256kb of stack remaining.
// After resetting the stack to the top, we call _resetstkoflw() to restore
// the guard page at the 512kb mark.
// the guard page at the 256kb mark.

enum
{
STACK_SIZE = 2 * 1024 * 1024,
SAFE_STACK_SIZE = 512 * 1024,
GUARD_SIZE = 0x10000, // two guards - bottom (permanent) and middle (see above)
SAFE_STACK_SIZE = 256 * 1024,
MIN_UNSAFE_STACK_SIZE = 192 * 1024,
MIN_STACK_SIZE = SAFE_STACK_SIZE + MIN_UNSAFE_STACK_SIZE,
GUARD_SIZE = 64 * 1024,
GUARD_OFFSET = SAFE_STACK_SIZE - GUARD_SIZE,
};

Expand All @@ -158,27 +163,57 @@ Jit64::Jit64() : QuantizedMemoryRoutines(*this)

Jit64::~Jit64() = default;

void Jit64::AllocStack()
void Jit64::ProtectStack()
{
#ifndef _WIN32
m_stack = static_cast<u8*>(Common::AllocateMemoryPages(STACK_SIZE));
Common::ReadProtectMemory(m_stack, GUARD_SIZE);
Common::ReadProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
#else
// For windows we just keep using the system stack and reserve a large amount of memory at the end
// of the stack.
if (!m_enable_blr_optimization)
return;

#ifdef _WIN32
ULONG reserveSize = SAFE_STACK_SIZE;
SetThreadStackGuarantee(&reserveSize);
#else
auto [stack_addr, stack_size] = Common::GetCurrentThreadStack();

const uintptr_t stack_base_addr = reinterpret_cast<uintptr_t>(stack_addr);
const uintptr_t stack_middle_addr = reinterpret_cast<uintptr_t>(&stack_addr);
if (stack_middle_addr < stack_base_addr || stack_middle_addr >= stack_base_addr + stack_size)
{
PanicAlertFmt("Failed to get correct stack base");
m_enable_blr_optimization = false;
return;
}

const long page_size = sysconf(_SC_PAGESIZE);
if (page_size <= 0)
{
PanicAlertFmt("Failed to get page size");
m_enable_blr_optimization = false;
return;
}

const uintptr_t stack_guard_addr = Common::AlignUp(stack_base_addr + GUARD_OFFSET, page_size);
if (stack_guard_addr >= stack_middle_addr ||
stack_middle_addr - stack_guard_addr < GUARD_SIZE + MIN_UNSAFE_STACK_SIZE)
{
PanicAlertFmt("Stack is too small for BLR optimization (size {:x}, base {:x}, current stack "
"pointer {:x}, alignment {:x})",
stack_size, stack_base_addr, stack_middle_addr, page_size);
m_enable_blr_optimization = false;
return;
}

m_stack_guard = reinterpret_cast<u8*>(stack_guard_addr);
Common::ReadProtectMemory(m_stack_guard, GUARD_SIZE);
#endif
}

void Jit64::FreeStack()
void Jit64::UnprotectStack()
{
#ifndef _WIN32
if (m_stack)
if (m_stack_guard)
{
Common::FreeMemoryPages(m_stack, STACK_SIZE);
m_stack = nullptr;
Common::UnWriteProtectMemory(m_stack_guard, GUARD_SIZE);
m_stack_guard = nullptr;
}
#endif
}
Expand All @@ -194,11 +229,9 @@ bool Jit64::HandleStackFault()

WARN_LOG_FMT(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");

UnprotectStack();
m_enable_blr_optimization = false;
#ifndef _WIN32
// Windows does this automatically.
Common::UnWriteProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
#endif

// We're going to need to clear the whole cache to get rid of the bad
// CALLs, but we can't yet. Fake the downcount so we're forced to the
// dispatcher (no block linking), and clear the cache so we're sent to
Expand All @@ -214,11 +247,13 @@ bool Jit64::HandleStackFault()

bool Jit64::HandleFault(uintptr_t access_address, SContext* ctx)
{
uintptr_t stack = (uintptr_t)m_stack;
uintptr_t diff = access_address - stack;
const uintptr_t stack_guard = reinterpret_cast<uintptr_t>(m_stack_guard);
// In the trap region?
if (m_enable_blr_optimization && diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE)
if (m_enable_blr_optimization && access_address >= stack_guard &&
access_address < stack_guard + GUARD_SIZE)
{
return HandleStackFault();
}

// This generates some fairly heavy trampolines, but it doesn't really hurt.
// Only instructions that access I/O will get these, and there won't be that
Expand Down Expand Up @@ -370,12 +405,10 @@ void Jit64::Init()
m_enable_blr_optimization = jo.enableBlocklink && m_fastmem_enabled && !m_enable_debugging;
m_cleanup_after_stackfault = false;

m_stack = nullptr;
if (m_enable_blr_optimization)
AllocStack();
m_stack_guard = nullptr;

blocks.Init();
asm_routines.Init(m_stack ? (m_stack + STACK_SIZE) : nullptr);
asm_routines.Init();

// important: do this *after* generating the global asm routines, because we can't use farcode in
// them.
Expand Down Expand Up @@ -415,7 +448,6 @@ void Jit64::ResetFreeMemoryRanges()

void Jit64::Shutdown()
{
FreeStack();
FreeCodeSpace();

auto& system = Core::System::GetInstance();
Expand Down Expand Up @@ -735,14 +767,22 @@ void Jit64::WriteExternalExceptionExit()

void Jit64::Run()
{
ProtectStack();

CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code;
pExecAddr();

UnprotectStack();
}

void Jit64::SingleStep()
{
ProtectStack();

CompiledCode pExecAddr = (CompiledCode)asm_routines.enter_code;
pExecAddr();

UnprotectStack();
}

void Jit64::Trace()
Expand Down
6 changes: 3 additions & 3 deletions Source/Core/Core/PowerPC/Jit64/Jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ class Jit64 : public JitBase, public QuantizedMemoryRoutines

bool HandleFunctionHooking(u32 address);

void AllocStack();
void FreeStack();
void ProtectStack();
void UnprotectStack();

void ResetFreeMemoryRanges();

Expand All @@ -270,7 +270,7 @@ class Jit64 : public JitBase, public QuantizedMemoryRoutines

bool m_enable_blr_optimization = false;
bool m_cleanup_after_stackfault = false;
u8* m_stack = nullptr;
u8* m_stack_guard = nullptr;

HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far;
Expand Down
28 changes: 6 additions & 22 deletions Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@ Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(j
{
}

void Jit64AsmRoutineManager::Init(u8* stack_top)
void Jit64AsmRoutineManager::Init()
{
m_const_pool.Init(AllocChildCodeSpace(4096), 4096);
m_stack_top = stack_top;
Generate();
WriteProtect();
}
Expand All @@ -50,17 +49,8 @@ void Jit64AsmRoutineManager::Generate()
// MOV(64, R(RMEM), Imm64((u64)Memory::physical_base));
MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80));

if (m_stack_top)
{
// Pivot the stack to our custom one.
MOV(64, R(RSCRATCH), R(RSP));
MOV(64, R(RSP), ImmPtr(m_stack_top - 0x20));
MOV(64, MDisp(RSP, 0x18), R(RSCRATCH));
}
else
{
MOV(64, PPCSTATE(stored_stack_pointer), R(RSP));
}
MOV(64, PPCSTATE(stored_stack_pointer), R(RSP));

// something that can't pass the BLR test
MOV(64, MDisp(RSP, 8), Imm32((u32)-1));

Expand Down Expand Up @@ -209,12 +199,9 @@ void Jit64AsmRoutineManager::Generate()
if (enable_debugging)
SetJumpTarget(dbg_exit);

// Reset the stack pointer, since the BLR optimization may have pushed things onto the stack
// without popping them.
ResetStack(*this);
if (m_stack_top)
{
ADD(64, R(RSP), Imm8(0x18));
POP(RSP);
}

ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
RET();
Expand All @@ -226,10 +213,7 @@ void Jit64AsmRoutineManager::Generate()

void Jit64AsmRoutineManager::ResetStack(X64CodeBlock& emitter)
{
if (m_stack_top)
emitter.MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x20));
else
emitter.MOV(64, R(RSP), PPCSTATE(stored_stack_pointer));
emitter.MOV(64, R(RSP), PPCSTATE(stored_stack_pointer));
}

void Jit64AsmRoutineManager::GenerateCommon()
Expand Down
3 changes: 1 addition & 2 deletions Source/Core/Core/PowerPC/Jit64/JitAsm.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,13 @@ class Jit64AsmRoutineManager : public CommonAsmRoutines

explicit Jit64AsmRoutineManager(Jit64& jit);

void Init(u8* stack_top);
void Init();

void ResetStack(Gen::X64CodeBlock& emitter);

private:
void Generate();
void GenerateCommon();

u8* m_stack_top = nullptr;
JitBase& m_jit;
};
Loading

0 comments on commit 86c1f6e

Please sign in to comment.