Skip to content

Commit

Permalink
Add EBREAK live-patching support, execute JIT-segments undecoded
Browse files Browse the repository at this point in the history
This allows NodeJS to start 2x faster, and allows using faster decoding for the main segment
  • Loading branch information
fwsGonzo committed Jul 12, 2024
1 parent 0ede592 commit 822fb8d
Show file tree
Hide file tree
Showing 10 changed files with 247 additions and 71 deletions.
53 changes: 32 additions & 21 deletions emulator/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
static inline std::vector<uint8_t> load_file(const std::string&);
static constexpr uint64_t MAX_MEMORY = (riscv::encompassing_Nbit_arena == 0) ? uint64_t(4000) << 20 : uint64_t(1) << riscv::encompassing_Nbit_arena;
static const std::string DYNAMIC_LINKER = "/usr/riscv64-linux-gnu/lib/ld-linux-riscv64-lp64d.so.1";
//#define NODEJS_WORKAROUND
#define NODEJS_WORKAROUND

struct Arguments {
bool verbose = false;
Expand Down Expand Up @@ -224,6 +224,11 @@ static void run_program(
.ignore_text_section = cli_args.ignore_text,
.verbose_loader = cli_args.verbose,
.use_shared_execute_segments = false, // We are only creating one machine, disabling this can enable some optimizations
#ifdef NODEJS_WORKAROUND
.ebreak_locations = {
"pthread_rwlock_rdlock", "pthread_rwlock_wrlock" // Live-patch locations
},
#endif
#ifdef RISCV_BINARY_TRANSLATION
.translate_enabled = !cli_args.no_translate,
.translate_future_segments = cli_args.translate_future,
Expand Down Expand Up @@ -437,30 +442,36 @@ static void run_program(
machine.set_max_instructions(~0ULL);
machine.cpu.simulate_precise();
} else {
#ifdef NODEJS_WORKAROUND
// In order to get NodeJS to work we need to live-patch deadlocked rwlocks
// This is a temporary workaround until the issue is found and fixed.
static const auto rw_rdlock = machine.address_of("pthread_rwlock_rdlock");
static const auto rw_wrlock = machine.address_of("pthread_rwlock_wrlock");
machine.install_syscall_handler(riscv::SYSCALL_EBREAK,
[] (auto& machine)
{
auto& cpu = machine.cpu;
auto pc = cpu.pc();
if (pc == rw_rdlock || pc == rw_wrlock) {
// Execute 2 instructions and step over them
cpu.step_one(false);
cpu.step_one(false);
// Check for deadlock
if(cpu.reg(14) == cpu.reg(15)) {
// Deadlock detected, avoid branch (beq a4, a5) and reset the lock
cpu.reg(14) = 0xFF;
machine.memory.template write<uint32_t>(cpu.reg(10), 0);
}
} else {
throw riscv::MachineException(riscv::UNHANDLED_SYSCALL, "EBREAK instruction", pc);
}
});
#endif // NODEJS_WORKAROUND

// Normal RISC-V simulation
if (cli_args.accurate)
machine.simulate(cli_args.fuel);
else {
#ifdef NODEJS_WORKAROUND
const auto rw_rdlock = machine.address_of("pthread_rwlock_rdlock");
const auto rw_wrlock = machine.address_of("pthread_rwlock_wrlock");
const auto rw_unlock = machine.address_of("pthread_rwlock_unlock");

machine.set_max_instructions(cli_args.fuel);
do {
// Whenever we hit rwlock, step twice and check if it's a deadlock
if (machine.cpu.pc() == rw_rdlock || machine.cpu.pc() == rw_wrlock) {
machine.cpu.step_one();
machine.cpu.step_one();
if(machine.cpu.reg(14) == machine.cpu.reg(15)) {
// Deadlock detected, avoid branch (beq a4, a5) and reset the lock
machine.cpu.reg(14) = 0xFF;
machine.memory.template write<uint32_t>(machine.cpu.reg(10), 0);
}
}
machine.cpu.step_one();
} while (!machine.stopped());
#endif // NODEJS_WORKAROUND
#ifdef RISCV_TIMED_VMCALLS
// Simulation with experimental timeout
machine.execute_with_timeout(360.0f, machine.cpu.pc());
Expand Down
4 changes: 4 additions & 0 deletions lib/libriscv/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ namespace riscv
/// @brief Provide a custom page-fault handler at construction.
riscv::Function<struct Page&(Memory<W>&, address_type<W>, bool)> page_fault_handler = nullptr;

/// @brief Call ebreak for each of the addresses in the vector.
/// @details This is useful for debugging and live-patching programs.
std::vector<std::variant<address_type<W>, std::string>> ebreak_locations {};

#ifdef RISCV_BINARY_TRANSLATION
/// @brief Enable the binary translator.
bool translate_enabled = true;
Expand Down
136 changes: 104 additions & 32 deletions lib/libriscv/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "instruction_counter.hpp"
#include "riscvbase.hpp"
#include "rv32i_instr.hpp"
#include "threaded_bytecodes.hpp"

namespace riscv
{
Expand Down Expand Up @@ -115,10 +116,8 @@ namespace riscv
// Find previously decoded execute segment,
// but skip segments tagged as likely JIT-compiled (as they are likely to be stale)
this->m_exec = machine().memory.exec_segment_for(pc).get();
if (LIKELY(!this->m_exec->empty() && !this->m_exec->is_likely_jit())) {
if (LIKELY(!this->m_exec->empty())) {
return {this->m_exec, pc};
} else if (this->m_exec->is_likely_jit()) {
machine().memory.evict_execute_segment(*this->m_exec);
}

// Find decoded execute segment via override
Expand Down Expand Up @@ -151,18 +150,34 @@ namespace riscv
if (UNLIKELY(end_pageno <= base_pageno))
throw MachineException(INVALID_PROGRAM, "Failed to create execute segment");
const size_t n_pages = end_pageno - base_pageno;
std::unique_ptr<uint8_t[]> area (new uint8_t[n_pages * Page::size()]);
thread_local std::vector<uint8_t> area;
area.resize(n_pages * Page::size());
// Copy from each individual page
for (address_t p = base_pageno; p < end_pageno; p++) {
// Cannot use get_exec_pageno here as we may need
// access to read fault handler.
auto& page = machine().memory.get_pageno(p);
const size_t offset = (p - base_pageno) * Page::size();
std::memcpy(area.get() + offset, page.data(), Page::size());
std::memcpy(area.data() + offset, page.data(), Page::size());
}

// Check for write + execute
if (UNLIKELY(current_page.attr.write && current_page.attr.exec))
{
// This is a JIT-compiled page, we need to execute it directly
const address_t basepc = base_pageno * Page::size();
const address_t endpc = basepc + n_pages * Page::size();
this->simulate_precise_single_segment(area.data(), basepc, endpc, pc);
pc = this->pc();

if (UNLIKELY(++restarts == MAX_RESTARTS))
trigger_exception(EXECUTION_LOOP_DETECTED, pc);

goto restart_next_execute_segment;
}

// Decode and store it for later
return {&this->init_execute_area(area.get(), base_pageno * Page::size(), n_pages * Page::size()), pc};
return {&this->init_execute_area(area.data(), base_pageno * Page::size(), n_pages * Page::size()), pc};
} // CPU::next_execute_segment

template <int W> RISCV_NOINLINE RISCV_INTERNAL
Expand Down Expand Up @@ -219,6 +234,57 @@ namespace riscv
return read_next_instruction_slowpath();
}

/// @brief Fetch the raw instruction word stored at a given PC.
/// @tparam W Selects the address type used for @p pc.
/// @param exec_seg_data Byte buffer that is indexed directly by @p pc
///        (callers pass a pointer already rebased by the segment start).
/// @param pc Address of the instruction to fetch.
/// @return The instruction built from the 32-bit value found at
///         `exec_seg_data[pc]`.
/// @note No bounds checking is performed here; a full 32-bit load is
///       always issued, even if the instruction is a compressed one.
template <int W>
static inline rv32i_instruction decode_safely(const uint8_t* exec_seg_data, address_type<W> pc)
{
	// Where the instruction bytes live inside the PC-indexed buffer
	uint8_t* const location = const_cast<uint8_t*>(&exec_seg_data[pc]);

#if defined(RISCV_EXT_COMPRESSED) && !defined(__x86_64__)
	// With the C-extension instructions may be unaligned, so the load
	// goes through UnderAlign32 instead of a plain 32-bit access.
	return rv32i_instruction { *reinterpret_cast<UnderAlign32*>(location) };
#else
	// Either the C-extension is off, or this is amd64 where the cost of
	// a possibly unaligned 32-bit load is accepted because it is faster.
	return rv32i_instruction { *reinterpret_cast<uint32_t*>(location) };
#endif
}

/// @brief Execute instructions one at a time, directly from a raw byte
///        buffer, until the PC leaves [begin_pc, end_pc).
/// @param exec_seg_data Buffer whose first byte corresponds to @p begin_pc.
/// @param begin_pc First address covered by the buffer.
/// @param end_pc   One past the last address covered by the buffer.
/// @param pc       Address at which execution starts; written to the PC
///                 register before the first instruction is executed.
/// @details Two loops exist. When the instruction counter is 0 and the
///          maximum is 1, the counter is left untouched and the loop only
///          ends once the PC is outside the range. Otherwise the counter
///          is incremented once per instruction and the loop additionally
///          ends when the counter reaches the maximum.
/// @note NOTE(review): decode_safely() always loads 4 bytes. If the PC is
///       within the last 2 bytes before @p end_pc, that load extends past
///       @p end_pc -- confirm that callers provide a buffer with enough
///       slack, or that this position is unreachable.
template<int W> RISCV_HOT_PATH()
void CPU<W>::simulate_precise_single_segment(
	const uint8_t* exec_seg_data, address_t begin_pc, address_t end_pc, address_t pc)
{
	// Rebase the buffer so that it can be indexed by PC directly
	exec_seg_data -= begin_pc;
	this->registers().pc = pc;

	// Inaccurate simulation doesn't use instruction counting
	const bool counting_disabled =
		machine().instruction_counter() == 0 && machine().max_instructions() == 1;
	if (counting_disabled)
	{
		while (pc >= begin_pc && pc < end_pc)
		{
			auto instr = decode_safely<W>(exec_seg_data, pc);
			this->execute(instr);

			// Step past the instruction and pick up any jump it performed
			registers().pc += instr.length();
			pc = registers().pc;
		}
		return;
	}

	// Counted simulation: one counter tick per executed instruction
	while (machine().instruction_counter() < machine().max_instructions())
	{
		const auto current_pc = this->pc();
		const bool inside_segment = (current_pc >= begin_pc && current_pc < end_pc);
		if (UNLIKELY(!inside_segment))
			return;

		auto instr = decode_safely<W>(exec_seg_data, current_pc);
		this->execute(instr);

		// Advance the PC by the instruction size
		if constexpr (compressed_enabled)
			registers().pc += instr.length();
		else
			registers().pc += 4;

		machine().increment_counter(1);
	} // until the instruction limit is reached

} // CPU::simulate_precise_single_segment

template<int W> RISCV_HOT_PATH()
void CPU<W>::simulate_precise()
{
Expand All @@ -230,13 +296,11 @@ namespace riscv

auto* exec = this->m_exec;
restart_precise_sim:
auto* exec_decoder = exec->decoder_cache();
auto* exec_seg_data = exec->exec_data();

for (; machine().instruction_counter() < machine().max_instructions();
machine().increment_counter(1)) {

format_t instruction;
auto pc = this->pc();

// TODO: This can be made much faster
Expand All @@ -249,28 +313,8 @@ namespace riscv
goto restart_precise_sim;
}

// Instructions may be unaligned with C-extension
// On amd64 we take the cost, because it's faster
# if defined(RISCV_EXT_COMPRESSED) && !defined(__x86_64__)
instruction = format_t { *(UnderAlign32*) &exec_seg_data[pc] };
# else // aligned/unaligned loads
instruction = format_t { *(uint32_t*) &exec_seg_data[pc] };
# endif // aligned/unaligned loads

constexpr bool enable_cache =
!binary_translation_enabled;

if constexpr (enable_cache)
{
// Retrieve handler directly from the instruction handler cache
auto& cache_entry =
exec_decoder[pc / DecoderCache<W>::DIVISOR];
cache_entry.execute(*this, instruction);
}
else // Not the slowest path, since we have the instruction already
{
this->execute(instruction);
}
auto instruction = decode_safely<W>(exec_seg_data, pc);
this->execute(instruction);

// increment PC
if constexpr (compressed_enabled)
Expand All @@ -282,7 +326,7 @@ namespace riscv
} // CPU::simulate_precise

template<int W>
void CPU<W>::step_one()
void CPU<W>::step_one(bool use_instruction_counter)
{
// Read, decode & execute instructions directly
auto instruction = this->read_next_instruction();
Expand All @@ -293,7 +337,7 @@ namespace riscv
else
registers().pc += 4;

machine().increment_counter(1);
machine().increment_counter(use_instruction_counter ? 1 : 0);
}

template<int W>
Expand Down Expand Up @@ -324,6 +368,34 @@ namespace riscv
return retval;
}

/// @brief Replace the decoder cache entry for an address with EBREAK.
/// @param addr Address whose decoder cache entry is overwritten. The entry
///        is selected by `addr / DecoderCache<W>::DIVISOR`.
/// @return The instruction bits that were stored in the entry before it
///         was overwritten. The entry's previous bytecode is not returned.
/// @details If @p addr is not reported as executable, next_execute_segment()
///          is invoked for it first. The entry is then looked up through
///          `m_exec` (presumably updated by that call -- TODO confirm).
template <int W>
uint32_t CPU<W>::install_ebreak_at(address_t addr)
{
	// Assemble the replacement instruction: SYSTEM opcode with imm = 1
	rv32i_instruction ebreak;
	ebreak.Itype.opcode = 0b1110011; // SYSTEM
	ebreak.Itype.rd     = 0;
	ebreak.Itype.funct3 = 0b000;
	ebreak.Itype.rs1    = 0;
	ebreak.Itype.imm    = 1; // EBREAK

	// Bring in an execute segment for the address when needed
	if (!is_executable(addr))
		this->next_execute_segment(addr);

	// Decoder cache slot that will trap when execution reaches it
	auto& slot = this->m_exec->decoder_cache()[addr / DecoderCache<W>::DIVISOR];
	slot.set_bytecode(RV32I_BC_SYSTEM);

	// Swap in the EBREAK and hand back what was there before
	const auto previous_bits = slot.instr;
	slot.instr = ebreak.whole;
	return previous_bits;
}

template<int W> RISCV_COLD_PATH()
void CPU<W>::trigger_exception(int intr, address_t data)
{
Expand Down
12 changes: 10 additions & 2 deletions lib/libriscv/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,16 @@ namespace riscv
void simulate_inaccurate(address_t pc);

// Step precisely one instruction forward from current PC.
void step_one();
void step_one(bool use_instruction_counter = true);

// Executes one instruction at a time, and can stop at
/// @brief Executes one instruction at a time, and can stop at
/// any instruction. Can be used for debugging.
void simulate_precise();

/// @brief Executes one instruction at a time, until the execute
/// segment is left. Used to execute JIT-compiled code.
void simulate_precise_single_segment(const uint8_t* exec_seg, address_t begin_pc, address_t end_pc, address_t pc);

/// @brief Get the current PC
/// @return The current PC address
address_t pc() const noexcept { return registers().pc; }
Expand Down Expand Up @@ -137,6 +141,10 @@ namespace riscv
static std::shared_ptr<DecodedExecuteSegment<W>>& empty_execute_segment() noexcept;
bool is_executable(address_t addr) const noexcept;

//-- Debugging functions --//
/// @brief Install a breakpoint at a specific address, returning the old instruction
uint32_t install_ebreak_at(address_t addr);

// Override the function that gets called when the CPU
// throws an execute space protection fault.
void set_fault_handler(execute_fault_t func) noexcept { m_fault = func; }
Expand Down
9 changes: 7 additions & 2 deletions lib/libriscv/cpu_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,12 @@ INSTRUCTION(RV32I_BC_SYSTEM, rv32i_system) {
// Invoke SYSTEM
MACHINE().system(instr);
// Restore counters
counter.retrieve_max_counter(MACHINE());
counter.retrieve_counters(MACHINE());
if (UNLIKELY(counter.overflowed() || pc != REGISTERS().pc))
{
pc = REGISTERS().pc;
goto check_jump;
}
// Overflow-check, next block
NEXT_BLOCK(4, true);
}
Expand Down Expand Up @@ -198,7 +203,7 @@ INSTRUCTION(RV32I_BC_SYSCALL, rv32i_syscall) {
counter.apply(MACHINE());
// Invoke system call
MACHINE().system_call(REG(REG_ECALL));
// Restore max counter
// Restore counters
counter.retrieve_counters(MACHINE());
if (UNLIKELY(counter.overflowed() || pc != REGISTERS().pc))
{
Expand Down
6 changes: 6 additions & 0 deletions lib/libriscv/cpu_inaccurate_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,12 @@ INSTRUCTION(RV32I_BC_SYSTEM, rv32i_system)
REGISTERS().pc = pc;
// Invoke SYSTEM
MACHINE().system(instr);
// Check if we need to jump
if (UNLIKELY(pc != REGISTERS().pc))
{
pc = REGISTERS().pc;
goto check_jump;
}
// Overflow-check, next block
NEXT_BLOCK(4, true);
}
Expand Down
Loading

0 comments on commit 822fb8d

Please sign in to comment.