Jit: Improve block lookup performance through a shm memory segment. #11737

Merged
merged 1 commit on May 21, 2023
61 changes: 44 additions & 17 deletions Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@@ -20,6 +20,10 @@

using namespace Gen;

// These need to be next to each other so that the assembly
// code can compare them easily.
static_assert(offsetof(JitBlockData, effectiveAddress) + 4 == offsetof(JitBlockData, msrBits));

Jit64AsmRoutineManager::Jit64AsmRoutineManager(Jit64& jit) : CommonAsmRoutines(jit)
{
}
@@ -103,35 +107,58 @@ void Jit64AsmRoutineManager::Generate()
const bool assembly_dispatcher = true;
if (assembly_dispatcher)
{
// Fast block number lookup.
// ((PC >> 2) & mask) * sizeof(JitBlock*) = (PC & (mask << 2)) * 2
MOV(32, R(RSCRATCH), PPCSTATE(pc));
// Keep a copy for later.
MOV(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
AND(32, R(RSCRATCH), Imm32(JitBaseBlockCache::FAST_BLOCK_MAP_MASK << 2));
if (icache <= INT_MAX)
if (m_jit.GetBlockCache()->GetFastBlockMap())
{
MOV(64, R(RSCRATCH), MScaled(RSCRATCH, SCALE_2, static_cast<s32>(icache)));
u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap());
MOV(32, R(RSCRATCH), PPCSTATE(pc));

MOV(64, R(RSCRATCH2), Imm64(icache));
// Each 4-byte offset of the PC register corresponds to an 8-byte offset
// in the lookup table due to host pointers being 8 bytes long.
MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
Contributor
This could also use a comment for how this actually arrives at the correct address; it took me a minute to figure out why SCALE_2 is correct.

(It's because each 4-byte offset of the PC register corresponds to an 8-byte offset in the lookup table, due to 8-byte host pointers.)
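
To make the reviewer's point concrete, here is a minimal C++ sketch of the same indexing arithmetic (illustrative only; the function and parameter names are invented for this example and are not part of the PR):

#include <cstdint>

struct JitBlock;

// Because the PC is always 4-byte aligned, indexing by (pc >> 2) entries of
// 8 bytes each yields the same byte offset as pc * 2, which is exactly what
// the SCALE_2 addressing mode computes from the unshifted PC.
JitBlock* LookupBlock(JitBlock** table, uint32_t pc)
{
  const uint64_t byte_offset = static_cast<uint64_t>(pc) * 2;  // == (pc >> 2) * sizeof(JitBlock*)
  return *reinterpret_cast<JitBlock**>(reinterpret_cast<char*>(table) + byte_offset);
}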

}
else
{
MOV(64, R(RSCRATCH2), Imm64(icache));
MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
// Fast block number lookup.
// ((PC >> 2) & mask) * sizeof(JitBlock*) = (PC & (mask << 2)) * 2
MOV(32, R(RSCRATCH), PPCSTATE(pc));
// Keep a copy for later.
MOV(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMapFallback());
AND(32, R(RSCRATCH), Imm32(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 2));
if (icache <= INT_MAX)
{
MOV(64, R(RSCRATCH), MScaled(RSCRATCH, SCALE_2, static_cast<s32>(icache)));
}
else
{
MOV(64, R(RSCRATCH2), Imm64(icache));
MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0));
}
}

// Check if we found a block.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch not_found = J_CC(CC_Z);

// Check both block.effectiveAddress and block.msrBits.
// Check block.msrBits.
MOV(32, R(RSCRATCH2), PPCSTATE(msr));
AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
SHL(64, R(RSCRATCH2), Imm8(32));
// RSCRATCH_EXTRA still has the PC.
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
CMP(64, R(RSCRATCH2),
MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));

if (m_jit.GetBlockCache()->GetFastBlockMap())
{
CMP(32, R(RSCRATCH2), MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, msrBits))));
}
else
{
// Also check the block.effectiveAddress
SHL(64, R(RSCRATCH2), Imm8(32));
// RSCRATCH_EXTRA still has the PC.
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
CMP(64, R(RSCRATCH2),
MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
Comment on lines +158 to +159
Contributor
Not your fault but jeez this is evil and only works because effectiveAddress and msrBits are right next to each other in JitBlockData. This probably needs some comments and asserts...

Member
More specifically, I would recommend something like:

static_assert(offsetof(JitBlockData, effectiveAddress) + 4 == offsetof(JitBlockData, msrBits));

This way, any violations will be caught at compile time, and the assert is clear enough that I don't think writing a comment is necessary.
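
As a rough illustration of why the adjacency matters (a standalone sketch with simplified types, not code from the PR): on a little-endian host the two adjacent 32-bit fields can be read as one 64-bit value and compared against (msrBits << 32) | pc in a single operation, which is what the dispatcher's 64-bit CMP does.

#include <cstddef>
#include <cstdint>
#include <cstring>

struct JitBlockData
{
  uint32_t effectiveAddress;  // must sit directly before msrBits...
  uint32_t msrBits;           // ...so both can be checked with one 64-bit compare
};
static_assert(offsetof(JitBlockData, effectiveAddress) + 4 == offsetof(JitBlockData, msrBits));

bool MatchesPcAndMsr(const JitBlockData& b, uint32_t pc, uint32_t msr_bits)
{
  const uint64_t expected = (static_cast<uint64_t>(msr_bits) << 32) | pc;
  uint64_t stored;
  std::memcpy(&stored, &b.effectiveAddress, sizeof(stored));  // low half = effectiveAddress on little-endian
  return stored == expected;
}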

}

FixupBranch state_mismatch = J_CC(CC_NE);

// Success; branch to the block we found.
90 changes: 61 additions & 29 deletions Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -110,35 +110,67 @@ void JitArm64::GenerateAsm()
jo.fastmem_arena ? memory.GetLogicalBase() : memory.GetLogicalPageMappingsBase());
SetJumpTarget(membaseend);

// iCache[(address >> 2) & iCache_Mask];
ARM64Reg pc_masked = ARM64Reg::W25;
ARM64Reg cache_base = ARM64Reg::X27;
ARM64Reg block = ARM64Reg::X30;
ORR(pc_masked, ARM64Reg::WZR, LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_MASK << 3, 32));
AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
LDR(block, cache_base, EncodeRegTo64(pc_masked));
FixupBranch not_found = CBZ(block);

// b.effectiveAddress != addr || b.msrBits != msr
ARM64Reg pc_and_msr = ARM64Reg::W25;
ARM64Reg pc_and_msr2 = ARM64Reg::W24;
LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
CMP(pc_and_msr, DISPATCHER_PC);
FixupBranch pc_missmatch = B(CC_NEQ);

LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
CMP(pc_and_msr, pc_and_msr2);
FixupBranch msr_missmatch = B(CC_NEQ);

// return blocks[block_num].normalEntry;
LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
BR(block);
SetJumpTarget(not_found);
SetJumpTarget(pc_missmatch);
SetJumpTarget(msr_missmatch);
if (GetBlockCache()->GetFastBlockMap())
{
// Check if there is a block
ARM64Reg pc_masked = ARM64Reg::X25;
ARM64Reg cache_base = ARM64Reg::X27;
ARM64Reg block = ARM64Reg::X30;
LSL(pc_masked, DISPATCHER_PC, 1);
MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
LDR(block, cache_base, pc_masked);
FixupBranch not_found = CBZ(block);

// b.msrBits != msr
ARM64Reg msr = ARM64Reg::W25;
ARM64Reg msr2 = ARM64Reg::W24;
LDR(IndexType::Unsigned, msr, PPC_REG, PPCSTATE_OFF(msr));
AND(msr, msr, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
LDR(IndexType::Unsigned, msr2, block, offsetof(JitBlockData, msrBits));
CMP(msr, msr2);

FixupBranch msr_missmatch = B(CC_NEQ);

// return blocks[block_num].normalEntry;
LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
BR(block);
SetJumpTarget(not_found);
SetJumpTarget(msr_missmatch);
}
else
{
// iCache[(address >> 2) & iCache_Mask];
ARM64Reg pc_masked = ARM64Reg::W25;
ARM64Reg cache_base = ARM64Reg::X27;
ARM64Reg block = ARM64Reg::X30;
ORR(pc_masked, ARM64Reg::WZR,
LogicalImm(JitBaseBlockCache::FAST_BLOCK_MAP_FALLBACK_MASK << 3, 32));
AND(pc_masked, pc_masked, DISPATCHER_PC, ArithOption(DISPATCHER_PC, ShiftType::LSL, 1));
MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap());
LDR(block, cache_base, EncodeRegTo64(pc_masked));
FixupBranch not_found = CBZ(block);

// b.effectiveAddress != addr || b.msrBits != msr
ARM64Reg pc_and_msr = ARM64Reg::W25;
ARM64Reg pc_and_msr2 = ARM64Reg::W24;
LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, effectiveAddress));
CMP(pc_and_msr, DISPATCHER_PC);
FixupBranch pc_missmatch = B(CC_NEQ);

LDR(IndexType::Unsigned, pc_and_msr2, PPC_REG, PPCSTATE_OFF(msr));
AND(pc_and_msr2, pc_and_msr2, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
LDR(IndexType::Unsigned, pc_and_msr, block, offsetof(JitBlockData, msrBits));
CMP(pc_and_msr, pc_and_msr2);

FixupBranch msr_missmatch = B(CC_NEQ);

// return blocks[block_num].normalEntry;
LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
BR(block);
SetJumpTarget(not_found);
SetJumpTarget(pc_missmatch);
SetJumpTarget(msr_missmatch);
}
}

// Call C version of Dispatch().
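One step the diff leaves implicit: with the shm-backed table every 4-byte-aligned PC owns its own slot, so a non-null entry already implies block->effectiveAddress == pc, and only the MSR translation bits still need to be re-checked. A small C++ sketch of that fast-path invariant (hypothetical helper, simplified types):

#include <cstdint>

struct JitBlock
{
  uint32_t effectiveAddress;
  uint32_t msrBits;
};

constexpr uint32_t JIT_CACHE_MSR_MASK = 0x30;  // matches the constant in JitCache.h

// Dispatcher fast path when the full shm-backed table is available.
const JitBlock* FastPathLookup(JitBlock* const* table, uint32_t pc, uint32_t msr)
{
  const JitBlock* block = table[pc >> 2];  // unique slot per aligned PC, no masking
  if (block == nullptr)
    return nullptr;                        // block not compiled yet
  if (block->msrBits != (msr & JIT_CACHE_MSR_MASK))
    return nullptr;                        // compiled under a different translation state
  return block;                            // effectiveAddress is guaranteed to equal pc
}
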
58 changes: 48 additions & 10 deletions Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
@@ -42,12 +42,21 @@ void JitBaseBlockCache::Init()
{
Common::JitRegister::Init(Config::Get(Config::MAIN_PERF_MAP_DIR));

m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");

Clear();
}

void JitBaseBlockCache::Shutdown()
{
Common::JitRegister::Shutdown();

if (m_fast_block_map)
{
m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
}

m_block_map_arena.ReleaseSHMSegment();
}

// This clears the JIT cache. It's called from JitCache.cpp when the JIT cache
@@ -70,7 +79,24 @@ void JitBaseBlockCache::Clear()

valid_block.ClearAll();

fast_block_map.fill(nullptr);
if (m_fast_block_map)
{
m_block_map_arena.ReleaseView(m_fast_block_map, FAST_BLOCK_MAP_SIZE);
m_block_map_arena.ReleaseSHMSegment();
m_block_map_arena.GrabSHMSegment(FAST_BLOCK_MAP_SIZE, "dolphin-emu-jitblock");
}

m_fast_block_map =
reinterpret_cast<JitBlock**>(m_block_map_arena.CreateView(0, FAST_BLOCK_MAP_SIZE));

if (m_fast_block_map)
{
m_fast_block_map_ptr = m_fast_block_map;
}
else
{
m_fast_block_map_ptr = m_fast_block_map_fallback.data();
}
}

void JitBaseBlockCache::Reset()
@@ -81,7 +107,12 @@ void JitBaseBlockCache::Reset()

JitBlock** JitBaseBlockCache::GetFastBlockMap()
{
return fast_block_map.data();
return m_fast_block_map;
}

JitBlock** JitBaseBlockCache::GetFastBlockMapFallback()
{
return m_fast_block_map_fallback.data();
}

void JitBaseBlockCache::RunOnBlocks(std::function<void(const JitBlock&)> f)
@@ -106,7 +137,7 @@ void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
const std::set<u32>& physical_addresses)
{
size_t index = FastLookupIndexForAddress(block.effectiveAddress);
fast_block_map[index] = &block;
m_fast_block_map_ptr[index] = &block;
block.fast_block_map_index = index;

block.physical_addresses = physical_addresses;
@@ -169,7 +200,7 @@ JitBlock* JitBaseBlockCache::GetBlockFromStartAddress(u32 addr, u32 msr)
const u8* JitBaseBlockCache::Dispatch()
{
const auto& ppc_state = m_jit.m_ppc_state;
JitBlock* block = fast_block_map[FastLookupIndexForAddress(ppc_state.pc)];
JitBlock* block = m_fast_block_map_ptr[FastLookupIndexForAddress(ppc_state.pc)];

if (!block || block->effectiveAddress != ppc_state.pc ||
block->msrBits != (ppc_state.msr.Hex & JIT_CACHE_MSR_MASK))
@@ -390,8 +421,8 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block)

void JitBaseBlockCache::DestroyBlock(JitBlock& block)
{
if (fast_block_map[block.fast_block_map_index] == &block)
fast_block_map[block.fast_block_map_index] = nullptr;
if (m_fast_block_map_ptr[block.fast_block_map_index] == &block)
m_fast_block_map_ptr[block.fast_block_map_index] = nullptr;

UnlinkBlock(block);

@@ -418,18 +449,25 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
return nullptr;

// Drop old fast block map entry
if (fast_block_map[block->fast_block_map_index] == block)
fast_block_map[block->fast_block_map_index] = nullptr;
if (m_fast_block_map_ptr[block->fast_block_map_index] == block)
m_fast_block_map_ptr[block->fast_block_map_index] = nullptr;

// And create a new one
size_t index = FastLookupIndexForAddress(addr);
fast_block_map[index] = block;
m_fast_block_map_ptr[index] = block;
block->fast_block_map_index = index;

return block;
}

size_t JitBaseBlockCache::FastLookupIndexForAddress(u32 address)
{
return (address >> 2) & FAST_BLOCK_MAP_MASK;
if (m_fast_block_map)
{
return address >> 2;
}
else
{
return (address >> 2) & FAST_BLOCK_MAP_FALLBACK_MASK;
}
}
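
Taken together, the allocation logic in Clear() amounts to the pattern below (a sketch assembled from the calls shown above; the helper name is hypothetical and the exact Common::MemArena signatures are assumed from the diff, with the 8 GB view presumably relying on the OS committing pages lazily):

// Hypothetical helper illustrating the allocate-or-fall-back pattern used in Clear().
void JitBaseBlockCache::AllocateFastBlockMap()
{
  // Try to map the 8 GB lookup table backed by the shm segment.
  m_fast_block_map =
      reinterpret_cast<JitBlock**>(m_block_map_arena.CreateView(0, FAST_BLOCK_MAP_SIZE));

  // If the large view could not be created, dispatch through the small
  // masked fallback array instead.
  if (m_fast_block_map)
    m_fast_block_map_ptr = m_fast_block_map;
  else
    m_fast_block_map_ptr = m_fast_block_map_fallback.data();
}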
22 changes: 18 additions & 4 deletions Source/Core/Core/PowerPC/JitCommon/JitCache.h
@@ -16,6 +16,7 @@
#include <vector>

#include "Common/CommonTypes.h"
#include "Core/HW/Memmap.h"

class JitBase;

@@ -131,8 +132,11 @@ class JitBaseBlockCache
// is valid (MSR.IR and MSR.DR, the address translation bits).
static constexpr u32 JIT_CACHE_MSR_MASK = 0x30;

static constexpr u32 FAST_BLOCK_MAP_ELEMENTS = 0x10000;
static constexpr u32 FAST_BLOCK_MAP_MASK = FAST_BLOCK_MAP_ELEMENTS - 1;
// The value for the map is determined like this:
// ((4 GB guest memory space) / (4 bytes per address)) * sizeof(JitBlock*)
static constexpr u64 FAST_BLOCK_MAP_SIZE = 0x2'0000'0000;
Contributor

You should document how one arrives at this number. You have a guest memory space of 4 GB and PPC instructions are always aligned to 4 bytes, so that gives you 4 GB / 4 bytes = 0x4000'0000 possible entries in the map (I guess it's a lookup table now technically but whatever). Each entry points to a JitBlock*, which is 8 bytes on 64 bit machines. That's 8 GB, or 0x2'0000'0000 bytes.
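
The reviewer's arithmetic, spelled out as compile-time checks (a standalone sketch; it assumes 8-byte host pointers, i.e. a 64-bit build):

#include <cstdint>

struct JitBlock;

constexpr uint64_t GUEST_ADDRESS_SPACE = 0x1'0000'0000;         // 4 GB of guest addresses
constexpr uint64_t POSSIBLE_PCS = GUEST_ADDRESS_SPACE / 4;      // PPC instructions are 4-byte aligned
constexpr uint64_t MAP_SIZE = POSSIBLE_PCS * sizeof(JitBlock*); // one 8-byte host pointer per slot
static_assert(POSSIBLE_PCS == 0x4000'0000);
static_assert(MAP_SIZE == 0x2'0000'0000);                       // 8 GB, i.e. FAST_BLOCK_MAP_SIZE above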

static constexpr u32 FAST_BLOCK_MAP_FALLBACK_ELEMENTS = 0x10000;
static constexpr u32 FAST_BLOCK_MAP_FALLBACK_MASK = FAST_BLOCK_MAP_FALLBACK_ELEMENTS - 1;

explicit JitBaseBlockCache(JitBase& jit);
virtual ~JitBaseBlockCache();
Expand All @@ -144,6 +148,7 @@ class JitBaseBlockCache

// Code Cache
JitBlock** GetFastBlockMap();
JitBlock** GetFastBlockMapFallback();
void RunOnBlocks(std::function<void(const JitBlock&)> f);

JitBlock* AllocateBlock(u32 em_address);
@@ -203,7 +208,16 @@ class JitBaseBlockCache
// It is used to provide a fast way to query if no icache invalidation is needed.
ValidBlockBitSet valid_block;

// This array is indexed with the masked PC and likely holds the correct block id.
// This array is indexed with the shifted PC and likely holds the correct block id.
// This is used as a fast cache of block_map used in the assembly dispatcher.
std::array<JitBlock*, FAST_BLOCK_MAP_ELEMENTS> fast_block_map{}; // start_addr & mask -> number
// It is implemented via a shm segment using m_block_map_arena.
JitBlock** m_fast_block_map = 0;
Common::MemArena m_block_map_arena;

// An alternative to the above fast_block_map but without a shm segment
// in case the shm memory region couldn't be allocated.
std::array<JitBlock*, FAST_BLOCK_MAP_FALLBACK_ELEMENTS>
m_fast_block_map_fallback{}; // start_addr & mask -> number

JitBlock** m_fast_block_map_ptr = 0;
};