Skip to content

Commit

Permalink
JitArm64: Initial implementation of the BLR optimization.
Browse files Browse the repository at this point in the history
  • Loading branch information
degasus committed Feb 1, 2017
1 parent 24951fd commit 40695c5
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 17 deletions.
144 changes: 141 additions & 3 deletions Source/Core/Core/PowerPC/JitArm64/Jit.cpp
Expand Up @@ -179,6 +179,7 @@ void JitArm64::Cleanup()
MOVP2R(X0, &GPFifo::FastCheckGatherPipe);
BLR(X0);
gpr.Unlock(W0);
js.fifoBytesSinceCheck = 0;
}
}

Expand All @@ -192,6 +193,143 @@ void JitArm64::DoDownCount()
gpr.Unlock(WA, WB);
}

// Emits code that rewinds the host stack pointer to the value recorded in
// ppcState.stored_stack_pointer, dropping every {host address, PPC PC} pair
// that the BLR optimization has pushed since then. Clobbers X0.
void JitArm64::ResetStack()
{
// Reload the stack pointer that GenerateAsm() saved on dispatcher entry.
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
// GenerateAsm() pushes one 16-byte {nullptr; -1} pair right after saving the
// stack pointer, so SP is set 16 bytes below the saved value to keep that
// invalid-destination pair on top of the stack.
SUB(SP, X0, 16);
}

// Emits the block exit for a branch-with-link to the constant PPC address
// `destination`. `after` is the PPC address pushed as the expected return PC
// (the callers in the branch instructions pass js.compilerPC + 4).
//
// Emitted layout: push {host return address, after} on the host stack, a
// linkable BL towards `destination`, and finally a regular linkable exit
// towards `after`, which is where the pushed host address leads.
void JitArm64::WriteLKExit(u32 destination, u32 after)
{
Cleanup();
DoDownCount();

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);

// Push {ARM_PC+20; PPC_PC} on the stack
// X1 carries the PPC return address and X0 the host address 20 bytes past the
// ADR itself; WriteBLRExit() later pops this pair and compares the PPC half.
MOVI2R(X1, after);
ADR(X0, 20);
STP(INDEX_PRE, X0, X1, SP, -16);

// Register the call site as link data. With call == true,
// JitArm64BlockCache::WriteLinkBlock() patches this location with BL
// instead of B.
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = destination;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = true;
b->linkData.push_back(linkData);

// Unlinked default: hand the destination to the dispatcher via a call.
MOVI2R(DISPATCHER_PC, destination);
BL(dispatcher);

// MOVI2R might only require one instruction. So the const offset of 20 bytes
// might be wrong. Be sure and just add a NOP here.
HINT(HINT_NOP);

// Write the regular exit node after the return.
// The same linkData object is reused; every field set above is overwritten
// before it is pushed a second time, now as a plain (non-call) exit.
linkData.exitAddress = after;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);

MOVI2R(DISPATCHER_PC, after);
B(dispatcher);
}

// Emits the block exit for a branch-with-link whose PPC destination is held
// in the host register `dest` (used by bcctrx when the link bit is set).
// `after` is the PPC address pushed as the expected return PC.
//
// Unlike the constant-destination overload, no link data is recorded for the
// call itself; only the regular exit towards `after` is made linkable.
// `dest` is unlocked in the register cache by this function.
void JitArm64::WriteLKExit(Arm64Gen::ARM64Reg dest, u32 after)
{
Cleanup();
DoDownCount();

// Move the destination into DISPATCHER_PC first: X0 and X1 are overwritten
// further down.
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);
gpr.Unlock(dest);

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);

// Push {host address, PPC return address}. The +12 offset spans ADR, STP and
// BL (assuming each is emitted as a single 4-byte instruction), so the pushed
// host address is the instruction following the BL.
MOVI2R(X1, after);
ADR(X0, 12);
STP(INDEX_PRE, X0, X1, SP, -16);

BL(dispatcher);

// Write the regular exit node after the return.
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = after;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);

MOVI2R(DISPATCHER_PC, after);
B(dispatcher);
}

// Emits a fake call frame for a branch-with-link that was inlined into the
// current block (bx() calls this when LK is set but the branch is not the
// last instruction of the block).
//
// The emitted code pushes {host address, after} on the host stack, exactly
// like WriteLKExit(), and then uses BL to jump over a regular linkable exit
// towards `after`. Execution continues in the current block; a later
// WriteBLRExit() whose target equals `after` returns to the skipped exit.
//
// `after` is the PPC address following the inlined call.
void JitArm64::FakeLKExit(u32 after)
{
  // We may need to fake the BLR stack on inlined CALL instructions.
  // Else we can't return to this location any more.

  // The BL emitted below overwrites X30. Because this block keeps executing
  // afterwards, the register cache must neither hold a live value in W30 nor
  // hand it out as one of the temporaries, so lock it for the duration.
  gpr.Lock(W30);

  ARM64Reg after_reg = gpr.GetReg();
  ARM64Reg code_reg = gpr.GetReg();
  MOVI2R(after_reg, after);
  // +12 spans ADR, STP and BL, so the pushed host address is the exit stub
  // that directly follows the BL.
  ADR(EncodeRegTo64(code_reg), 12);
  STP(INDEX_PRE, EncodeRegTo64(code_reg), EncodeRegTo64(after_reg), SP, -16);
  gpr.Unlock(after_reg, code_reg);

  FixupBranch skip_exit = BL();
  gpr.Unlock(W30);

  // Write the regular exit node after the return.
  JitBlock* b = js.curBlock;
  JitBlock::LinkData linkData;
  linkData.exitAddress = after;
  linkData.exitPtrs = GetWritableCodePtr();
  linkData.linkStatus = false;
  linkData.call = false;
  b->linkData.push_back(linkData);

  MOVI2R(DISPATCHER_PC, after);
  B(dispatcher);

  SetJumpTarget(skip_exit);
}

// Emits the block exit for a PPC return (bclrx) whose target PPC address is
// held in the host register `dest`.
//
// The emitted code pops the top {host address, PPC PC} pair from the host
// stack. If the popped PPC PC equals `dest`, it returns straight to the
// popped host address. Otherwise it resets the host stack and leaves through
// the dispatcher with `dest` as the next PC.
// `dest` and the two temporaries are unlocked in the register cache at the end.
void JitArm64::WriteBLRExit(Arm64Gen::ARM64Reg dest)
{
Cleanup();

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);

ARM64Reg code = gpr.GetReg();
ARM64Reg pc = gpr.GetReg();

// Check if {ARM_PC, PPC_PC} matches the current state.
// The pair is popped unconditionally (post-indexed load, SP += 16).
LDP(INDEX_POST, EncodeRegTo64(code), EncodeRegTo64(pc), SP, 16);
CMP(pc, dest);
FixupBranch no_match = B(CC_NEQ);

// Match: jump to the host address that the call site pushed.
// NOTE(review): DoDownCount() is emitted after the compare-and-branch on both
// paths, presumably because it changes the condition flags and the linked
// exit at the return site branches on them (B(CC_PL) in WriteLinkBlock) -
// confirm against DoDownCount().
DoDownCount();

RET(EncodeRegTo64(code));

// Mismatch: the popped entry does not belong to this return.
SetJumpTarget(no_match);

DoDownCount();

// Copy the target before ResetStack(), which overwrites X0.
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);

ResetStack();

B(dispatcher);

gpr.Unlock(dest, pc, code);
}

// Exits
void JitArm64::WriteExit(u32 destination)
{
Expand All @@ -207,6 +345,7 @@ void JitArm64::WriteExit(u32 destination)
linkData.exitAddress = destination;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);

MOVI2R(DISPATCHER_PC, destination);
Expand Down Expand Up @@ -399,11 +538,11 @@ void JitArm64::Jit(u32)
}

JitBlock* b = blocks.AllocateBlock(em_address);
const u8* BlockPtr = DoJit(em_address, &code_buffer, b, nextPC);
DoJit(em_address, &code_buffer, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
}

const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
{
if (em_address == 0)
{
Expand Down Expand Up @@ -629,5 +768,4 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB

FlushIcache();
farcode.FlushIcache();
return start;
}
7 changes: 6 additions & 1 deletion Source/Core/Core/PowerPC/JitArm64/Jit.h
Expand Up @@ -219,10 +219,11 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);

const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
void DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);

void DoDownCount();
void Cleanup();
void ResetStack();

// AsmRoutines
void GenerateAsm();
Expand All @@ -238,6 +239,10 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
void WriteExit(Arm64Gen::ARM64Reg dest);
void WriteExceptionExit(u32 destination, bool only_external = false);
void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false);
void WriteLKExit(u32 destination, u32 after);
void WriteLKExit(Arm64Gen::ARM64Reg dest, u32 after);
void FakeLKExit(u32 after);
void WriteBLRExit(Arm64Gen::ARM64Reg dest);

FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);

Expand Down
28 changes: 19 additions & 9 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64Cache.cpp
Expand Up @@ -19,21 +19,31 @@ void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const

if (dest)
{
// Are we able to jump directly to the normal entry?
s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2;
if (distance >= -0x40000 && distance <= 0x3FFFF)
if (source.call)
{
emit.B(CC_PL, dest->normalEntry);
emit.BL(dest->checkedEntry);
}
else
{
// Are we able to jump directly to the normal entry?
s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2;
if (distance >= -0x40000 && distance <= 0x3FFFF)
{
emit.B(CC_PL, dest->normalEntry);
}

// Use the checked entry if either downcount is smaller than zero,
// or if we're not able to inline the downcount check here.
emit.B(dest->checkedEntry);
}

// Use the checked entry if either downcount is smaller zero,
// or if we're not able to inline the downcount check here.
emit.B(dest->checkedEntry);
}
else
{
emit.MOVI2R(DISPATCHER_PC, source.exitAddress);
emit.B(m_jit.GetAsmRoutines()->dispatcher);
if (source.call)
emit.BL(m_jit.GetAsmRoutines()->dispatcher);
else
emit.B(m_jit.GetAsmRoutines()->dispatcher);
}
emit.FlushIcache();
}
Expand Down
37 changes: 33 additions & 4 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
Expand Up @@ -92,6 +92,13 @@ void JitArm64::bx(UGeckoInstruction inst)

if (!js.isLastInstruction)
{
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack as the RET instruction was not
// found in the same block. This is a big overhead, but still
// better than calling the dispatcher.
FakeLKExit(js.compilerPC + 4);
}
return;
}

Expand All @@ -112,7 +119,14 @@ void JitArm64::bx(UGeckoInstruction inst)
return;
}

WriteExit(destination);
if (inst.LK)
{
WriteLKExit(destination, js.compilerPC + 4);
}
else
{
WriteExit(destination);
}
}

void JitArm64::bcx(UGeckoInstruction inst)
Expand Down Expand Up @@ -162,7 +176,14 @@ void JitArm64::bcx(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);

WriteExit(destination);
if (inst.LK)
{
WriteLKExit(destination, js.compilerPC + 4);
}
else
{
WriteExit(destination);
}

SwitchToNearCode();

Expand Down Expand Up @@ -211,7 +232,15 @@ void JitArm64::bcctrx(UGeckoInstruction inst)

LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_CTR]));
AND(WA, WA, 30, 29); // Wipe the bottom 2 bits.
WriteExit(WA);

if (inst.LK_3)
{
WriteLKExit(WA, js.compilerPC + 4);
}
else
{
WriteExit(WA);
}
}

void JitArm64::bclrx(UGeckoInstruction inst)
Expand Down Expand Up @@ -264,7 +293,7 @@ void JitArm64::bclrx(UGeckoInstruction inst)
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);

WriteExit(WA);
WriteBLRExit(WA);

if (conditional)
SwitchToNearCode();
Expand Down
Expand Up @@ -56,6 +56,10 @@ void JitArm64::mtmsr(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);

// Our jit cache also stores some MSR bits, as they have changed, we either
// have to validate them in the BLR/RET check, or just flush the stack here.
ResetStack();

WriteExceptionExit(js.compilerPC + 4, true);
}

Expand Down
14 changes: 14 additions & 0 deletions Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
Expand Up @@ -28,6 +28,14 @@ void JitArm64::GenerateAsm()

MOVP2R(PPC_REG, &PowerPC::ppcState);

// Store the stack pointer, so we can reset it if the BLR optimization fails.
ADD(X0, SP, 0);
STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));

// Push {nullptr; -1} as invalid destination on the stack.
MOVI2R(X0, 0xFFFFFFFF);
STP(INDEX_PRE, ZR, X0, SP, -16);

// The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
// Advance() does an exception check so we don't know what PC to use until afterwards.
FixupBranch to_start_of_timing_slice = B();
Expand Down Expand Up @@ -119,6 +127,7 @@ void JitArm64::GenerateAsm()

// Call JIT
SetJumpTarget(no_block_available);
ResetStack();
MOV(W0, DISPATCHER_PC);
MOVP2R(X30, reinterpret_cast<void*>(&JitTrampoline));
BLR(X30);
Expand Down Expand Up @@ -150,6 +159,11 @@ void JitArm64::GenerateAsm()
B(dispatcherNoCheck);

SetJumpTarget(Exit);

// Reset the stack pointer, as the BLR optimization has touched it.
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
ADD(SP, X0, 0);

ABI_PopRegisters(regs_to_save);
RET(X30);

Expand Down
1 change: 1 addition & 0 deletions Source/Core/Core/PowerPC/JitCommon/JitCache.h
Expand Up @@ -58,6 +58,7 @@ struct JitBlock
u8* exitPtrs; // to be able to rewrite the exit jump
u32 exitAddress;
bool linkStatus; // is it already linked?
bool call;
};
std::vector<LinkData> linkData;

Expand Down
3 changes: 3 additions & 0 deletions Source/Core/Core/PowerPC/PowerPC.h
Expand Up @@ -116,6 +116,9 @@ struct PowerPCState
// also for power management, but we don't care about that.
u32 spr[1024];

// Storage for the stack pointer of the BLR optimization.
u8* stored_stack_pointer;

std::array<std::array<tlb_entry, TLB_SIZE / TLB_WAYS>, NUM_TLBS> tlb;

u32 pagetable_base;
Expand Down

0 comments on commit 40695c5

Please sign in to comment.