Skip to content

Commit

Permalink
JitArm64: Initial implementation of the BLR optimization.
Browse files Browse the repository at this point in the history
  • Loading branch information
degasus committed Feb 2, 2017
1 parent f20113f commit 384efb0
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 29 deletions.
162 changes: 149 additions & 13 deletions Source/Core/Core/PowerPC/JitArm64/Jit.cpp
Expand Up @@ -46,16 +46,17 @@ void JitArm64::Init()
UpdateMemoryOptions();
gpr.Init(this);
fpr.Init(this);

blocks.Init();
GenerateAsm();

code_block.m_stats = &js.st;
code_block.m_gpa = &js.gpa;
code_block.m_fpa = &js.fpa;
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
m_enable_blr_optimization = true;

GenerateAsm();

m_supports_cycle_counter = HasCycleCounters();
}
Expand Down Expand Up @@ -192,40 +193,176 @@ void JitArm64::DoDownCount()
gpr.Unlock(WA, WB);
}

// Exits
void JitArm64::WriteExit(u32 destination)
// Emits code that rewinds the host stack pointer to 16 bytes below the value
// saved in ppcState.stored_stack_pointer. Nothing is emitted when the BLR
// optimization is disabled.
// The 16-byte offset matches the size of the {nullptr; -1} pair that
// GenerateAsm pushes right after it stores the stack pointer.
void JitArm64::ResetStack()
{
  if (m_enable_blr_optimization)
  {
    // X0 serves as scratch for the saved stack pointer value.
    LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
    SUB(SP, X0, 16);
  }
}

// Emits the exit sequence for a jump to the constant PPC address `destination`.
//
// destination:               PPC address the block exits to.
// LK:                        request a "call" style exit; it is masked with
//                            m_enable_blr_optimization below, so it only takes
//                            effect when the BLR optimization is enabled.
// exit_address_after_return: PPC address to continue at once the callee
//                            returns; only used when LK is in effect.
//
// Every exit emitted here is also recorded as a JitBlock::LinkData entry on the
// current block, with exitPtrs pointing at the current write position.
void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return)
{
Cleanup();
DoDownCount();

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);

// If nobody has taken care of this yet (this can be removed when all branches are done)
LK &= m_enable_blr_optimization;

if (LK)
{
// Push {ARM_PC+20; PPC_PC} on the stack
// X1 holds the PPC return address, X0 the host address 20 bytes past the ADR.
MOVI2R(X1, exit_address_after_return);
ADR(X0, 20);
STP(INDEX_PRE, X0, X1, SP, -16);
}

// Record the exit to `destination`; linkData.call carries the (masked) LK flag.
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = destination;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = LK;
b->linkData.push_back(linkData);

MOVI2R(DISPATCHER_PC, destination);
// NOTE(review): this unconditional branch precedes the LK-dependent branch
// below. It looks like the pre-change line of this diff hunk rather than live
// code; confirm against the actual file.
B(dispatcher);

if (!LK)
{
B(dispatcher);
}
else
{
// Call-style exit: BL instead of B.
BL(dispatcher);

// MOVI2R might only require one instruction. So the const offset of 20 bytes
// might be wrong. Be sure and just add a NOP here.
HINT(HINT_NOP);

// Write the regular exit node after the return.
// The same LinkData object is reused; every field is overwritten.
linkData.exitAddress = exit_address_after_return;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);

MOVI2R(DISPATCHER_PC, exit_address_after_return);
B(dispatcher);
}
}

void JitArm64::WriteExit(ARM64Reg Reg)
// Emits the exit sequence for a jump to the PPC address held in register `dest`.
//
// dest:                      register holding the destination PPC address; it is
//                            copied into DISPATCHER_PC if it is a different
//                            register, and then unlocked.
// LK:                        request a "call" style exit; it is masked with
//                            m_enable_blr_optimization below.
// exit_address_after_return: PPC address to continue at once the callee
//                            returns; only used when LK is in effect.
void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return)
{
Cleanup();
DoDownCount();

// NOTE(review): the next three lines use `Reg`, which is not a parameter of
// this signature. They look like the pre-change lines of this diff hunk (the
// `dest` versions follow below); confirm against the actual file.
if (Reg != DISPATCHER_PC)
MOV(DISPATCHER_PC, Reg);
gpr.Unlock(Reg);
LK &= m_enable_blr_optimization;

if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);
gpr.Unlock(dest);

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);

if (!LK)
{
B(dispatcher);
}
else
{
// Push {ARM_PC, PPC_PC} on the stack
// ADR +12 covers the ADR, STP and BL emitted here, so the pushed host address
// is the first instruction after the BL (assuming each of these emitter calls
// produces one 4-byte instruction).
MOVI2R(X1, exit_address_after_return);
ADR(X0, 12);
STP(INDEX_PRE, X0, X1, SP, -16);

BL(dispatcher);

// Write the regular exit node after the return.
JitBlock* b = js.curBlock;
JitBlock::LinkData linkData;
linkData.exitAddress = exit_address_after_return;
linkData.exitPtrs = GetWritableCodePtr();
linkData.linkStatus = false;
linkData.call = false;
b->linkData.push_back(linkData);

MOVI2R(DISPATCHER_PC, exit_address_after_return);
B(dispatcher);
}
}

// Pushes a {host return address, PPC return address} pair on the stack for a
// call that is being followed inline, and emits the regular exit node that the
// pushed host address points at. Emits nothing when the BLR optimization is
// disabled.
//
// exit_address_after_return: PPC address execution continues at after the
//                            inlined call returns.
void JitArm64::FakeLKExit(u32 exit_address_after_return)
{
  if (!m_enable_blr_optimization)
    return;

  // An inlined CALL needs a faked BLR stack entry, otherwise there is no way to
  // return to this location later on.
  ARM64Reg ppc_return = gpr.GetReg();
  ARM64Reg host_return = gpr.GetReg();
  MOVI2R(ppc_return, exit_address_after_return);
  // ADR +12 covers the ADR, STP and BL emitted here, so the pushed host address
  // is the exit node written below (assuming one 4-byte instruction each).
  ADR(EncodeRegTo64(host_return), 12);
  STP(INDEX_PRE, EncodeRegTo64(host_return), EncodeRegTo64(ppc_return), SP, -16);
  gpr.Unlock(ppc_return, host_return);

  // Branch-with-link over the exit node; its target is fixed up at the end.
  FixupBranch continue_inline = BL();

  // Regular exit node used after the return.
  JitBlock::LinkData return_exit;
  return_exit.call = false;
  return_exit.linkStatus = false;
  return_exit.exitPtrs = GetWritableCodePtr();
  return_exit.exitAddress = exit_address_after_return;
  js.curBlock->linkData.push_back(return_exit);

  MOVI2R(DISPATCHER_PC, exit_address_after_return);
  B(dispatcher);

  SetJumpTarget(continue_inline);
}

// Emits the exit for a PPC "branch to LR" whose target address is in `dest`.
//
// With the BLR optimization disabled this is a plain WriteExit(dest).
// Otherwise the top {host address, PPC address} pair is popped from the stack:
// if the popped PPC address equals `dest`, the code returns straight to the
// popped host address; if not, the stack is reset and control goes to the
// dispatcher with DISPATCHER_PC = dest.
void JitArm64::WriteBLRExit(Arm64Gen::ARM64Reg dest)
{
  if (!m_enable_blr_optimization)
  {
    WriteExit(dest);
    return;
  }

  Cleanup();

  if (Profiler::g_ProfileBlocks)
    EndTimeProfile(js.curBlock);

  ARM64Reg host_return = gpr.GetReg();
  ARM64Reg ppc_return = gpr.GetReg();

  // Pop {ARM_PC, PPC_PC} and compare the stored PPC address with the target.
  LDP(INDEX_POST, EncodeRegTo64(host_return), EncodeRegTo64(ppc_return), SP, 16);
  CMP(ppc_return, dest);
  FixupBranch mismatch = B(CC_NEQ);

  // Match: account for the cycles and return to the stored host address.
  DoDownCount();
  RET(EncodeRegTo64(host_return));

  // Mismatch: fall back to the dispatcher with a freshly reset stack.
  SetJumpTarget(mismatch);
  DoDownCount();
  if (dest != DISPATCHER_PC)
    MOV(DISPATCHER_PC, dest);
  ResetStack();
  B(dispatcher);

  gpr.Unlock(dest, ppc_return, host_return);
}

void JitArm64::WriteExceptionExit(u32 destination, bool only_external)
Expand Down Expand Up @@ -399,11 +536,11 @@ void JitArm64::Jit(u32)
}

JitBlock* b = blocks.AllocateBlock(em_address);
const u8* BlockPtr = DoJit(em_address, &code_buffer, b, nextPC);
DoJit(em_address, &code_buffer, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
}

const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)
{
if (em_address == 0)
{
Expand Down Expand Up @@ -629,5 +766,4 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB

FlushIcache();
farcode.FlushIcache();
return start;
}
11 changes: 8 additions & 3 deletions Source/Core/Core/PowerPC/JitArm64/Jit.h
Expand Up @@ -190,6 +190,8 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
// Do we support cycle counter profiling?
bool m_supports_cycle_counter;

bool m_enable_blr_optimization;

void EmitResetCycleCounters();
void EmitGetCycles(Arm64Gen::ARM64Reg reg);

Expand Down Expand Up @@ -219,10 +221,11 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);

const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
void DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);

void DoDownCount();
void Cleanup();
void ResetStack();

// AsmRoutines
void GenerateAsm();
Expand All @@ -234,10 +237,12 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
void EndTimeProfile(JitBlock* b);

// Exits
void WriteExit(u32 destination);
void WriteExit(Arm64Gen::ARM64Reg dest);
void WriteExit(u32 destination, bool LK = false, u32 exit_address_after_return = 0);
void WriteExit(Arm64Gen::ARM64Reg dest, bool LK = false, u32 exit_address_after_return = 0);
void WriteExceptionExit(u32 destination, bool only_external = false);
void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false);
void FakeLKExit(u32 exit_address_after_return);
void WriteBLRExit(Arm64Gen::ARM64Reg dest);

FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);

Expand Down
28 changes: 19 additions & 9 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64Cache.cpp
Expand Up @@ -19,21 +19,31 @@ void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const

if (dest)
{
// Are we able to jump directly to the normal entry?
s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2;
if (distance >= -0x40000 && distance <= 0x3FFFF)
if (source.call)
{
emit.B(CC_PL, dest->normalEntry);
emit.BL(dest->checkedEntry);
}
else
{
// Are we able to jump directly to the normal entry?
s64 distance = ((s64)dest->normalEntry - (s64)location) >> 2;
if (distance >= -0x40000 && distance <= 0x3FFFF)
{
emit.B(CC_PL, dest->normalEntry);
}

// Use the checked entry if either the downcount is smaller than zero,
// or if we're not able to inline the downcount check here.
emit.B(dest->checkedEntry);
}

// Use the checked entry if either downcount is smaller zero,
// or if we're not able to inline the downcount check here.
emit.B(dest->checkedEntry);
}
else
{
emit.MOVI2R(DISPATCHER_PC, source.exitAddress);
emit.B(m_jit.GetAsmRoutines()->dispatcher);
if (source.call)
emit.BL(m_jit.GetAsmRoutines()->dispatcher);
else
emit.B(m_jit.GetAsmRoutines()->dispatcher);
}
emit.FlushIcache();
}
Expand Down
16 changes: 12 additions & 4 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
Expand Up @@ -92,6 +92,13 @@ void JitArm64::bx(UGeckoInstruction inst)

if (!js.isLastInstruction)
{
if (inst.LK && !js.op->skipLRStack)
{
// We have to fake the stack as the RET instruction was not
// found in the same block. This is a big overhead, but still
// better than calling the dispatcher.
FakeLKExit(js.compilerPC + 4);
}
return;
}

Expand All @@ -112,7 +119,7 @@ void JitArm64::bx(UGeckoInstruction inst)
return;
}

WriteExit(destination);
WriteExit(destination, inst.LK, js.compilerPC + 4);
}

void JitArm64::bcx(UGeckoInstruction inst)
Expand Down Expand Up @@ -162,7 +169,7 @@ void JitArm64::bcx(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);

WriteExit(destination);
WriteExit(destination, inst.LK, js.compilerPC + 4);

SwitchToNearCode();

Expand Down Expand Up @@ -211,7 +218,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst)

LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_CTR]));
AND(WA, WA, 30, 29); // Wipe the bottom 2 bits.
WriteExit(WA);

WriteExit(WA, inst.LK_3, js.compilerPC + 4);
}

void JitArm64::bclrx(UGeckoInstruction inst)
Expand Down Expand Up @@ -264,7 +272,7 @@ void JitArm64::bclrx(UGeckoInstruction inst)
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);

WriteExit(WA);
WriteBLRExit(WA);

if (conditional)
SwitchToNearCode();
Expand Down
Expand Up @@ -56,6 +56,10 @@ void JitArm64::mtmsr(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);

// Our jit cache also stores some MSR bits. As they have changed, we either
// have to validate them in the BLR/RET check, or just flush the stack here.
ResetStack();

WriteExceptionExit(js.compilerPC + 4, true);
}

Expand Down
14 changes: 14 additions & 0 deletions Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
Expand Up @@ -28,6 +28,14 @@ void JitArm64::GenerateAsm()

MOVP2R(PPC_REG, &PowerPC::ppcState);

// Store the stack pointer, so we can reset it if the BLR optimization fails.
ADD(X0, SP, 0);
STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));

// Push {nullptr; -1} as invalid destination on the stack.
MOVI2R(X0, 0xFFFFFFFF);
STP(INDEX_PRE, ZR, X0, SP, -16);

// The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
// Advance() does an exception check so we don't know what PC to use until afterwards.
FixupBranch to_start_of_timing_slice = B();
Expand Down Expand Up @@ -119,6 +127,7 @@ void JitArm64::GenerateAsm()

// Call JIT
SetJumpTarget(no_block_available);
ResetStack();
MOV(W0, DISPATCHER_PC);
MOVP2R(X30, reinterpret_cast<void*>(&JitTrampoline));
BLR(X30);
Expand Down Expand Up @@ -150,6 +159,11 @@ void JitArm64::GenerateAsm()
B(dispatcherNoCheck);

SetJumpTarget(Exit);

// Reset the stack pointer, as the BLR optimization has touched it.
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
ADD(SP, X0, 0);

ABI_PopRegisters(regs_to_save);
RET(X30);

Expand Down

0 comments on commit 384efb0

Please sign in to comment.