Skip to content

Commit

Permalink
Jit64: Optimized idle skipping detection.
Browse files Browse the repository at this point in the history
  • Loading branch information
degasus committed Jul 28, 2018
1 parent 5333c17 commit a130b9a
Show file tree
Hide file tree
Showing 8 changed files with 144 additions and 108 deletions.
4 changes: 2 additions & 2 deletions Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ static std::array<GekkoOPTemplate, 54> primarytable =
{59, Interpreter::RunTable59, {"RunTable59", OpType::Subtable, 0, 0, 0, 0, 0}},
{63, Interpreter::RunTable63, {"RunTable63", OpType::Subtable, 0, 0, 0, 0, 0}},

{16, Interpreter::bcx, {"bcx", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
{18, Interpreter::bx, {"bx", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
{16, Interpreter::bcx, {"bcx", OpType::Branch, FL_ENDBLOCK, 1, 0, 0, 0}},
{18, Interpreter::bx, {"bx", OpType::Branch, FL_ENDBLOCK, 1, 0, 0, 0}},

{3, Interpreter::twi, {"twi", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
{17, Interpreter::sc, {"sc", OpType::System, FL_ENDBLOCK, 2, 0, 0, 0}},
Expand Down
35 changes: 19 additions & 16 deletions Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,25 +87,22 @@ void Jit64::bx(UGeckoInstruction inst)
gpr.Flush();
fpr.Flush();

u32 destination;
if (inst.AA)
destination = SignExt26(inst.LI << 2);
else
destination = js.compilerPC + SignExt26(inst.LI << 2);
#ifdef ACID_TEST
if (inst.LK)
AND(32, PPCSTATE(cr), Imm32(~(0xFF000000)));
#endif
if (destination == js.compilerPC)
if (js.op->branchIsIdleLoop)
{
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(CoreTiming::Idle);
ABI_PopRegistersAndAdjustStack({}, 0);
MOV(32, PPCSTATE(pc), Imm32(destination));
MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo));
WriteExceptionExit();
return;
}
WriteExit(destination, inst.LK, js.compilerPC + 4);
else
{
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
}
}

// TODO - optimize to hell and beyond
Expand Down Expand Up @@ -154,15 +151,21 @@ void Jit64::bcx(UGeckoInstruction inst)
return;
}

u32 destination;
if (inst.AA)
destination = SignExt16(inst.BD << 2);
else
destination = js.compilerPC + SignExt16(inst.BD << 2);

gpr.Flush(RegCache::FlushMode::MaintainState);
fpr.Flush(RegCache::FlushMode::MaintainState);
WriteExit(destination, inst.LK, js.compilerPC + 4);

if (js.op->branchIsIdleLoop)
{
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(CoreTiming::Idle);
ABI_PopRegistersAndAdjustStack({}, 0);
MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo));
WriteExceptionExit();
}
else
{
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
}

if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
SetJumpTarget(pConditionDontBranch);
Expand Down
12 changes: 11 additions & 1 deletion Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "Common/CommonTypes.h"
#include "Common/MathUtil.h"
#include "Common/x64Emitter.h"
#include "Core/CoreTiming.h"
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/JitRegCache.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
Expand Down Expand Up @@ -372,7 +373,16 @@ void Jit64::DoMergedBranch()
// Code that handles successful PPC branching.
const UGeckoInstruction& next = js.op[1].inst;
const u32 nextPC = js.op[1].address;
if (next.OPCD == 16) // bcx

if (js.op[1].branchIsIdleLoop)
{
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(CoreTiming::Idle);
ABI_PopRegistersAndAdjustStack({}, 0);
MOV(32, PPCSTATE(pc), Imm32(js.op[1].branchTo));
WriteExceptionExit();
}
else if (next.OPCD == 16) // bcx
{
if (next.LK)
MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4));
Expand Down
33 changes: 0 additions & 33 deletions Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,39 +119,6 @@ void Jit64::lXXx(UGeckoInstruction inst)
signExtend = true;
}

if (!CPU::IsStepping() && inst.OPCD == 32 && CanMergeNextInstructions(2) &&
(inst.hex & 0xFFFF0000) == 0x800D0000 &&
(js.op[1].inst.hex == 0x28000000 ||
(SConfig::GetInstance().bWii && js.op[1].inst.hex == 0x2C000000)) &&
js.op[2].inst.hex == 0x4182fff8)
{
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.BindToRegister(a, true, false);
gpr.BindToRegister(d, false, true);
SafeLoadToReg(gpr.RX(d), gpr.R(a), accessSize, offset, CallerSavedRegistersInUse(), signExtend);

// if it's still 0, we can wait until the next event
TEST(32, gpr.R(d), gpr.R(d));
FixupBranch noIdle = J_CC(CC_NZ);

BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0);

ABI_CallFunction(CoreTiming::Idle);

ABI_PopRegistersAndAdjustStack(registersInUse, 0);

// ! we must continue executing of the loop after exception handling, maybe there is still 0 in
// r0
// MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
WriteExceptionExit();

SetJumpTarget(noIdle);

// js.compilerPC += 8;
return;
}

// Determine whether this instruction updates inst.RA
bool update;
if (inst.OPCD == 31)
Expand Down
35 changes: 19 additions & 16 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITBranchOff);

u32 destination;
if (inst.AA)
destination = SignExt26(inst.LI << 2);
else
destination = js.compilerPC + SignExt26(inst.LI << 2);

if (inst.LK)
{
ARM64Reg WA = gpr.GetReg();
Expand All @@ -105,7 +99,7 @@ void JitArm64::bx(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);

if (destination == js.compilerPC)
if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
ARM64Reg WA = gpr.GetReg();
Expand All @@ -115,11 +109,11 @@ void JitArm64::bx(UGeckoInstruction inst)
BLR(XA);
gpr.Unlock(WA);

WriteExceptionExit(js.compilerPC);
WriteExceptionExit(js.op->branchTo);
return;
}

WriteExit(destination, inst.LK, js.compilerPC + 4);
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
}

void JitArm64::bcx(UGeckoInstruction inst)
Expand Down Expand Up @@ -160,16 +154,25 @@ void JitArm64::bcx(UGeckoInstruction inst)
}
gpr.Unlock(WA);

u32 destination;
if (inst.AA)
destination = SignExt16(inst.BD << 2);
else
destination = js.compilerPC + SignExt16(inst.BD << 2);

gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);

WriteExit(destination, inst.LK, js.compilerPC + 4);
if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);

MOVP2R(XA, &CoreTiming::Idle);
BLR(XA);
gpr.Unlock(WA);

WriteExceptionExit(js.op->branchTo);
}
else
{
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
}

SwitchToNearCode();

Expand Down
31 changes: 0 additions & 31 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,37 +346,6 @@ void JitArm64::lXX(UGeckoInstruction inst)
}

SafeLoadToReg(d, update ? a : (a ? a : -1), offsetReg, flags, offset, update);

// LWZ idle skipping
if (inst.OPCD == 32 && CanMergeNextInstructions(2) &&
(inst.hex & 0xFFFF0000) == 0x800D0000 && // lwz r0, XXXX(r13)
(js.op[1].inst.hex == 0x28000000 ||
(SConfig::GetInstance().bWii && js.op[1].inst.hex == 0x2C000000)) && // cmpXwi r0,0
js.op[2].inst.hex == 0x4182fff8) // beq -8
{
ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);

// if it's still 0, we can wait until the next event
FixupBranch noIdle = CBNZ(gpr.R(d));

FixupBranch far = B();
SwitchToFarCode();
SetJumpTarget(far);

gpr.Flush(FLUSH_MAINTAIN_STATE);
fpr.Flush(FLUSH_MAINTAIN_STATE);

MOVP2R(XA, &CoreTiming::Idle);
BLR(XA);
gpr.Unlock(WA);

WriteExceptionExit(js.compilerPC);

SwitchToNearCode();

SetJumpTarget(noIdle);
}
}

void JitArm64::stX(UGeckoInstruction inst)
Expand Down
96 changes: 89 additions & 7 deletions Source/Core/Core/PowerPC/PPCAnalyst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,90 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, const Gekk
code->outputCR0 = true;
code->outputCR1 = true;
}

code->branchUsesCtr = false;
code->branchTo = UINT32_MAX;

// For branch with immediate addresses (bx/bcx), compute the destination.
if (code->inst.OPCD == 18) // bx
{
if (code->inst.AA) // absolute
code->branchTo = SignExt26(code->inst.LI << 2);
else
code->branchTo = code->address + SignExt26(code->inst.LI << 2);
}
else if (code->inst.OPCD == 16) // bcx
{
if (code->inst.AA) // absolute
code->branchTo = SignExt16(code->inst.BD << 2);
else
code->branchTo = code->address + SignExt16(code->inst.BD << 2);
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
code->branchUsesCtr = true;
}
else if (code->inst.OPCD == 19 && code->inst.SUBOP10 == 16) // bclrx
{
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
code->branchUsesCtr = true;
}
else if (code->inst.OPCD == 19 && code->inst.SUBOP10 == 528) // bcctrx
{
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
code->branchUsesCtr = true;
}
}

// Heuristic check for a busy-wait (idle) loop in code[0..instructions].
//
// Returns true only if every op in the range passes the checks below and the
// op at index `instructions` is a branch whose target is block->m_address:
//  * No branch in the range uses CTR. Earlier branches that do not use CTR
//    are stepped over without any register tracking.
//  * Every non-branch op is of type Integer or Load; any other type (which
//    includes stores) rejects the loop. More types may be supported later.
//  * No op writes a register that was read, earlier in the scan, before the
//    scan had seen a write to it. Such a write would make the next iteration
//    observe a different input value.
//
// Known limitation: many frequently used busy loops are DSP register polls of
// the form bl/cmp/bne, where the bl target is a pure function obeying the
// rules above. Detecting those would need basic inlining support, so they are
// not recognized at the moment.
bool PPCAnalyzer::IsBusyWaitLoop(CodeBlock* block, CodeOp* code, size_t instructions)
{
  // Registers consumed before the scan saw any write to them.
  std::bitset<32> live_in_regs;
  // Registers the scan has already seen written.
  std::bitset<32> defined_regs;

  for (size_t idx = 0; idx <= instructions; ++idx)
  {
    CodeOp& op = code[idx];
    const auto kind = op.opinfo->type;

    if (kind == OpType::Branch)
    {
      // CTR-based branches change state on every iteration.
      if (op.branchUsesCtr)
        return false;

      // Only the final op of the range may close the loop.
      if (idx == instructions && op.branchTo == block->m_address)
        return true;

      continue;
    }

    const bool supported = kind == OpType::Integer || kind == OpType::Load;
    if (!supported)
      return false;

    // Inputs not produced earlier in the scan must stay untouched from now on.
    for (int reg : op.regsIn)
    {
      if (reg == -1 || defined_regs[reg])
        continue;
      live_in_regs[reg] = true;
    }

    // Outputs must not overwrite one of those protected inputs.
    for (int reg : op.regsOut)
    {
      if (reg == -1)
        continue;
      if (live_in_regs[reg])
        return false;
      defined_regs[reg] = true;
    }
  }

  return false;
}

u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std::size_t block_size)
Expand Down Expand Up @@ -692,16 +776,16 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
code[i].opinfo = opinfo;
code[i].address = address;
code[i].inst = inst;
code[i].branchTo = UINT32_MAX;
code[i].branchToIndex = UINT32_MAX;
code[i].skip = false;
block->m_stats->numCycles += opinfo->numCycles;
block->m_physical_addresses.insert(result.physical_address);

SetInstructionStats(block, &code[i], opinfo, static_cast<u32>(i));

code[i].branchIsIdleLoop =
code[i].branchTo == block->m_address && IsBusyWaitLoop(block, code, i);

bool follow = false;
u32 destination = 0;

bool conditional_continue = false;

Expand All @@ -715,7 +799,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
{
// Always follow BX instructions.
follow = true;
destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
if (inst.LK)
{
found_call = true;
Expand All @@ -727,7 +810,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
{
// Always follow unconditional BCX instructions, but they are very rare.
follow = true;
destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
if (inst.LK)
{
found_call = true;
Expand All @@ -744,7 +826,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
// the LR value on the stack as there are no spare registers. So we'd need
// to check all store instruction to not alias with the stack.
follow = true;
destination = code[caller].address + 4;
code[i].branchTo = code[caller].address + 4;
found_call = false;
code[i].skip = true;

Expand Down Expand Up @@ -796,7 +878,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
{
// Follow the unconditional branch.
numFollows++;
address = destination;
address = code[i].branchTo;
}
else
{
Expand Down
Loading

0 comments on commit a130b9a

Please sign in to comment.