Skip to content

Commit

Permalink
Merge pull request #7287 from degasus/idle_skipping
Browse files Browse the repository at this point in the history
Jit64 / JitArm64: Optimized idle skipping detection.
  • Loading branch information
degasus committed Apr 22, 2019
2 parents e575fa9 + 6ec4ade commit 2abe333
Show file tree
Hide file tree
Showing 12 changed files with 202 additions and 155 deletions.
12 changes: 12 additions & 0 deletions Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp
Expand Up @@ -181,6 +181,15 @@ static bool CheckBreakpoint(u32 data)
return false;
}

// Cached-interpreter callback for idle-loop branches.
//
// idle_pc: the address to compare the next PC against. In the visible Jit()
//          hunk this callback is registered with js.blockStart for ops whose
//          branchIsIdleLoop flag is set.
//
// Calls CoreTiming::Idle() when the next PC (ppcState.npc) equals idle_pc.
// Always returns false.
// NOTE(review): presumably `false` tells the dispatcher to keep executing the
// current block, as with the other callbacks in this file - confirm.
static bool CheckIdle(u32 idle_pc)
{
  const bool next_pc_is_idle_target = (PowerPC::ppcState.npc == idle_pc);

  if (next_pc_is_idle_target)
    CoreTiming::Idle();

  return false;
}

bool CachedInterpreter::HandleFunctionHooking(u32 address)
{
return HLE::ReplaceFunctionIfPossible(address, [&](u32 function, HLE::HookType type) {
Expand Down Expand Up @@ -242,6 +251,7 @@ void CachedInterpreter::Jit(u32 address)
const bool check_fpu = (op.opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound;
const bool endblock = (op.opinfo->flags & FL_ENDBLOCK) != 0;
const bool memcheck = (op.opinfo->flags & FL_LOADSTORE) && jo.memcheck;
const bool idle_loop = op.branchIsIdleLoop;

if (breakpoint)
{
Expand All @@ -261,6 +271,8 @@ void CachedInterpreter::Jit(u32 address)
m_code.emplace_back(PPCTables::GetInterpreterOp(op.inst), op.inst);
if (memcheck)
m_code.emplace_back(CheckDSI, js.downcountAmount);
if (idle_loop)
m_code.emplace_back(CheckIdle, js.blockStart);
if (endblock)
m_code.emplace_back(EndBlock, js.downcountAmount);
}
Expand Down
24 changes: 0 additions & 24 deletions Source/Core/Core/PowerPC/Interpreter/Interpreter_Branch.cpp
Expand Up @@ -5,7 +5,6 @@
#include "Common/Assert.h"
#include "Common/CommonTypes.h"
#include "Core/ConfigManager.h"
#include "Core/CoreTiming.h"
#include "Core/HLE/HLE.h"
#include "Core/PowerPC/Interpreter/ExceptionUtils.h"
#include "Core/PowerPC/Interpreter/Interpreter.h"
Expand All @@ -23,11 +22,6 @@ void Interpreter::bx(UGeckoInstruction inst)
NPC = PC + SignExt26(inst.LI << 2);

m_end_block = true;

if (NPC == PC)
{
CoreTiming::Idle();
}
}

// bcx - ugly, straight from PPC manual equations :)
Expand Down Expand Up @@ -56,24 +50,6 @@ void Interpreter::bcx(UGeckoInstruction inst)
}

m_end_block = true;

// this code tries to detect the most common idle loop:
// lwz r0, XXXX(r13)
// cmpXwi r0,0
// beq -8
if (NPC == PC - 8 && inst.hex == 0x4182fff8 /* beq */)
{
if (PowerPC::HostRead_U32(PC - 8) >> 16 == 0x800D /* lwz */)
{
u32 last_inst = PowerPC::HostRead_U32(PC - 4);

if (last_inst == 0x28000000 /* cmplwi */ ||
(last_inst == 0x2C000000 /* cmpwi */ && SConfig::GetInstance().bWii))
{
CoreTiming::Idle();
}
}
}
}

void Interpreter::bcctrx(UGeckoInstruction inst)
Expand Down
4 changes: 2 additions & 2 deletions Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
Expand Up @@ -29,8 +29,8 @@ static std::array<GekkoOPTemplate, 54> primarytable =
{59, Interpreter::RunTable59, {"RunTable59", OpType::Subtable, 0, 0, 0, 0, 0}},
{63, Interpreter::RunTable63, {"RunTable63", OpType::Subtable, 0, 0, 0, 0, 0}},

{16, Interpreter::bcx, {"bcx", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
{18, Interpreter::bx, {"bx", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
{16, Interpreter::bcx, {"bcx", OpType::Branch, FL_ENDBLOCK, 1, 0, 0, 0}},
{18, Interpreter::bx, {"bx", OpType::Branch, FL_ENDBLOCK, 1, 0, 0, 0}},

{3, Interpreter::twi, {"twi", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
{17, Interpreter::sc, {"sc", OpType::System, FL_ENDBLOCK, 2, 0, 0, 0}},
Expand Down
9 changes: 9 additions & 0 deletions Source/Core/Core/PowerPC/Jit64/Jit.cpp
Expand Up @@ -646,6 +646,15 @@ void Jit64::WriteRfiExitDestInRSCRATCH()
JMP(asm_routines.dispatcher, true);
}

// Emits the block exit used when a branch has been identified as an idle loop.
//
// destination: guest address written to PPCSTATE(pc) before leaving the block.
//
// The emitted code calls CoreTiming::Idle(), stores `destination` as the guest
// PC, and then leaves through WriteExceptionExit(). The order of the emitter
// calls below is the order of the generated code, so it must not be changed.
void Jit64::WriteIdleExit(u32 destination)
{
// An empty register set is pushed/popped around the call.
// NOTE(review): presumably callers have already flushed gpr/fpr so no
// caller-saved guest registers are live here - confirm for every call site.
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(CoreTiming::Idle);
ABI_PopRegistersAndAdjustStack({}, 0);
// Set the guest PC to the branch target before taking the exit.
MOV(32, PPCSTATE(pc), Imm32(destination));
WriteExceptionExit();
}

void Jit64::WriteExceptionExit()
{
Cleanup();
Expand Down
1 change: 1 addition & 0 deletions Source/Core/Core/PowerPC/Jit64/Jit.h
Expand Up @@ -84,6 +84,7 @@ class Jit64 : public JitBase, public QuantizedMemoryRoutines
void WriteExceptionExit();
void WriteExternalExceptionExit();
void WriteRfiExitDestInRSCRATCH();
void WriteIdleExit(u32 destination);
bool Cleanup();

void GenerateConstantOverflow(bool overflow);
Expand Down
45 changes: 24 additions & 21 deletions Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
Expand Up @@ -87,25 +87,18 @@ void Jit64::bx(UGeckoInstruction inst)
gpr.Flush();
fpr.Flush();

u32 destination;
if (inst.AA)
destination = SignExt26(inst.LI << 2);
else
destination = js.compilerPC + SignExt26(inst.LI << 2);
#ifdef ACID_TEST
if (inst.LK)
AND(32, PPCSTATE(cr), Imm32(~(0xFF000000)));
#endif
if (destination == js.compilerPC)
if (js.op->branchIsIdleLoop)
{
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(CoreTiming::Idle);
ABI_PopRegistersAndAdjustStack({}, 0);
MOV(32, PPCSTATE(pc), Imm32(destination));
WriteExceptionExit();
return;
WriteIdleExit(js.op->branchTo);
}
else
{
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
}
WriteExit(destination, inst.LK, js.compilerPC + 4);
}

// TODO - optimize to hell and beyond
Expand Down Expand Up @@ -154,18 +147,20 @@ void Jit64::bcx(UGeckoInstruction inst)
return;
}

u32 destination;
if (inst.AA)
destination = SignExt16(inst.BD << 2);
else
destination = js.compilerPC + SignExt16(inst.BD << 2);

{
RCForkGuard gpr_guard = gpr.Fork();
RCForkGuard fpr_guard = fpr.Fork();
gpr.Flush();
fpr.Flush();
WriteExit(destination, inst.LK, js.compilerPC + 4);

if (js.op->branchIsIdleLoop)
{
WriteIdleExit(js.op->branchTo);
}
else
{
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
}
}

if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
Expand Down Expand Up @@ -282,7 +277,15 @@ void Jit64::bclrx(UGeckoInstruction inst)
RCForkGuard fpr_guard = fpr.Fork();
gpr.Flush();
fpr.Flush();
WriteBLRExit();

if (js.op->branchIsIdleLoop)
{
WriteIdleExit(js.op->branchTo);
}
else
{
WriteBLRExit();
}
}

if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
Expand Down
11 changes: 10 additions & 1 deletion Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
Expand Up @@ -12,6 +12,7 @@
#include "Common/CommonTypes.h"
#include "Common/MathUtil.h"
#include "Common/x64Emitter.h"
#include "Core/CoreTiming.h"
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/RegCache/JitRegCache.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
Expand Down Expand Up @@ -361,7 +362,15 @@ void Jit64::DoMergedBranch()
// Code that handles successful PPC branching.
const UGeckoInstruction& next = js.op[1].inst;
const u32 nextPC = js.op[1].address;
if (next.OPCD == 16) // bcx

if (js.op[1].branchIsIdleLoop)
{
if (next.LK)
MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4));

WriteIdleExit(js.op[1].branchTo);
}
else if (next.OPCD == 16) // bcx
{
if (next.LK)
MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4));
Expand Down
35 changes: 0 additions & 35 deletions Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
Expand Up @@ -119,41 +119,6 @@ void Jit64::lXXx(UGeckoInstruction inst)
signExtend = true;
}

if (!CPU::IsStepping() && inst.OPCD == 32 && CanMergeNextInstructions(2) &&
(inst.hex & 0xFFFF0000) == 0x800D0000 &&
(js.op[1].inst.hex == 0x28000000 ||
(SConfig::GetInstance().bWii && js.op[1].inst.hex == 0x2C000000)) &&
js.op[2].inst.hex == 0x4182fff8)
{
s32 offset = (s32)(s16)inst.SIMM_16;
RCX64Reg Ra = gpr.Bind(a, RCMode::Read);
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd);

SafeLoadToReg(Rd, Ra, accessSize, offset, CallerSavedRegistersInUse(), signExtend);

// if it's still 0, we can wait until the next event
TEST(32, Rd, Rd);
FixupBranch noIdle = J_CC(CC_NZ);

BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0);

ABI_CallFunction(CoreTiming::Idle);

ABI_PopRegistersAndAdjustStack(registersInUse, 0);

// ! We must continue executing the loop after exception handling; there may still be 0 in
// r0
// MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
WriteExceptionExit();

SetJumpTarget(noIdle);

// js.compilerPC += 8;
return;
}

// Determine whether this instruction updates inst.RA
bool update;
if (inst.OPCD == 31)
Expand Down
50 changes: 33 additions & 17 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp
Expand Up @@ -76,12 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITBranchOff);

u32 destination;
if (inst.AA)
destination = SignExt26(inst.LI << 2);
else
destination = js.compilerPC + SignExt26(inst.LI << 2);

if (inst.LK)
{
ARM64Reg WA = gpr.GetReg();
Expand All @@ -105,7 +99,7 @@ void JitArm64::bx(UGeckoInstruction inst)
gpr.Flush(FlushMode::FLUSH_ALL);
fpr.Flush(FlushMode::FLUSH_ALL);

if (destination == js.compilerPC)
if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
ARM64Reg WA = gpr.GetReg();
Expand All @@ -115,11 +109,11 @@ void JitArm64::bx(UGeckoInstruction inst)
BLR(XA);
gpr.Unlock(WA);

WriteExceptionExit(js.compilerPC);
WriteExceptionExit(js.op->branchTo);
return;
}

WriteExit(destination, inst.LK, js.compilerPC + 4);
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
}

void JitArm64::bcx(UGeckoInstruction inst)
Expand Down Expand Up @@ -160,16 +154,25 @@ void JitArm64::bcx(UGeckoInstruction inst)
}
gpr.Unlock(WA);

u32 destination;
if (inst.AA)
destination = SignExt16(inst.BD << 2);
else
destination = js.compilerPC + SignExt16(inst.BD << 2);

gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);

WriteExit(destination, inst.LK, js.compilerPC + 4);
if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);

MOVP2R(XA, &CoreTiming::Idle);
BLR(XA);
gpr.Unlock(WA);

WriteExceptionExit(js.op->branchTo);
}
else
{
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
}

SwitchToNearCode();

Expand Down Expand Up @@ -275,7 +278,20 @@ void JitArm64::bclrx(UGeckoInstruction inst)
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);

WriteBLRExit(WA);
if (js.op->branchIsIdleLoop)
{
// make idle loops go faster
ARM64Reg XA = EncodeRegTo64(WA);

MOVP2R(XA, &CoreTiming::Idle);
BLR(XA);

WriteExceptionExit(js.op->branchTo);
}
else
{
WriteBLRExit(WA);
}

gpr.Unlock(WA);

Expand Down
31 changes: 0 additions & 31 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
Expand Up @@ -346,37 +346,6 @@ void JitArm64::lXX(UGeckoInstruction inst)
}

SafeLoadToReg(d, update ? a : (a ? a : -1), offsetReg, flags, offset, update);

// LWZ idle skipping
if (inst.OPCD == 32 && CanMergeNextInstructions(2) &&
(inst.hex & 0xFFFF0000) == 0x800D0000 && // lwz r0, XXXX(r13)
(js.op[1].inst.hex == 0x28000000 ||
(SConfig::GetInstance().bWii && js.op[1].inst.hex == 0x2C000000)) && // cmpXwi r0,0
js.op[2].inst.hex == 0x4182fff8) // beq -8
{
ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);

// if it's still 0, we can wait until the next event
FixupBranch noIdle = CBNZ(gpr.R(d));

FixupBranch far = B();
SwitchToFarCode();
SetJumpTarget(far);

gpr.Flush(FLUSH_MAINTAIN_STATE);
fpr.Flush(FLUSH_MAINTAIN_STATE);

MOVP2R(XA, &CoreTiming::Idle);
BLR(XA);
gpr.Unlock(WA);

WriteExceptionExit(js.compilerPC);

SwitchToNearCode();

SetJumpTarget(noIdle);
}
}

void JitArm64::stX(UGeckoInstruction inst)
Expand Down

0 comments on commit 2abe333

Please sign in to comment.