@@ -640,6 +640,90 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, const Gekk
code->outputCR0 = true;
code->outputCR1 = true;
}

code->branchUsesCtr = false;
code->branchTo = UINT32_MAX;

// For branch with immediate addresses (bx/bcx), compute the destination.
if (code->inst.OPCD == 18) // bx
{
if (code->inst.AA) // absolute
code->branchTo = SignExt26(code->inst.LI << 2);
else
code->branchTo = code->address + SignExt26(code->inst.LI << 2);
}
else if (code->inst.OPCD == 16) // bcx
{
if (code->inst.AA) // absolute
code->branchTo = SignExt16(code->inst.BD << 2);
else
code->branchTo = code->address + SignExt16(code->inst.BD << 2);
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
code->branchUsesCtr = true;
}
else if (code->inst.OPCD == 19 && code->inst.SUBOP10 == 16) // bclrx
{
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
code->branchUsesCtr = true;
}
else if (code->inst.OPCD == 19 && code->inst.SUBOP10 == 528) // bcctrx
{
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
code->branchUsesCtr = true;
}
}

bool PPCAnalyzer::IsBusyWaitLoop(CodeBlock* block, CodeOp* code, size_t instructions)
{
// Very basic algorithm to detect busy wait loops:
// * It loops to itself and does not contain any other branches.
// * It does not write to memory.
// * It only reads from registers it wrote to earlier in the loop, or it
// does not write to these registers.
//
// Would benefit a lot from basic inlining support - a lot of the most
// used busy loops are DSP register interactions, which are bl/cmp/bne
// (with the bl target a pure function that follows the above rules). We
// don't detect these at the moment.
std::bitset<32> write_disallowed_regs;
std::bitset<32> written_regs;
for (size_t i = 0; i <= instructions; ++i)
{
if (code[i].opinfo->type == OpType::Branch)
{
if (code[i].branchUsesCtr)
return false;
if (code[i].branchTo == block->m_address && i == instructions)
return true;
}
else if (code[i].opinfo->type != OpType::Integer && code[i].opinfo->type != OpType::Load)
{
// In the future, some subsets of other instruction types might get
// supported. Right now, only try loops that have this very
// restricted instruction set.
return false;
}
else
{
for (int reg : code[i].regsIn)
{
if (reg == -1)
continue;
if (written_regs[reg])
continue;
write_disallowed_regs[reg] = true;
}
for (int reg : code[i].regsOut)
{
if (reg == -1)
continue;
if (write_disallowed_regs[reg])
return false;
written_regs[reg] = true;
}
}
}
return false;
}

u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std::size_t block_size)
@@ -692,30 +776,26 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
code[i].opinfo = opinfo;
code[i].address = address;
code[i].inst = inst;
code[i].branchTo = UINT32_MAX;
code[i].branchToIndex = UINT32_MAX;
code[i].skip = false;
block->m_stats->numCycles += opinfo->numCycles;
block->m_physical_addresses.insert(result.physical_address);

SetInstructionStats(block, &code[i], opinfo, static_cast<u32>(i));

bool follow = false;
u32 destination = 0;

bool conditional_continue = false;

// TODO: Find the optimal value for BRANCH_FOLLOWING_THRESHOLD.
// If it is small, the performance will be down.
// If it is big, the size of generated code will be big and
// cache clearning will happen many times.
if (enable_follow && HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD)
if (enable_follow && HasOption(OPTION_BRANCH_FOLLOW))
{
if (inst.OPCD == 18 && block_size > 1)
{
// Always follow BX instructions.
follow = true;
destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
if (inst.LK)
{
found_call = true;
@@ -727,29 +807,31 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
{
// Always follow unconditional BCX instructions, but they are very rare.
follow = true;
destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
if (inst.LK)
{
found_call = true;
caller = i;
}
}
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call &&
(inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call)
{
// bclrx with unconditional branch = return
// Follow it if we can propagate the LR value of the last CALL instruction.
// Through it would be easy to track the upper level of call/return,
// we can't guarantee the LR value. The PPC ABI forces all functions to push
// the LR value on the stack as there are no spare registers. So we'd need
// to check all store instruction to not alias with the stack.
follow = true;
destination = code[caller].address + 4;
found_call = false;
code[i].skip = true;
code[i].branchTo = code[caller].address + 4;
if ((inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION) &&
numFollows < BRANCH_FOLLOWING_THRESHOLD)
{
// bclrx with unconditional branch = return
// Follow it if we can propagate the LR value of the last CALL instruction.
// Through it would be easy to track the upper level of call/return,
// we can't guarantee the LR value. The PPC ABI forces all functions to push
// the LR value on the stack as there are no spare registers. So we'd need
// to check all store instruction to not alias with the stack.
follow = true;
found_call = false;
code[i].skip = true;

// Skip the RET, so also don't generate the stack entry for the BLR optimization.
code[caller].skipLRStack = true;
// Skip the RET, so also don't generate the stack entry for the BLR optimization.
code[caller].skipLRStack = true;
}
}
else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
{
@@ -792,11 +874,14 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
}
}

if (follow)
code[i].branchIsIdleLoop =
code[i].branchTo == block->m_address && IsBusyWaitLoop(block, code, i);

if (follow && numFollows < BRANCH_FOLLOWING_THRESHOLD)
{
// Follow the unconditional branch.
numFollows++;
address = destination;
address = code[i].branchTo;
}
else
{
@@ -27,13 +27,14 @@ struct CodeOp // 16B
UGeckoInstruction inst;
GekkoOPInfo* opinfo;
u32 address;
u32 branchTo; // if 0, not a branch
int branchToIndex; // index of target block
u32 branchTo; // if UINT32_MAX, not a branch
BitSet32 regsOut;
BitSet32 regsIn;
BitSet32 fregsIn;
s8 fregOut;
bool isBranchTarget;
bool branchUsesCtr;
bool branchIsIdleLoop;
bool wantsCR0;
bool wantsCR1;
bool wantsFPRF;
@@ -213,6 +214,7 @@ class PPCAnalyzer
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
void ReorderInstructions(u32 instructions, CodeOp* code);
void SetInstructionStats(CodeBlock* block, CodeOp* code, const GekkoOPInfo* opinfo, u32 index);
bool IsBusyWaitLoop(CodeBlock* block, CodeOp* code, size_t instructions);

// Options
u32 m_options = 0;