69 changes: 32 additions & 37 deletions Source/Core/Core/PowerPC/JitArm32/Jit.cpp
Expand Up @@ -40,6 +40,11 @@ void JitArm::Init()
fpr.Init(this);
jo.enableBlocklink = true;
jo.optimizeGatherPipe = true;

code_block.m_stats = &js.st;
code_block.m_gpa = &js.gpa;
code_block.m_fpa = &js.fpa;
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
}

void JitArm::ClearCache()
Expand Down Expand Up @@ -250,27 +255,27 @@ void JitArm::Trace()
// Logs debugging information about the instruction currently being compiled
// to the DYNA_REC log.
//
// inst:  the PowerPC instruction to describe.
// level: verbosity. 0 logs nothing; > 0 logs the current code pointer and the
//        instruction name; > 1 additionally lists the register fields the
//        instruction's flags mark as outputs and inputs.
//
// Every message goes through WARN_LOG exactly once, and every conditional is
// braced so that each log statement stays under the condition that guards it.
void JitArm::PrintDebug(UGeckoInstruction inst, u32 level)
{
	if (level > 0)
	{
		WARN_LOG(DYNA_REC, "Start: %08x OP '%s' Info", (u32)GetCodePtr(), PPCTables::GetInstructionName(inst));
	}

	if (level > 1)
	{
		GekkoOPInfo* Info = GetOpInfo(inst.hex);

		WARN_LOG(DYNA_REC, "\tOuts");
		if (Info->flags & FL_OUT_A)
		{
			WARN_LOG(DYNA_REC, "\t-OUT_A: %x", inst.RA);
		}
		if (Info->flags & FL_OUT_D)
		{
			WARN_LOG(DYNA_REC, "\t-OUT_D: %x", inst.RD);
		}

		WARN_LOG(DYNA_REC, "\tIns");
		// A, AO, B, C, S
		if (Info->flags & FL_IN_A)
		{
			WARN_LOG(DYNA_REC, "\t-IN_A: %x", inst.RA);
		}
		if (Info->flags & FL_IN_A0)
		{
			WARN_LOG(DYNA_REC, "\t-IN_A0: %x", inst.RA);
		}
		if (Info->flags & FL_IN_B)
		{
			WARN_LOG(DYNA_REC, "\t-IN_B: %x", inst.RB);
		}
		if (Info->flags & FL_IN_C)
		{
			WARN_LOG(DYNA_REC, "\t-IN_C: %x", inst.RC);
		}
		if (Info->flags & FL_IN_S)
		{
			WARN_LOG(DYNA_REC, "\t-IN_S: %x", inst.RS);
		}
	}
}

Expand Down Expand Up @@ -298,14 +303,10 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
// Memory exception on instruction fetch
bool memory_exception = false;

// A broken block is a block that does not end in a branch
bool broken_block = false;

if (Core::g_CoreStartupParameter.bEnableDebugging)
{
// Comment out the following to disable breakpoints (speed-up)
blockSize = 1;
broken_block = true;
Trace();
}

Expand All @@ -324,26 +325,19 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
}
}


int size = 0;
js.isLastInstruction = false;
js.blockStart = em_address;
js.fifoBytesThisBlock = 0;
js.curBlock = b;
js.block_flags = 0;
js.cancel = false;

u32 nextPC = em_address;
// Analyze the block, collect all instructions it is made of (including inlining,
// if that is enabled), reorder instructions for optimal performance, and join joinable instructions.
u32 nextPC = em_address;
u32 merged_addresses[32];
const int capacity_of_merged_addresses = sizeof(merged_addresses) / sizeof(merged_addresses[0]);
int size_of_merged_addresses = 0;
if (!memory_exception)
{
// If there is a memory exception inside a block (broken_block==true), compile up to that instruction.
nextPC = PPCAnalyst::Flatten(em_address, &size, &js.st, &js.gpa, &js.fpa, broken_block, code_buf, blockSize, merged_addresses, capacity_of_merged_addresses, size_of_merged_addresses);
}
nextPC = analyzer.Analyze(em_address, &code_block, code_buf, blockSize);

PPCAnalyst::CodeOp *ops = code_buf->codebuffer;

const u8 *start = GetCodePtr();
Expand Down Expand Up @@ -398,28 +392,29 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
gpr.Start(js.gpa);
fpr.Start(js.fpa);
js.downcountAmount = 0;

if (!Core::g_CoreStartupParameter.bEnableDebugging)
{
for (int i = 0; i < size_of_merged_addresses; ++i)
{
const u32 address = merged_addresses[i];
js.downcountAmount += PatchEngine::GetSpeedhackCycles(address);
}
}
js.downcountAmount += PatchEngine::GetSpeedhackCycles(em_address);

js.skipnext = false;
js.blockSize = size;
js.blockSize = code_block.m_num_instructions;
js.compilerPC = nextPC;

const int DEBUG_OUTPUT = 0;

if (DEBUG_OUTPUT)
WARN_LOG(DYNA_REC, "-------0x%08x-------", em_address);

// Translate instructions
for (int i = 0; i < (int)size; i++)
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{
js.compilerPC = ops[i].address;
js.op = &ops[i];
js.instructionNumber = i;
const GekkoOPInfo *opinfo = ops[i].opinfo;
js.downcountAmount += opinfo->numCycles;

if (i == (int)size - 1)
if (i == (code_block.m_num_instructions - 1))
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
Expand Down Expand Up @@ -468,7 +463,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
}
if (!ops[i].skip)
{
PrintDebug(ops[i].inst, 0);
PrintDebug(ops[i].inst, DEBUG_OUTPUT);
if (js.memcheck && (opinfo->flags & FL_USE_FPU))
{
// Don't do this yet
Expand All @@ -485,15 +480,15 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
}
if (memory_exception)
BKPT(0x500);
if (broken_block)
if (code_block.m_broken)
{
printf("Broken Block going to 0x%08x\n", nextPC);
WriteExit(nextPC);
}

b->flags = js.block_flags;
b->codeSize = (u32)(GetCodePtr() - normalEntry);
b->originalSize = size;
b->originalSize = code_block.m_num_instructions;
FlushIcache();
return start;
}
Expand Down
1 change: 1 addition & 0 deletions Source/Core/Core/PowerPC/JitArm32/Jit.h
Expand Up @@ -42,6 +42,7 @@ class JitArm : public JitBase, public ArmGen::ARMCodeBlock
ArmFPRCache fpr;

PPCAnalyst::CodeBuffer code_buffer;

void DoDownCount();

void PrintDebug(UGeckoInstruction inst, u32 level);
Expand Down
25 changes: 9 additions & 16 deletions Source/Core/Core/PowerPC/JitArm32/JitArm_Branch.cpp
Expand Up @@ -149,7 +149,6 @@ void JitArm::bcx(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITBranchOff)
// USES_CR
_assert_msg_(DYNA_REC, js.isLastInstruction, "bcx not last instruction of block");

gpr.Flush();
fpr.Flush();
Expand Down Expand Up @@ -203,7 +202,8 @@ void JitArm::bcx(UGeckoInstruction inst)
if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0)
SetJumpTarget( pCTRDontBranch );

WriteExit(js.compilerPC + 4);
if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
WriteExit(js.compilerPC + 4);
}
void JitArm::bcctrx(UGeckoInstruction inst)
{
Expand Down Expand Up @@ -265,25 +265,16 @@ void JitArm::bcctrx(UGeckoInstruction inst)
WriteExitDestInR(rA);

SetJumpTarget(b);
WriteExit(js.compilerPC + 4);

if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
WriteExit(js.compilerPC + 4);
}
}
void JitArm::bclrx(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITBranchOff)
if (!js.isLastInstruction &&
(inst.BO & (1 << 4)) && (inst.BO & (1 << 2))) {
if (inst.LK)
{
ARMReg rA = gpr.GetReg(false);
u32 Jumpto = js.compilerPC + 4;
MOVI2R(rA, Jumpto);
STR(rA, R9, PPCSTATE_OFF(spr[SPR_LR]));
// ARMABI_MOVI2M((u32)&LR, js.compilerPC + 4);
}
return;
}

gpr.Flush();
fpr.Flush();

Expand Down Expand Up @@ -342,5 +333,7 @@ void JitArm::bclrx(UGeckoInstruction inst)
SetJumpTarget( pConditionDontBranch );
if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0)
SetJumpTarget( pCTRDontBranch );
WriteExit(js.compilerPC + 4);

if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
WriteExit(js.compilerPC + 4);
}
4 changes: 3 additions & 1 deletion Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp
Expand Up @@ -1014,7 +1014,9 @@ void JitArm::twx(UGeckoInstruction inst)
SetJumpTarget(exit3);
SetJumpTarget(exit4);
SetJumpTarget(exit5);
WriteExit(js.compilerPC + 4);

if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
WriteExit(js.compilerPC + 4);

gpr.Unlock(RA, RB);
}
44 changes: 15 additions & 29 deletions Source/Core/Core/PowerPC/JitArmIL/JitIL.cpp
Expand Up @@ -34,6 +34,10 @@ void JitArmIL::Init()
AllocCodeSpace(CODE_SIZE);
blocks.Init();
asm_routines.Init();

code_block.m_stats = &js.st;
code_block.m_gpa = &js.gpa;
code_block.m_fpa = &js.fpa;
}

void JitArmIL::ClearCache()
Expand Down Expand Up @@ -202,14 +206,10 @@ const u8* JitArmIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
// Memory exception on instruction fetch
bool memory_exception = false;

// A broken block is a block that does not end in a branch
bool broken_block = false;

if (Core::g_CoreStartupParameter.bEnableDebugging)
{
// Comment out the following to disable breakpoints (speed-up)
blockSize = 1;
broken_block = true;
}

if (em_address == 0)
Expand All @@ -228,25 +228,19 @@ const u8* JitArmIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
}


int size = 0;
js.isLastInstruction = false;
js.blockStart = em_address;
js.fifoBytesThisBlock = 0;
js.curBlock = b;
js.block_flags = 0;
js.cancel = false;

u32 nextPC = em_address;
// Analyze the block, collect all instructions it is made of (including inlining,
// if that is enabled), reorder instructions for optimal performance, and join joinable instructions.
u32 nextPC = em_address;
u32 merged_addresses[32];
const int capacity_of_merged_addresses = sizeof(merged_addresses) / sizeof(merged_addresses[0]);
int size_of_merged_addresses = 0;
if (!memory_exception)
{
// If there is a memory exception inside a block (broken_block==true), compile up to that instruction.
nextPC = PPCAnalyst::Flatten(em_address, &size, &js.st, &js.gpa, &js.fpa, broken_block, code_buf, blockSize, merged_addresses, capacity_of_merged_addresses, size_of_merged_addresses);
}
nextPC = analyzer.Analyze(em_address, &code_block, code_buf, blockSize);

PPCAnalyst::CodeOp *ops = code_buf->codebuffer;

const u8 *start = GetCodePtr();
Expand All @@ -271,7 +265,7 @@ const u8* JitArmIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
u64 codeHash = -1;
{
// For profiling and IR Writer
for (int i = 0; i < (int)size; i++)
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{
const u64 inst = ops[i].inst.hex;
// Ported from boost::hash
Expand All @@ -289,27 +283,21 @@ const u8* JitArmIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB

js.downcountAmount = 0;
if (!Core::g_CoreStartupParameter.bEnableDebugging)
{
for (int i = 0; i < size_of_merged_addresses; ++i)
{
const u32 address = merged_addresses[i];
js.downcountAmount += PatchEngine::GetSpeedhackCycles(address);
}
}
js.downcountAmount += PatchEngine::GetSpeedhackCycles(em_address);

js.skipnext = false;
js.blockSize = size;
js.blockSize = code_block.m_num_instructions;
js.compilerPC = nextPC;
// Translate instructions
for (int i = 0; i < (int)size; i++)
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{
js.compilerPC = ops[i].address;
js.op = &ops[i];
js.instructionNumber = i;
const GekkoOPInfo *opinfo = ops[i].opinfo;
js.downcountAmount += opinfo->numCycles;

if (i == (int)size - 1)
if (i == (code_block.m_num_instructions - 1))
{
// WARNING - cmp->branch merging will screw this up.
js.isLastInstruction = true;
Expand Down Expand Up @@ -348,21 +336,19 @@ const u8* JitArmIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
}
if (memory_exception)
BKPT(0x500);
if (broken_block)

if (code_block.m_broken)
{
printf("Broken Block going to 0x%08x\n", nextPC);
WriteExit(nextPC);
}

// Perform actual code generation

WriteCode(nextPC);
b->flags = js.block_flags;
b->codeSize = (u32)(GetCodePtr() - normalEntry);
b->originalSize = size;
b->originalSize = code_block.m_num_instructions;;

{
}
FlushIcache();
return start;

Expand Down
3 changes: 3 additions & 0 deletions Source/Core/Core/PowerPC/JitCommon/JitBase.h
Expand Up @@ -83,6 +83,9 @@ class JitBase : public CPUCoreBase
std::unordered_set<u32> fifoWriteAddresses;
};

PPCAnalyst::CodeBlock code_block;
PPCAnalyst::PPCAnalyzer analyzer;

public:
// This should probably be removed from public:
JitOptions jo;
Expand Down
609 changes: 313 additions & 296 deletions Source/Core/Core/PowerPC/PPCAnalyst.cpp

Large diffs are not rendered by default.

89 changes: 83 additions & 6 deletions Source/Core/Core/PowerPC/PPCAnalyst.h
Expand Up @@ -64,19 +64,32 @@ struct BlockRegStats
return std::max(lastRead[reg], lastWrite[reg]) -
std::min(firstRead[reg], firstWrite[reg]);}

inline void SetInputRegister(int reg, short opindex) {
inline void SetInputRegister(int reg, short opindex)
{
if (firstRead[reg] == -1)
firstRead[reg] = (short)(opindex);
lastRead[reg] = (short)(opindex);
numReads[reg]++;
}

inline void SetOutputRegister(int reg, short opindex) {
inline void SetOutputRegister(int reg, short opindex)
{
if (firstWrite[reg] == -1)
firstWrite[reg] = (short)(opindex);
lastWrite[reg] = (short)(opindex);
numWrites[reg]++;
}

inline void Clear()
{
for (int i = 0; i < 32; ++i)
{
firstRead[i] = -1;
firstWrite[i] = -1;
numReads[i] = 0;
numWrites[i] = 0;
}
}
};


Expand All @@ -94,10 +107,74 @@ class CodeBuffer

};

u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa,
BlockRegStats *fpa, bool &broken_block, CodeBuffer *buffer,
int blockSize, u32* merged_addresses,
int capacity_of_merged_addresses, int& size_of_merged_addresses);
// Description of a block of PPC code as passed to PPCAnalyzer::Analyze().
//
// The statistics pointers are not owned by the block: the user points them at
// storage it owns before calling Analyze(). Every member carries a default so
// that a freshly constructed CodeBlock never holds indeterminate values.
struct CodeBlock
{
	// Beginning PPC address.
	u32 m_address = 0;

	// Number of instructions
	// Gives us the size of the block.
	u32 m_num_instructions = 0;

	// Some basic statistics about the block.
	BlockStats *m_stats = nullptr;

	// Register statistics about the block.
	BlockRegStats *m_gpa = nullptr;
	BlockRegStats *m_fpa = nullptr;

	// Are we a broken block?
	bool m_broken = false;
};

// Analyzes PPC code starting at an address and fills in a CodeBlock and a
// CodeBuffer for a JIT to consume. Behaviour is tuned through a bitmask of
// AnalystOption flags; no option is enabled by default.
class PPCAnalyzer
{
private:

	void ReorderInstructions(u32 instructions, CodeOp *code);
	void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index);

	// Bitmask of enabled AnalystOption flags.
	u32 m_options;
public:

	enum AnalystOption
	{
		// Conditional branch continuing
		// If the JIT core supports conditional branches within the blocks
		// Block will end on unconditional branch or other ENDBLOCK flagged instruction.
		// Requires JIT support to be enabled.
		OPTION_CONDITIONAL_CONTINUE = (1 << 0),

		// If there is an unconditional branch that jumps to a leaf function then inline it.
		// Might require JIT intervention to support it correctly.
		// Requires JitBlock support for inlined code
		// XXX: NOT COMPLETE
		OPTION_LEAF_INLINE = (1 << 1),

		// Complex blocks support jumping backwards on to themselves.
		// Happens commonly in loops, pretty complex to support.
		// May require register caches to use register usage metrics.
		// XXX: NOT COMPLETE
		OPTION_COMPLEX_BLOCK = (1 << 2),

		// Similar to complex blocks.
		// Instead of jumping backwards, this jumps forwards within the block.
		// Requires JIT support to work.
		// XXX: NOT COMPLETE
		OPTION_FORWARD_JUMP = (1 << 3),
	};


	// Starts with every option disabled.
	PPCAnalyzer() : m_options(0) {}

	// Option setting/getting
	// SetOption enables a flag, ClearOption disables it, and HasOption reports
	// whether it is currently enabled. HasOption does not modify the analyzer,
	// so it is const and can be called through a const reference.
	void SetOption(AnalystOption option) { m_options |= option; }
	void ClearOption(AnalystOption option) { m_options &= ~(option); }
	bool HasOption(AnalystOption option) const { return (m_options & option) != 0; }

	// Analyzes the code at `address`, writing results into `block` and
	// `buffer`; `blockSize` is the caller-supplied size limit.
	// NOTE(review): callers use the returned address as the PC following the
	// block, and 0xFFFFFFFF appears to signal failure - confirm against the
	// implementation.
	u32 Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 blockSize);
};

void LogFunctionCall(u32 addr);
void FindFunctions(u32 startAddr, u32 endAddr, PPCSymbolDB *func_db);
bool AnalyzeFunction(u32 startAddr, Symbol &func, int max_size = 0);
Expand Down
22 changes: 12 additions & 10 deletions Source/Core/DolphinWX/Debugger/JitWindow.cpp
Expand Up @@ -129,8 +129,7 @@ void CJitWindow::Compare(u32 em_address)

const u8 *code = (const u8 *)jit->GetBlockCache()->GetCompiledCodeFromBlock(block_num);
u64 disasmPtr = (u64)code;
int size = block->codeSize;
const u8 *end = code + size;
const u8 *end = code + block->codeSize;
char *sptr = (char*)xDis;

int num_x86_instructions = 0;
Expand All @@ -154,14 +153,17 @@ void CJitWindow::Compare(u32 em_address)
PPCAnalyst::BlockStats st;
PPCAnalyst::BlockRegStats gpa;
PPCAnalyst::BlockRegStats fpa;
bool broken_block = false;
u32 merged_addresses[32];
const int capacity_of_merged_addresses = sizeof(merged_addresses) / sizeof(merged_addresses[0]);
int size_of_merged_addresses;
if (PPCAnalyst::Flatten(ppc_addr, &size, &st, &gpa, &fpa, broken_block, &code_buffer, size, merged_addresses, capacity_of_merged_addresses, size_of_merged_addresses) != 0xffffffff)
PPCAnalyst::CodeBlock code_block;
PPCAnalyst::PPCAnalyzer analyzer;

code_block.m_stats = &st;
code_block.m_gpa = &gpa;
code_block.m_fpa = &fpa;

if (analyzer.Analyze(ppc_addr, &code_block, &code_buffer, block->codeSize) != 0xFFFFFFFF)
{
sptr = (char*)xDis;
for (int i = 0; i < size; i++)
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{
const PPCAnalyst::CodeOp &op = code_buffer.codebuffer[i];
char temp[256];
Expand All @@ -181,9 +183,9 @@ void CJitWindow::Compare(u32 em_address)
sptr += sprintf(sptr, "%i estimated cycles\n", st.numCycles);

sptr += sprintf(sptr, "Num instr: PPC: %i x86: %i (blowup: %i%%)\n",
size, num_x86_instructions, 100 * (num_x86_instructions / size - 1));
code_block.m_num_instructions, num_x86_instructions, 100 * (num_x86_instructions / code_block.m_num_instructions - 1));
sptr += sprintf(sptr, "Num bytes: PPC: %i x86: %i (blowup: %i%%)\n",
size * 4, block->codeSize, 100 * (block->codeSize / (4 * size) - 1));
code_block.m_num_instructions * 4, block->codeSize, 100 * (block->codeSize / (4 * code_block.m_num_instructions) - 1));

ppc_box->SetValue(StrToWxStr((char*)xDis));
}
Expand Down