@@ -36,15 +36,6 @@ constexpr size_t SAFE_STACK_SIZE = 512 * 1024;
constexpr size_t GUARD_SIZE = 0x10000; // two guards - bottom (permanent) and middle (see above)
constexpr size_t GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE;

// Returns true when EL0 (userspace) is allowed to read the ARM PMU cycle
// counter. PMUSERENR_EL0.CR must be set by the kernel for PMCCNTR_EL0 reads
// to be legal from userspace; without it the MRS would trap.
static bool HasCycleCounters()
{
// Bit needs to be set to support cycle counters
const u32 PMUSERENR_CR = 0x4;
u32 reg;
// Read the PMU user-enable register directly (AArch64 inline asm).
asm("mrs %[val], PMUSERENR_EL0" : [val] "=r"(reg));
return !!(reg & PMUSERENR_CR);
}

void JitArm64::Init()
{
InitializeInstructionTables();
@@ -72,8 +63,6 @@ void JitArm64::Init()

AllocStack();
GenerateAsm();

m_supports_cycle_counter = HasCycleCounters();
}

bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
@@ -172,6 +161,7 @@ void JitArm64::FallBackToInterpreter(UGeckoInstruction inst)
ARM64Reg WA = gpr.GetReg();
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(npc));
WriteExceptionExit(WA);
gpr.Unlock(WA);
}
else
{
@@ -185,6 +175,7 @@ void JitArm64::FallBackToInterpreter(UGeckoInstruction inst)
FixupBranch c = B(CC_EQ);
WriteExceptionExit(WA);
SetJumpTarget(c);
gpr.Unlock(WA);
}
}

@@ -222,6 +213,7 @@ void JitArm64::HLEFunction(UGeckoInstruction inst)
ARM64Reg WA = gpr.GetReg();
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(npc));
WriteExit(WA);
gpr.Unlock(WA);
}

void JitArm64::DoNothing(UGeckoInstruction inst)
@@ -239,21 +231,16 @@ void JitArm64::Cleanup()
{
if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{
gpr.Lock(W0);
MOVP2R(X0, &GPFifo::FastCheckGatherPipe);
BLR(X0);
gpr.Unlock(W0);
}
}

// Emits code that subtracts js.downcountAmount from ppcState.downcount.
//
// NOTE(review): this span previously contained TWO complete bodies
// back-to-back (an older gpr.GetReg()-allocated WA/WB variant followed by
// this fixed-register variant), which would have decremented the downcount
// twice per call. Only one body is kept. The fixed W0/W1 variant is retained
// because DoDownCount() is emitted after Cleanup(), outside normal register
// allocation, and its call site documents the clobbers ("overwrites X0 + X1").
void JitArm64::DoDownCount()
{
  LDR(INDEX_UNSIGNED, W0, PPC_REG, PPCSTATE_OFF(downcount));
  // W1 is the scratch register SUBSI2R may need to materialize the immediate.
  SUBSI2R(W0, W0, js.downcountAmount, W1);
  STR(INDEX_UNSIGNED, W0, PPC_REG, PPCSTATE_OFF(downcount));
}

void JitArm64::ResetStack()
@@ -303,9 +290,7 @@ void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return
{
Cleanup();
DoDownCount();

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
EndTimeProfile(js.curBlock);

LK &= m_enable_blr_optimization;

@@ -342,18 +327,15 @@ void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return

void JitArm64::WriteExit(Arm64Gen::ARM64Reg dest, bool LK, u32 exit_address_after_return)
{
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);

Cleanup();
DoDownCount();
EndTimeProfile(js.curBlock);

LK &= m_enable_blr_optimization;

if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);
gpr.Unlock(dest);

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);

if (!LK)
{
B(dispatcher);
@@ -418,35 +400,28 @@ void JitArm64::WriteBLRExit(Arm64Gen::ARM64Reg dest)
return;
}

Cleanup();

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);

ARM64Reg code = gpr.GetReg();
ARM64Reg pc = gpr.GetReg();
Cleanup();
EndTimeProfile(js.curBlock);

// Check if {ARM_PC, PPC_PC} matches the current state.
LDP(INDEX_POST, EncodeRegTo64(code), EncodeRegTo64(pc), SP, 16);
CMP(pc, dest);
LDP(INDEX_POST, X2, X1, SP, 16);
CMP(W1, DISPATCHER_PC);
FixupBranch no_match = B(CC_NEQ);

DoDownCount();
DoDownCount(); // overwrites X0 + X1

RET(EncodeRegTo64(code));
RET(X2);

SetJumpTarget(no_match);

DoDownCount();

if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);

ResetStack();

B(dispatcher);

gpr.Unlock(dest, pc, code);
}

void JitArm64::WriteExceptionExit(u32 destination, bool only_external)
@@ -469,39 +444,34 @@ void JitArm64::WriteExceptionExit(u32 destination, bool only_external)

SetJumpTarget(no_exceptions);

if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
EndTimeProfile(js.curBlock);

B(dispatcher);
}

// Emits the "exit with pending exception check" epilogue for a register exit.
//
// NOTE(review): this hunk interleaves two generations of the implementation
// (the diff's +/- markers were lost): an older variant that allocates WA via
// gpr.GetReg() and works on 'dest', and a newer variant that uses W30 and
// DISPATCHER_PC. As written, 'no_exceptions' is even declared twice
// (L174/L177), which cannot compile. One of the two line sets must be
// dropped before this builds — TODO confirm against the upstream patch.
void JitArm64::WriteExceptionExit(ARM64Reg dest, bool only_external)
{
// (newer variant) move dest into DISPATCHER_PC up front.
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);

Cleanup();
DoDownCount();

// (older variant) scratch-register exception-flag check.
ARM64Reg WA = gpr.GetReg();
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(Exceptions));
FixupBranch no_exceptions = CBZ(WA);
gpr.Unlock(WA);
// (newer variant) same check using W30, safe after Cleanup/DoDownCount.
LDR(INDEX_UNSIGNED, W30, PPC_REG, PPCSTATE_OFF(Exceptions));
FixupBranch no_exceptions = CBZ(W30);

// (older variant) store the exit address via 'dest'.
STR(INDEX_UNSIGNED, dest, PPC_REG, PPCSTATE_OFF(pc));
STR(INDEX_UNSIGNED, dest, PPC_REG, PPCSTATE_OFF(npc));
// (newer variant) store it via DISPATCHER_PC.
STR(INDEX_UNSIGNED, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc));
STR(INDEX_UNSIGNED, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(npc));
if (only_external)
MOVP2R(EncodeRegTo64(dest), &PowerPC::CheckExternalExceptions);
MOVP2R(EncodeRegTo64(DISPATCHER_PC), &PowerPC::CheckExternalExceptions);
else
MOVP2R(EncodeRegTo64(dest), &PowerPC::CheckExceptions);
BLR(EncodeRegTo64(dest));
LDR(INDEX_UNSIGNED, dest, PPC_REG, PPCSTATE_OFF(npc));
MOVP2R(EncodeRegTo64(DISPATCHER_PC), &PowerPC::CheckExceptions);
BLR(EncodeRegTo64(DISPATCHER_PC));
LDR(INDEX_UNSIGNED, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(npc));

SetJumpTarget(no_exceptions);

// (older variant) late DISPATCHER_PC move + unlock of the caller's register.
if (dest != DISPATCHER_PC)
MOV(DISPATCHER_PC, dest);
gpr.Unlock(dest);

// (older variant) profile guard at the call site...
if (Profiler::g_ProfileBlocks)
EndTimeProfile(js.curBlock);
// ...(newer variant) unconditional call; the guard moved into EndTimeProfile.
EndTimeProfile(js.curBlock);

B(dispatcher);
}
@@ -514,66 +484,39 @@ void JitArm64::DumpCode(const u8* start, const u8* end)
WARN_LOG(DYNA_REC, "Code dump from %p to %p:\n%s", start, end, output.c_str());
}

// Emits code that enables and resets the ARM PMU cycle counter via a
// read-modify-write of PMCR_EL0.
//
// NOTE(review): the span had the system-register accesses scrambled — it
// wrote PMCR_EL0 (_MSR) before the control bits were computed and read it
// (MRS) last. A read-modify-write must read first, OR in the bits, then
// write back; restored below. Bit meanings per the ARM ARM PMCR_EL0 spec.
void JitArm64::EmitResetCycleCounters()
{
  const u32 PMCR_EL0_E = 1;     // E: enable all counters
  const u32 PMCR_EL0_P = 2;     // P: reset event counters
  const u32 PMCR_EL0_C = 4;     // C: reset the cycle counter
  const u32 PMCR_EL0_LC = 0x40; // LC: 64-bit cycle counter overflow behavior
  MRS(X0, FIELD_PMCR_EL0);
  MOVI2R(X1, PMCR_EL0_E | PMCR_EL0_P | PMCR_EL0_C | PMCR_EL0_LC);
  ORR(X0, X0, X1);
  _MSR(FIELD_PMCR_EL0, X0);
}

// Emits a read of the PMU cycle counter (PMCCNTR_EL0) into 'reg'.
//
// NOTE(review): the span emitted _MSR(FIELD_PMCCNTR_EL0, reg), i.e. a
// system-register WRITE of 'reg' into the counter. "Get" must be the read
// direction (cf. the `mrs` used by HasCycleCounters), so emit MRS instead.
void JitArm64::EmitGetCycles(Arm64Gen::ARM64Reg reg)
{
  MRS(reg, FIELD_PMCCNTR_EL0);
}

// Emits the block-entry profiling stub: bump the run counter and record the
// start timestamp.
//
// NOTE(review): this hunk interleaves two generations of the code (diff
// markers lost): the older ticStart/ticStop + m_supports_cycle_counter /
// QueryPerformanceCounter scheme AND the newer JitBlock::ProfileData +
// CNTVCT scheme. Only one set should survive — TODO confirm against the
// upstream patch before building.
void JitArm64::BeginTimeProfile(JitBlock* b)
{
// (older variant) JIT-time reset of the legacy per-block counters.
b->ticCounter = 0;
b->ticStart = 0;
b->ticStop = 0;
// (newer variant) X0 = &profile_data; load runCount and increment it.
MOVP2R(X0, &b->profile_data);
LDR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount));
ADD(X1, X1, 1);

// (older variant) PMU cycle counter if available, else a timer callback.
if (m_supports_cycle_counter)
{
EmitResetCycleCounters();
EmitGetCycles(X1);
MOVP2R(X0, &b->ticStart);
STR(INDEX_UNSIGNED, X1, X0, 0);
}
else
{
MOVP2R(X1, &QueryPerformanceCounter);
MOVP2R(X0, &b->ticStart);
BLR(X1);
}
// Fetch the current counter register
CNTVCT(X2);

// stores runCount and ticStart
// (relies on runCount and ticStart being adjacent in ProfileData)
STP(INDEX_SIGNED, X1, X2, X0, offsetof(JitBlock::ProfileData, runCount));
}

// Emits the block-exit profiling stub: accumulate elapsed ticks into
// ticCounter and charge js.downcountAmount to downcountCounter.
//
// NOTE(review): like BeginTimeProfile, this hunk interleaves the older
// ticStart/ticStop scheme with the newer ProfileData/CNTVCT scheme (diff
// markers lost); the `if (!Profiler::g_ProfileBlocks) return;` early-out
// presumably belongs at function entry in the newer variant — TODO confirm.
void JitArm64::EndTimeProfile(JitBlock* b)
{
// (older variant) read stop timestamp from the PMU or a timer callback.
if (m_supports_cycle_counter)
{
EmitGetCycles(X2);
MOVP2R(X0, &b->ticStart);
}
else
{
MOVP2R(X1, &QueryPerformanceCounter);
MOVP2R(X0, &b->ticStop);
BLR(X1);
// (newer variant) JIT-time guard: emit nothing unless profiling is on.
if (!Profiler::g_ProfileBlocks)
return;

MOVP2R(X0, &b->ticStart);
LDR(INDEX_UNSIGNED, X2, X0, 8); // Stop
}
// Fetch the current counter register
CNTVCT(X1);

// (older variant) ticCounter += ticStop - ticStart via raw offsets 0/8/16.
LDR(INDEX_UNSIGNED, X1, X0, 0); // Start
LDR(INDEX_UNSIGNED, X3, X0, 16); // Counter
SUB(X2, X2, X1);
ADD(X3, X3, X2);
STR(INDEX_UNSIGNED, X3, X0, 16);
// (newer variant) X1 = now - ticStart, then update the counters in
// profile_data with paired loads/stores.
MOVP2R(X0, &b->profile_data);

LDR(INDEX_UNSIGNED, X2, X0, offsetof(JitBlock::ProfileData, ticStart));
SUB(X1, X1, X2);

// loads ticCounter and downcountCounter
LDP(INDEX_SIGNED, X2, X3, X0, offsetof(JitBlock::ProfileData, ticCounter));
ADD(X2, X2, X1);
// X1 is free again here and serves as SUBSI2R/ADDI2R scratch.
ADDI2R(X3, X3, js.downcountAmount, X1);

// stores ticCounter and downcountCounter
STP(INDEX_SIGNED, X2, X3, X0, offsetof(JitBlock::ProfileData, ticCounter));
}

void JitArm64::Run()
@@ -657,7 +600,6 @@ void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock*

const u8* start = GetCodePtr();
b->checkedEntry = start;
b->runCount = 0;

// Downcount flag check, Only valid for linked blocks
{
@@ -673,15 +615,6 @@ void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock*
// Conditionally add profiling code.
if (Profiler::g_ProfileBlocks)
{
ARM64Reg WA = gpr.GetReg();
ARM64Reg WB = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);
ARM64Reg XB = EncodeRegTo64(WB);
MOVP2R(XA, &b->runCount);
LDR(INDEX_UNSIGNED, XB, XA, 0);
ADD(XB, XB, 1);
STR(INDEX_UNSIGNED, XB, XA, 0);
gpr.Unlock(WA, WB);
// get start tic
BeginTimeProfile(b);
}
@@ -174,9 +174,6 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
static void InitializeInstructionTables();
void CompileInstruction(PPCAnalyst::CodeOp& op);

void EmitResetCycleCounters();
void EmitGetCycles(Arm64Gen::ARM64Reg reg);

// Simple functions to switch between near and far code emitting
void SwitchToFarCode()
{
@@ -253,9 +250,6 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
Arm64Gen::ARM64CodeBlock farcode;
u8* nearcode; // Backed up when we switch to far code.

// Do we support cycle counter profiling?
bool m_supports_cycle_counter;

bool m_enable_blr_optimization;
bool m_cleanup_after_stackfault = false;
u8* m_stack_base = nullptr;
@@ -67,8 +67,8 @@ void JitArm64::rfi(UGeckoInstruction inst)
LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(spr[SPR_SRR0]));
gpr.Unlock(WB, WC);

// WA is unlocked in this function
WriteExceptionExit(WA);
gpr.Unlock(WA);
}

void JitArm64::bx(UGeckoInstruction inst)
@@ -220,6 +220,8 @@ void JitArm64::bcctrx(UGeckoInstruction inst)
AND(WA, WA, 30, 29); // Wipe the bottom 2 bits.

WriteExit(WA, inst.LK_3, js.compilerPC + 4);

gpr.Unlock(WA);
}

void JitArm64::bclrx(UGeckoInstruction inst)
@@ -275,6 +277,8 @@ void JitArm64::bclrx(UGeckoInstruction inst)

WriteBLRExit(WA);

gpr.Unlock(WA);

if (conditional)
SwitchToNearCode();

@@ -49,7 +49,6 @@ struct JitBlock
// The number of PPC instructions represented by this block. Mostly
// useful for logging.
u32 originalSize;
int runCount; // for profiling.

// Information about exits to a known address from this block.
// This is used to implement block linking.
@@ -65,11 +64,15 @@ struct JitBlock
// This set stores all physical addresses of all occupied instructions.
std::set<u32> physical_addresses;

// we don't really need to save start and stop
// TODO (mb2): ticStart and ticStop -> "local var" mean "in block" ... low priority ;)
u64 ticStart; // for profiling - time.
u64 ticStop; // for profiling - time.
u64 ticCounter; // for profiling - time.
// Block profiling data, structure is inlined in Jit.cpp
// The JIT emits loads/stores against these fields via offsetof (and paired
// LDP/STP on adjacent fields), so field order and u64 width are part of the
// emitted-code contract — do not reorder.
struct ProfileData
{
u64 ticCounter;       // accumulated ticks spent executing this block
u64 downcountCounter; // accumulated downcount charged by this block
u64 runCount;         // number of times the block has been entered
u64 ticStart;         // timestamp recorded at block entry
u64 ticStop;          // presumably the exit timestamp — TODO confirm it is still written
} profile_data = {};

// This tracks the position if this block within the fast block cache.
// We allow each block to have only one map entry.
@@ -119,12 +119,12 @@ void GetProfileResults(ProfileStats* prof_stats)

QueryPerformanceFrequency((LARGE_INTEGER*)&prof_stats->countsPerSec);
g_jit->GetBlockCache()->RunOnBlocks([&prof_stats](const JitBlock& block) {
// Rough heuristic. Mem instructions should cost more.
u64 cost = block.originalSize * (block.runCount / 4);
u64 timecost = block.ticCounter;
const auto& data = block.profile_data;
u64 cost = data.downcountCounter;
u64 timecost = data.ticCounter;
// Todo: tweak.
if (block.runCount >= 1)
prof_stats->block_stats.emplace_back(block.effectiveAddress, cost, timecost, block.runCount,
if (data.runCount >= 1)
prof_stats->block_stats.emplace_back(block.effectiveAddress, cost, timecost, data.runCount,
block.codeSize);
prof_stats->cost_sum += cost;
prof_stats->timecost_sum += timecost;
@@ -5,11 +5,12 @@
#include "Core/PowerPC/Profiler.h"

#include <string>
#include "Common/PerformanceCounter.h"
#include "Core/PowerPC/JitInterface.h"

namespace Profiler
{
bool g_ProfileBlocks;
bool g_ProfileBlocks = false;

void WriteProfileResults(const std::string& filename)
{
@@ -10,37 +10,6 @@

#include "Common/CommonTypes.h"

#include "Common/PerformanceCounter.h"

// x86-64-only profiler helper macros used by the x86 JIT; every other
// architecture gets empty no-op stubs from the #else branch below.
#if defined(_M_X86_64)

// Emits a call to QueryPerformanceCounter(pt) with pt as the argument.
#define PROFILER_QUERY_PERFORMANCE_COUNTER(pt) \
MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(pt))); \
ABI_CallFunction(QueryPerformanceCounter)

// block->ticCounter += block->ticStop - block->ticStart
#define PROFILER_UPDATE_TIME(block) \
MOV(64, R(RSCRATCH2), Imm64((u64)block)); \
MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStop))); \
SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStart))); \
ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter))); \
MOV(64, MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter)), R(RSCRATCH));

// Saves the caller-saved registers around a profiler call; declares
// 'registersInUse' which PROFILER_VPOP consumes, so the pair must be used
// in the same scope.
#define PROFILER_VPUSH \
BitSet32 registersInUse = CallerSavedRegistersInUse(); \
ABI_PushRegistersAndAdjustStack(registersInUse, 0);

#define PROFILER_VPOP ABI_PopRegistersAndAdjustStack(registersInUse, 0);

#else

// No-op fallbacks for non-x86-64 builds.
#define PROFILER_QUERY_PERFORMANCE_COUNTER(pt)
#define PROFILER_UPDATE_TIME(b)
#define PROFILER_VPUSH
#define PROFILER_VPOP

#endif

struct BlockStat
{
BlockStat(u32 _addr, u64 c, u64 ticks, u64 run, u32 size)
@@ -87,6 +87,7 @@ void CJitWindow::Compare(u32 em_address)
PPCAnalyst::CodeBlock code_block;
PPCAnalyst::PPCAnalyzer analyzer;
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);

code_block.m_stats = &st;
code_block.m_gpa = &gpa;