From b6f4dc29d0e694c07d6910a5b4e129a0c5eb8ab5 Mon Sep 17 00:00:00 2001
From: Fiora
Date: Sun, 4 Jan 2015 04:20:59 -0800
Subject: [PATCH] JIT: optimize for the common case of unquantized psq_l/st

Optimistically assume used GQRs are 0 in blocks that only use one GQR, and
bail at the start of the block and recompile if that assumption fails.

Many games use almost entirely unquantized stores (e.g. Rebel Strike, Sonic
Colors), so this will likely be a big performance improvement across the
board for games with heavy use of paired singles.
---
Review notes (text in this section is ignored by git-am):
 - psq_lXX, memcheck path: the destination XMM register is now cleared from
   registersInUse at index RX(s) + 16. The previous RX(s) << 16 is not a valid
   BitSet32 index for any register other than 0. NOTE(review): the +16 XMM
   offset should be confirmed against CallerSavedRegistersInUse().
 - DoJit: gqr is initialized before the search loop so it is never read
   uninitialized from the compiler's point of view.
 - Template arguments on BitSet / unordered_set were restored.
 - Indentation was reconstructed; if context does not match byte-for-byte,
   apply with --ignore-whitespace.

 Source/Core/Common/BitSet.h | 9 +
 .../Interpreter/Interpreter_Tables.cpp | 8 +-
 Source/Core/Core/PowerPC/Jit64/Jit.cpp | 22 +++
 .../PowerPC/Jit64/Jit_LoadStorePaired.cpp | 176 +++++++++++++++++-
 .../Core/PowerPC/JitCommon/JitAsmCommon.cpp | 6 +-
 .../Core/PowerPC/JitCommon/JitAsmCommon.h | 4 +
 Source/Core/Core/PowerPC/JitCommon/JitBase.h | 2 +
 .../Core/Core/PowerPC/JitCommon/JitCache.cpp | 4 +
 Source/Core/Core/PowerPC/JitInterface.cpp | 13 +-
 Source/Core/Core/PowerPC/JitInterface.h | 3 +-
 Source/Core/Core/PowerPC/PPCAnalyst.cpp | 18 ++
 Source/Core/Core/PowerPC/PPCAnalyst.h | 3 +
 12 files changed, 250 insertions(+), 18 deletions(-)

diff --git a/Source/Core/Common/BitSet.h b/Source/Core/Common/BitSet.h
index 9be7ccbe9226..56545929c3cd 100644
--- a/Source/Core/Common/BitSet.h
+++ b/Source/Core/Common/BitSet.h
@@ -21,6 +21,12 @@ static inline int CountSetBits(T v)
 	v = (v + (v >> 4)) & (T)~(T)0/255*15;
 	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
 }
+static inline int LeastSignificantSetBit(u8 val)
+{
+	unsigned long index;
+	_BitScanForward(&index, val);
+	return (int)index;
+}
 static inline int LeastSignificantSetBit(u32 val)
 {
 	unsigned long index;
@@ -34,8 +40,10 @@ static inline int LeastSignificantSetBit(u64 val)
 	return (int)index;
 }
 #else
+static inline int CountSetBits(u8 val) { return __builtin_popcount(val); }
 static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
 static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
+static inline int LeastSignificantSetBit(u8 val) { return __builtin_ctz(val); }
 static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
 static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
 #endif
@@ -163,5 +171,6 @@ class BitSet
 
 }
 
+typedef BS::BitSet<u8> BitSet8;
 typedef BS::BitSet<u32> BitSet32;
 typedef BS::BitSet<u64> BitSet64;
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
index b6ff10b52e11..639a899f5734 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@@ -137,10 +137,10 @@ static GekkoOPTemplate table4_2[] =
 
 static GekkoOPTemplate table4_3[] =
 {
-	{6, Interpreter::psq_lx, {"psq_lx", OPTYPE_PS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
-	{7, Interpreter::psq_stx, {"psq_stx", OPTYPE_PS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
-	{38, Interpreter::psq_lux, {"psq_lux", OPTYPE_PS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
-	{39, Interpreter::psq_stux, {"psq_stux", OPTYPE_PS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+	{6, Interpreter::psq_lx, {"psq_lx", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+	{7, Interpreter::psq_stx, {"psq_stx", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+	{38, Interpreter::psq_lux, {"psq_lux", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+	{39, Interpreter::psq_stux, {"psq_stux", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
 };
 
 static GekkoOPTemplate table19[] =
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 368602c3e78e..c3f4cccfbe83 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -15,6 +15,7 @@
 #include "Core/PatchEngine.h"
 #include "Core/HLE/HLE.h"
 #include "Core/HW/ProcessorInterface.h"
+#include "Core/PowerPC/JitInterface.h"
 #include "Core/PowerPC/Profiler.h"
 #include "Core/PowerPC/Jit64/Jit.h"
 #include "Core/PowerPC/Jit64/Jit64_Tables.h"
@@ -605,6 +606,27 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 	js.skipnext = false;
 	js.carryFlagSet = false;
 	js.carryFlagInverted = false;
+	js.assumeNoPairedQuantize = false;
+
+	if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
+	{
+		int gqr = 0;
+		for (int i = 0; i < 8; i++)
+			if (code_block.m_gqr_used[i])
+				gqr = i;
+		CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0));
+		FixupBranch failure = J_CC(CC_NZ, true);
+		SwitchToFarCode();
+		SetJumpTarget(failure);
+		MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
+		ABI_PushRegistersAndAdjustStack({}, 0);
+		ABI_CallFunctionC((void *)&JitInterface::CompileExceptionCheck, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
+		ABI_PopRegistersAndAdjustStack({}, 0);
+		WriteExceptionExit();
+		SwitchToNearCode();
+		js.assumeNoPairedQuantize = true;
+	}
+
 	// Translate instructions
 	for (u32 i = 0; i < code_block.m_num_instructions; i++)
 	{
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
index b6dac78f866b..0196f5deec64 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
@@ -11,6 +11,7 @@
 #include "Core/PowerPC/Jit64/Jit.h"
 #include "Core/PowerPC/Jit64/JitAsm.h"
 #include "Core/PowerPC/Jit64/JitRegCache.h"
+#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
 
 using namespace Gen;
 
@@ -20,7 +21,6 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStorePairedOff);
-	FALLBACK_IF(!inst.RA);
 
 	s32 offset = inst.SIMM_12;
 	bool indexed = inst.OPCD == 4;
@@ -30,12 +30,75 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
 	int s = inst.FS;
 	int i = indexed ? inst.Ix : inst.I;
 	int w = indexed ? inst.Wx : inst.W;
+	FALLBACK_IF(!a);
 
 	gpr.Lock(a, b);
+	if (js.assumeNoPairedQuantize)
+	{
+		int storeOffset = 0;
+		gpr.BindToRegister(a, true, update);
+		X64Reg addr = gpr.RX(a);
+		if (update && js.memcheck)
+		{
+			addr = RSCRATCH2;
+			MOV(32, R(addr), gpr.R(a));
+		}
+		if (indexed)
+		{
+			if (update)
+			{
+				ADD(32, R(addr), gpr.R(b));
+			}
+			else
+			{
+				addr = RSCRATCH2;
+				if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
+				{
+					LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
+				}
+				else
+				{
+					MOV(32, R(addr), gpr.R(b));
+					if (a)
+						ADD(32, R(addr), gpr.R(a));
+				}
+			}
+		}
+		else
+		{
+			if (update)
+				ADD(32, R(addr), Imm32(offset));
+			else
+				storeOffset = offset;
+		}
+
+		fpr.Lock(s);
+		if (w)
+		{
+			CVTSD2SS(XMM0, fpr.R(s));
+			MOVD_xmm(R(RSCRATCH), XMM0);
+		}
+		else
+		{
+			CVTPD2PS(XMM0, fpr.R(s));
+			MOVQ_xmm(R(RSCRATCH), XMM0);
+			ROL(64, R(RSCRATCH), Imm8(32));
+		}
+
+		BitSet32 registersInUse = CallerSavedRegistersInUse();
+		if (update && js.memcheck)
+			registersInUse[addr] = true;
+		SafeWriteRegToReg(RSCRATCH, addr, w ? 32 : 64, storeOffset, registersInUse);
+		MemoryExceptionCheck();
+		if (update && js.memcheck)
+			MOV(32, gpr.R(a), R(addr));
+		gpr.UnlockAll();
+		fpr.UnlockAll();
+		return;
+	}
 	gpr.FlushLockX(RSCRATCH_EXTRA);
 	if (update)
 		gpr.BindToRegister(a, true, true);
-	fpr.BindToRegister(s, true, false);
 	if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && (indexed || offset))
 	{
 		if (indexed)
@@ -92,7 +155,6 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStorePairedOff);
-	FALLBACK_IF(!inst.RA);
 
 	s32 offset = inst.SIMM_12;
 	bool indexed = inst.OPCD == 4;
@@ -102,8 +164,116 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
 	int s = inst.FS;
 	int i = indexed ? inst.Ix : inst.I;
 	int w = indexed ? inst.Wx : inst.W;
+	FALLBACK_IF(!a);
 
 	gpr.Lock(a, b);
+	if (js.assumeNoPairedQuantize)
+	{
+		s32 loadOffset = 0;
+		gpr.BindToRegister(a, true, update);
+		X64Reg addr = gpr.RX(a);
+		if (update && js.memcheck)
+		{
+			addr = RSCRATCH2;
+			MOV(32, R(addr), gpr.R(a));
+		}
+		if (indexed)
+		{
+			if (update)
+			{
+				ADD(32, R(addr), gpr.R(b));
+			}
+			else
+			{
+				addr = RSCRATCH2;
+				if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
+				{
+					LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
+				}
+				else
+				{
+					MOV(32, R(addr), gpr.R(b));
+					if (a)
+						ADD(32, R(addr), gpr.R(a));
+				}
+			}
+		}
+		else
+		{
+			if (update)
+				ADD(32, R(addr), Imm32(offset));
+			else
+				loadOffset = offset;
+		}
+
+		fpr.Lock(s);
+		if (js.memcheck)
+		{
+			fpr.StoreFromRegister(s);
+			js.revertFprLoad = s;
+		}
+		fpr.BindToRegister(s, false);
+
+		// Let's mirror the JitAsmCommon code and assume all non-MMU loads go to RAM.
+		if (!js.memcheck)
+		{
+			if (w)
+			{
+				if (cpu_info.bSSSE3)
+				{
+					MOVD_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
+					PSHUFB(XMM0, M(pbswapShuffle1x4));
+					UNPCKLPS(XMM0, M(m_one));
+				}
+				else
+				{
+					LoadAndSwap(32, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
+					MOVD_xmm(XMM0, R(RSCRATCH));
+					UNPCKLPS(XMM0, M(m_one));
+				}
+			}
+			else
+			{
+				if (cpu_info.bSSSE3)
+				{
+					MOVQ_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
+					PSHUFB(XMM0, M(pbswapShuffle2x4));
+				}
+				else
+				{
+					LoadAndSwap(64, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
+					ROL(64, R(RSCRATCH), Imm8(32));
+					MOVQ_xmm(XMM0, R(RSCRATCH));
+				}
+			}
+			CVTPS2PD(fpr.RX(s), R(XMM0));
+		}
+		else
+		{
+			BitSet32 registersInUse = CallerSavedRegistersInUse();
+			registersInUse[fpr.RX(s) + 16] = false;
+			if (update)
+				registersInUse[addr] = true;
+			SafeLoadToReg(RSCRATCH, R(addr), w ? 32 : 64, loadOffset, registersInUse, false);
+			MemoryExceptionCheck();
+			if (w)
+			{
+				MOVD_xmm(XMM0, R(RSCRATCH));
+				UNPCKLPS(XMM0, M(m_one));
+			}
+			else
+			{
+				ROL(64, R(RSCRATCH), Imm8(32));
+				MOVQ_xmm(XMM0, R(RSCRATCH));
+			}
+			CVTPS2PD(fpr.RX(s), R(XMM0));
+			if (update)
+				MOV(32, gpr.R(a), R(addr));
+		}
+		gpr.UnlockAll();
+		fpr.UnlockAll();
+		return;
+	}
 	gpr.FlushLockX(RSCRATCH_EXTRA);
 	gpr.BindToRegister(a, true, update);
 	fpr.BindToRegister(s, false, true);
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
index 0f9540298381..c7b39955104a 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@@ -191,8 +191,8 @@ void CommonAsmRoutines::GenMfcr()
 
 // Safe + Fast Quantizers, originally from JITIL by magumagu
 
-static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
+const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
 
 static const float GC_ALIGNED16(m_quantizeTableS[]) =
 {
@@ -257,7 +257,7 @@ static const float GC_ALIGNED16(m_255) = 255.0f;
 static const float GC_ALIGNED16(m_127) = 127.0f;
 static const float GC_ALIGNED16(m_m128) = -128.0f;
 
-static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
+const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };
 
 #define QUANTIZE_OVERFLOW_SAFE
 
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
index 34a7232a4540..b41bc26875c7 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@@ -6,6 +6,10 @@
 
 #include "Core/PowerPC/JitCommon/Jit_Util.h"
 
+extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
+extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
+extern const float GC_ALIGNED16(m_one[]);
+
 class CommonAsmRoutinesBase
 {
 public:
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
index cb79f3f51198..96dfdf75107c 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
@@ -84,6 +84,7 @@ class JitBase : public CPUCoreBase
 		int revertGprLoad;
 		int revertFprLoad;
 
+		bool assumeNoPairedQuantize;
 		bool firstFPInstructionFound;
 		bool isLastInstruction;
 		bool memcheck;
@@ -104,6 +105,7 @@ class JitBase : public CPUCoreBase
 		JitBlock *curBlock;
 
 		std::unordered_set<u32> fifoWriteAddresses;
+		std::unordered_set<u32> pairedQuantizeAddresses;
 	};
 
 	PPCAnalyst::CodeBlock code_block;
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
index a3aee33ed4bd..f4c84be65fdd 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
@@ -65,6 +65,7 @@ using namespace Gen;
 		Core::DisplayMessage("Clearing code cache.", 3000);
 #endif
 		jit->js.fifoWriteAddresses.clear();
+		jit->js.pairedQuantizeAddresses.clear();
 		for (int i = 0; i < num_blocks; i++)
 		{
 			DestroyBlock(i, false);
@@ -311,7 +312,10 @@ using namespace Gen;
 		if (!forced)
 		{
 			for (u32 i = address; i < address + length; i += 4)
+			{
 				jit->js.fifoWriteAddresses.erase(i);
+				jit->js.pairedQuantizeAddresses.erase(i);
+			}
 		}
 	}
 }
diff --git a/Source/Core/Core/PowerPC/JitInterface.cpp b/Source/Core/Core/PowerPC/JitInterface.cpp
index 87cb0c6d4340..bed1bd5ed224 100644
--- a/Source/Core/Core/PowerPC/JitInterface.cpp
+++ b/Source/Core/Core/PowerPC/JitInterface.cpp
@@ -240,18 +240,17 @@ namespace JitInterface
 		case ExceptionType::EXCEPTIONS_FIFO_WRITE:
 			exception_addresses = &jit->js.fifoWriteAddresses;
 			break;
+		case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE:
+			exception_addresses = &jit->js.pairedQuantizeAddresses;
+			break;
 		}
 
 		if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end()))
 		{
-			int optype = GetOpInfo(Memory::ReadUnchecked_U32(PC))->type;
-			if (optype == OPTYPE_STORE || optype == OPTYPE_STOREFP || (optype == OPTYPE_STOREPS))
-			{
-				exception_addresses->insert(PC);
+			exception_addresses->insert(PC);
 
-				// Invalidate the JIT block so that it gets recompiled with the external exception check included.
-				jit->GetBlockCache()->InvalidateICache(PC, 4, true);
-			}
+			// Invalidate the JIT block so that it gets recompiled with the external exception check included.
+			jit->GetBlockCache()->InvalidateICache(PC, 4, true);
 		}
 	}
 
diff --git a/Source/Core/Core/PowerPC/JitInterface.h b/Source/Core/Core/PowerPC/JitInterface.h
index 1fe8b85086b1..3c56683af1c8 100644
--- a/Source/Core/Core/PowerPC/JitInterface.h
+++ b/Source/Core/Core/PowerPC/JitInterface.h
@@ -13,7 +13,8 @@ namespace JitInterface
 {
 	enum class ExceptionType
 	{
-		EXCEPTIONS_FIFO_WRITE
+		EXCEPTIONS_FIFO_WRITE,
+		EXCEPTIONS_PAIRED_QUANTIZE
 	};
 
 	void DoState(PointerWrap &p);
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index bc1da5a23088..037a7d1e2670 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -638,6 +638,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 	block->m_broken = false;
 	block->m_memory_exception = false;
 	block->m_num_instructions = 0;
+	block->m_gqr_used = BitSet8(0);
 
 	if (address == 0)
 	{
@@ -860,6 +861,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 
 	// Forward scan, for flags that need the other direction for calculation.
 	BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
+	BitSet8 gqrUsed;
+	bool gqrModified = false;
 	for (u32 i = 0; i < block->m_num_instructions; i++)
 	{
 		code[i].fprIsSingle = fprIsSingle;
@@ -898,7 +901,22 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 			if (!strncmp(code[i].opinfo->opname, "mtfs", 4))
 				fprIsStoreSafe = BitSet32(0);
 		}
+
+		if (code[i].opinfo->type == OPTYPE_STOREPS || code[i].opinfo->type == OPTYPE_LOADPS)
+		{
+			int gqr = code[i].inst.OPCD == 4 ? code[i].inst.Ix : code[i].inst.I;
+			gqrUsed[gqr] = true;
+		}
+
+		if (code[i].inst.OPCD == 31 && code[i].inst.SUBOP10 == 467) // mtspr
+		{
+			int gqr = ((code[i].inst.SPRU << 5) | code[i].inst.SPRL) - SPR_GQR0;
+			if (gqr >= 0 && gqr <= 7)
+				gqrModified = true;
+		}
 	}
+	if (!gqrModified)
+		block->m_gqr_used = gqrUsed;
 	return address;
 }
 
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 59c637e5b2ff..fc228627ceee 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -154,6 +154,9 @@ struct CodeBlock
 
 	// Did we have a memory_exception?
 	bool m_memory_exception;
+
+	// Which GQRs this block uses, if any.
+	BitSet8 m_gqr_used;
 };
 
 class PPCAnalyzer