Skip to content

Commit

Permalink
JIT: optimize for the common case of unquantized psq_l/st
Browse files Browse the repository at this point in the history
Optimistically assume used GQRs are 0 in blocks that only use one GQR, and
bail at the start of the block and recompile if that assumption fails.

Many games use almost entirely unquantized stores (e.g. Rebel Strike, Sonic
Colors), so this will likely be a big performance improvement across the board
for games with heavy use of paired singles.
  • Loading branch information
FioraAeterna committed Jan 5, 2015
1 parent ba64225 commit b6f4dc2
Show file tree
Hide file tree
Showing 12 changed files with 250 additions and 18 deletions.
9 changes: 9 additions & 0 deletions Source/Core/Common/BitSet.h
Expand Up @@ -21,6 +21,12 @@ static inline int CountSetBits(T v)
v = (v + (v >> 4)) & (T)~(T)0/255*15;
return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
}
// Returns the zero-based position of the lowest set bit of an 8-bit value,
// as reported by the _BitScanForward intrinsic.
// NOTE(review): the result is only meaningful for a non-zero val; with
// val == 0 the intrinsic presumably leaves the output unwritten - confirm.
static inline int LeastSignificantSetBit(u8 val)
{
	unsigned long bit_position;
	_BitScanForward(&bit_position, val);
	return static_cast<int>(bit_position);
}
static inline int LeastSignificantSetBit(u32 val)
{
unsigned long index;
Expand All @@ -34,8 +40,10 @@ static inline int LeastSignificantSetBit(u64 val)
return (int)index;
}
#else
// Population count (number of 1 bits) for each supported width, implemented
// with the compiler's popcount builtins.

// 8-bit overload: the value is widened and counted with the 32-bit builtin.
static inline int CountSetBits(u8 val)
{
	return __builtin_popcount(val);
}

// 32-bit overload.
static inline int CountSetBits(u32 val)
{
	return __builtin_popcount(val);
}

// 64-bit overload: uses the long long variant of the builtin.
static inline int CountSetBits(u64 val)
{
	return __builtin_popcountll(val);
}
// Zero-based position of the lowest set bit for each supported width,
// implemented with the compiler's count-trailing-zeros builtins.
// NOTE(review): these builtins give no defined result for val == 0, so
// callers are presumably expected to pass a non-zero value - confirm.

// 8-bit overload: the value is widened and scanned with the 32-bit builtin.
static inline int LeastSignificantSetBit(u8 val)
{
	return __builtin_ctz(val);
}

// 32-bit overload.
static inline int LeastSignificantSetBit(u32 val)
{
	return __builtin_ctz(val);
}

// 64-bit overload: uses the long long variant of the builtin.
static inline int LeastSignificantSetBit(u64 val)
{
	return __builtin_ctzll(val);
}
#endif
Expand Down Expand Up @@ -163,5 +171,6 @@ class BitSet

}

// Convenience aliases for the BitSet template at each supported storage width.
using BitSet8 = BS::BitSet<u8>;
using BitSet32 = BS::BitSet<u32>;
using BitSet64 = BS::BitSet<u64>;
8 changes: 4 additions & 4 deletions Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
Expand Up @@ -137,10 +137,10 @@ static GekkoOPTemplate table4_2[] =

static GekkoOPTemplate table4_3[] =
{
{6, Interpreter::psq_lx, {"psq_lx", OPTYPE_PS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{7, Interpreter::psq_stx, {"psq_stx", OPTYPE_PS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{38, Interpreter::psq_lux, {"psq_lux", OPTYPE_PS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{39, Interpreter::psq_stux, {"psq_stux", OPTYPE_PS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{6, Interpreter::psq_lx, {"psq_lx", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{7, Interpreter::psq_stx, {"psq_stx", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{38, Interpreter::psq_lux, {"psq_lux", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{39, Interpreter::psq_stux, {"psq_stux", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
};

static GekkoOPTemplate table19[] =
Expand Down
22 changes: 22 additions & 0 deletions Source/Core/Core/PowerPC/Jit64/Jit.cpp
Expand Up @@ -15,6 +15,7 @@
#include "Core/PatchEngine.h"
#include "Core/HLE/HLE.h"
#include "Core/HW/ProcessorInterface.h"
#include "Core/PowerPC/JitInterface.h"
#include "Core/PowerPC/Profiler.h"
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/Jit64_Tables.h"
Expand Down Expand Up @@ -605,6 +606,27 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
js.skipnext = false;
js.carryFlagSet = false;
js.carryFlagInverted = false;
js.assumeNoPairedQuantize = false;

if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
{
int gqr;
for (int i = 0; i < 8; i++)
if (code_block.m_gqr_used[i])
gqr = i;
CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0));
FixupBranch failure = J_CC(CC_NZ, true);
SwitchToFarCode();
SetJumpTarget(failure);
MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionC((void *)&JitInterface::CompileExceptionCheck, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
ABI_PopRegistersAndAdjustStack({}, 0);
WriteExceptionExit();
SwitchToNearCode();
js.assumeNoPairedQuantize = true;
}

// Translate instructions
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{
Expand Down
176 changes: 173 additions & 3 deletions Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
Expand Up @@ -11,6 +11,7 @@
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/JitAsm.h"
#include "Core/PowerPC/Jit64/JitRegCache.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"

using namespace Gen;

Expand All @@ -20,7 +21,6 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
FALLBACK_IF(!inst.RA);

s32 offset = inst.SIMM_12;
bool indexed = inst.OPCD == 4;
Expand All @@ -30,12 +30,75 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
int s = inst.FS;
int i = indexed ? inst.Ix : inst.I;
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);

gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
{
int storeOffset = 0;
gpr.BindToRegister(a, true, update);
X64Reg addr = gpr.RX(a);
if (update && js.memcheck)
{
addr = RSCRATCH2;
MOV(32, R(addr), gpr.R(a));
}
if (indexed)
{
if (update)
{
ADD(32, R(addr), gpr.R(b));
}
else
{
addr = RSCRATCH2;
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
}
else
{
MOV(32, R(addr), gpr.R(b));
if (a)
ADD(32, R(addr), gpr.R(a));
}
}
}
else
{
if (update)
ADD(32, R(addr), Imm32(offset));
else
storeOffset = offset;
}

fpr.Lock(s);
if (w)
{
CVTSD2SS(XMM0, fpr.R(s));
MOVD_xmm(R(RSCRATCH), XMM0);
}
else
{
CVTPD2PS(XMM0, fpr.R(s));
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
}

BitSet32 registersInUse = CallerSavedRegistersInUse();
if (update && js.memcheck)
registersInUse[addr] = true;
SafeWriteRegToReg(RSCRATCH, addr, w ? 32 : 64, storeOffset, registersInUse);
MemoryExceptionCheck();
if (update && js.memcheck)
MOV(32, gpr.R(a), R(addr));
gpr.UnlockAll();
fpr.UnlockAll();
return;
}
gpr.FlushLockX(RSCRATCH_EXTRA);
if (update)
gpr.BindToRegister(a, true, true);
fpr.BindToRegister(s, true, false);
if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && (indexed || offset))
{
if (indexed)
Expand Down Expand Up @@ -92,7 +155,6 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
FALLBACK_IF(!inst.RA);

s32 offset = inst.SIMM_12;
bool indexed = inst.OPCD == 4;
Expand All @@ -102,8 +164,116 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
int s = inst.FS;
int i = indexed ? inst.Ix : inst.I;
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);

gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
{
s32 loadOffset = 0;
gpr.BindToRegister(a, true, update);
X64Reg addr = gpr.RX(a);
if (update && js.memcheck)
{
addr = RSCRATCH2;
MOV(32, R(addr), gpr.R(a));
}
if (indexed)
{
if (update)
{
ADD(32, R(addr), gpr.R(b));
}
else
{
addr = RSCRATCH2;
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
}
else
{
MOV(32, R(addr), gpr.R(b));
if (a)
ADD(32, R(addr), gpr.R(a));
}
}
}
else
{
if (update)
ADD(32, R(addr), Imm32(offset));
else
loadOffset = offset;
}

fpr.Lock(s);
if (js.memcheck)
{
fpr.StoreFromRegister(s);
js.revertFprLoad = s;
}
fpr.BindToRegister(s, false);

// Let's mirror the JitAsmCommon code and assume all non-MMU loads go to RAM.
if (!js.memcheck)
{
if (w)
{
if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
MOVD_xmm(XMM0, R(RSCRATCH));
UNPCKLPS(XMM0, M(m_one));
}
}
else
{
if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
ROL(64, R(RSCRATCH), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH));
}
}
CVTPS2PD(fpr.RX(s), R(XMM0));
}
else
{
BitSet32 registersInUse = CallerSavedRegistersInUse();
registersInUse[fpr.RX(s) << 16] = false;
if (update)
registersInUse[addr] = true;
SafeLoadToReg(RSCRATCH, R(addr), w ? 32 : 64, loadOffset, registersInUse, false);
MemoryExceptionCheck();
if (w)
{
MOVD_xmm(XMM0, R(RSCRATCH));
UNPCKLPS(XMM0, M(m_one));
}
else
{
ROL(64, R(RSCRATCH), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH));
}
CVTPS2PD(fpr.RX(s), R(XMM0));
if (update)
MOV(32, gpr.R(a), R(addr));
}
gpr.UnlockAll();
fpr.UnlockAll();
return;
}
gpr.FlushLockX(RSCRATCH_EXTRA);
gpr.BindToRegister(a, true, update);
fpr.BindToRegister(s, false, true);
Expand Down
6 changes: 3 additions & 3 deletions Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
Expand Up @@ -191,8 +191,8 @@ void CommonAsmRoutines::GenMfcr()

// Safe + Fast Quantizers, originally from JITIL by magumagu

static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };

static const float GC_ALIGNED16(m_quantizeTableS[]) =
{
Expand Down Expand Up @@ -257,7 +257,7 @@ static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;

static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };

#define QUANTIZE_OVERFLOW_SAFE

Expand Down
4 changes: 4 additions & 0 deletions Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
Expand Up @@ -6,6 +6,10 @@

#include "Core/PowerPC/JitCommon/Jit_Util.h"

// Constants defined in JitAsmCommon.cpp and declared here so that other JIT
// translation units can reference them as memory operands.

// PSHUFB control mask {3,2,1,0, 4..15}: reverses the byte order of the first
// 4 bytes and leaves the remaining 12 bytes in place.
extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
// PSHUFB control mask {3,2,1,0, 7,6,5,4, 8..15}: reverses the byte order of
// each of the first two 4-byte groups and leaves the upper 8 bytes in place.
extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
// Float vector {1.0f, 0.0f, 0.0f, 0.0f}; the paired load code passes it to
// UNPCKLPS after loading a single float.
extern const float GC_ALIGNED16(m_one[]);

class CommonAsmRoutinesBase
{
public:
Expand Down
2 changes: 2 additions & 0 deletions Source/Core/Core/PowerPC/JitCommon/JitBase.h
Expand Up @@ -84,6 +84,7 @@ class JitBase : public CPUCoreBase
int revertGprLoad;
int revertFprLoad;

bool assumeNoPairedQuantize;
bool firstFPInstructionFound;
bool isLastInstruction;
bool memcheck;
Expand All @@ -104,6 +105,7 @@ class JitBase : public CPUCoreBase
JitBlock *curBlock;

std::unordered_set<u32> fifoWriteAddresses;
std::unordered_set<u32> pairedQuantizeAddresses;
};

PPCAnalyst::CodeBlock code_block;
Expand Down
4 changes: 4 additions & 0 deletions Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
Expand Up @@ -65,6 +65,7 @@ using namespace Gen;
Core::DisplayMessage("Clearing code cache.", 3000);
#endif
jit->js.fifoWriteAddresses.clear();
jit->js.pairedQuantizeAddresses.clear();
for (int i = 0; i < num_blocks; i++)
{
DestroyBlock(i, false);
Expand Down Expand Up @@ -311,7 +312,10 @@ using namespace Gen;
if (!forced)
{
for (u32 i = address; i < address + length; i += 4)
{
jit->js.fifoWriteAddresses.erase(i);
jit->js.pairedQuantizeAddresses.erase(i);
}
}
}
}
Expand Down
13 changes: 6 additions & 7 deletions Source/Core/Core/PowerPC/JitInterface.cpp
Expand Up @@ -240,18 +240,17 @@ namespace JitInterface
case ExceptionType::EXCEPTIONS_FIFO_WRITE:
exception_addresses = &jit->js.fifoWriteAddresses;
break;
case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE:
exception_addresses = &jit->js.pairedQuantizeAddresses;
break;
}

if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end()))
{
int optype = GetOpInfo(Memory::ReadUnchecked_U32(PC))->type;
if (optype == OPTYPE_STORE || optype == OPTYPE_STOREFP || (optype == OPTYPE_STOREPS))
{
exception_addresses->insert(PC);
exception_addresses->insert(PC);

// Invalidate the JIT block so that it gets recompiled with the external exception check included.
jit->GetBlockCache()->InvalidateICache(PC, 4, true);
}
// Invalidate the JIT block so that it gets recompiled with the external exception check included.
jit->GetBlockCache()->InvalidateICache(PC, 4, true);
}
}

Expand Down

0 comments on commit b6f4dc2

Please sign in to comment.