Permalink
Browse files

Optimize PPC CR emulation by using magic 64 bit values

PowerPC has a 32 bit CR register, which is used to store flags for results of
computations. Most instructions have an optional bit that tells the CPU whether
the flags should be updated. This 32 bit register actually contains 8 sets of 4
flags: Summary Overflow (SO), Equals (EQ), Greater Than (GT), Less Than (LT).
These 8 sets are usually called CR0-CR7 and accessed independently. In the most
common operations, the flags are computed from the result of the operation in
the following fashion:
  * EQ is set iff result == 0
  * LT is set iff result < 0
  * GT is set iff result > 0
  * (Dolphin does not emulate SO)

While X86 architectures have a similar concept of flags, it is very difficult
to access the FLAGS register directly to translate its value to an equivalent
PowerPC value. With the current Dolphin implementation, updating a PPC CR
register requires CPU branching, which has a few performance issues: it uses
space in the BTB, and in the worst case (!GT, !LT, EQ) requires 2 branches not
taken.

After some brainstorming on IRC about how this could be improved, calc84maniac
figured out a neat trick that makes common CR operations way more efficient to
JIT on 64 bit X86 architectures. It relies on emulating each CRn bitfield with
a 64 bit register internally, whose value is the result of the operation from
which flags are updated, sign extended to 64 bits. Then, checking if a CR bit
is set can be done in the following way:
  * EQ is set iff LOWER_32_BITS(cr_64b_val) == 0
  * GT is set iff (s64)cr_64b_val > 0
  * LT is set iff bit 62 of cr_64b_val is set

To take a few examples, if the result of an operation is:
  * -1 (0xFFFFFFFFFFFFFFFF) -> lower 32 bits not 0       => !EQ
                            -> (s64)val (-1) is not > 0  => !GT
                            -> bit 62 is set             =>  LT
            !EQ, !GT, LT

  *  0 (0x0000000000000000) -> lower 32 bits are 0       =>  EQ
                            -> (s64)val (0) is not > 0   => !GT
                            -> bit 62 is not set         => !LT
            EQ, !GT, !LT

  *  1 (0x0000000000000001) -> lower 32 bits not 0       => !EQ
                            -> (s64)val (1) is > 0       =>  GT
                            -> bit 62 is not set         => !LT
            !EQ, GT, !LT

Sometimes we need to convert PPC CR values to these 64 bit values. The
following convention is used in this case:
  * Bit 0 (LSB) is set iff !EQ
  * Bit 62 is set iff LT
  * Bit 63 is set iff !GT
  * Bit 32 is always set to disambiguate between EQ and GT

Some more examples:
  * !EQ, GT, LT -> 0x4000000100000001 (!B63, B62, B32, B0)
                -> lower 32 bits not 0          => !EQ
                -> (s64)val is > 0              =>  GT
                -> bit 62 is set                =>  LT
  * EQ, GT, !LT -> 0x0000000100000000
                -> lower 32 bits are 0          =>  EQ
                -> (s64)val is > 0 (note: B32)  =>  GT
                -> bit 62 is not set            => !LT
  • Loading branch information...
1 parent 9b9817f commit 0ff1481494562025696f99e4b136bdd9e0b18423 @delroth delroth committed May 30, 2014
@@ -84,7 +84,7 @@ static void Trace(UGeckoInstruction& instCode)
char ppcInst[256];
DisassembleGekko(instCode.hex, PC, ppcInst, 256);
- DEBUG_LOG(POWERPC, "INTER PC: %08x SRR0: %08x SRR1: %08x CRfast: %02x%02x%02x%02x%02x%02x%02x%02x FPSCR: %08x MSR: %08x LR: %08x %s %s %08x %s", PC, SRR0, SRR1, PowerPC::ppcState.cr_fast[0], PowerPC::ppcState.cr_fast[1], PowerPC::ppcState.cr_fast[2], PowerPC::ppcState.cr_fast[3], PowerPC::ppcState.cr_fast[4], PowerPC::ppcState.cr_fast[5], PowerPC::ppcState.cr_fast[6], PowerPC::ppcState.cr_fast[7], PowerPC::ppcState.fpscr, PowerPC::ppcState.msr, PowerPC::ppcState.spr[8], regs.c_str(), fregs.c_str(), instCode.hex, ppcInst);
+ DEBUG_LOG(POWERPC, "INTER PC: %08x SRR0: %08x SRR1: %08x CRval: %016lx FPSCR: %08x MSR: %08x LR: %08x %s %08x %s", PC, SRR0, SRR1, PowerPC::ppcState.cr_val[0], PowerPC::ppcState.fpscr, PowerPC::ppcState.msr, PowerPC::ppcState.spr[8], regs.c_str(), instCode.hex, ppcInst);
}
int Interpreter::SingleStepInner(void)
@@ -4,32 +4,22 @@
#include "Core/PowerPC/Interpreter/Interpreter.h"
-void Interpreter::Helper_UpdateCR0(u32 _uValue)
-{
- u32 new_cr0;
- int sValue = (int)_uValue;
- if (sValue > 0)
- new_cr0 = 0x4;
- else if (sValue < 0)
- new_cr0 = 0x8;
- else
- new_cr0 = 0x2;
- new_cr0 |= GetXER_SO();
- SetCRField(0, new_cr0);
+void Interpreter::Helper_UpdateCR0(u32 value)
+{
+ s64 sign_extended = (s64)(s32)value;
+ u64 cr_val = (u64)sign_extended;
+ cr_val = (cr_val & ~(1ull << 61)) | ((u64)GetXER_SO() << 61);
+
+ PowerPC::ppcState.cr_val[0] = cr_val;
}
-void Interpreter::Helper_UpdateCRx(int _x, u32 _uValue)
+void Interpreter::Helper_UpdateCRx(int idx, u32 value)
{
- u32 new_crX;
- int sValue = (int)_uValue;
- if (sValue > 0)
- new_crX = 0x4;
- else if (sValue < 0)
- new_crX = 0x8;
- else
- new_crX = 0x2;
- new_crX |= GetXER_SO();
- SetCRField(_x, new_crX);
+ s64 sign_extended = (s64)(s32)value;
+ u64 cr_val = (u64)sign_extended;
+ cr_val = (cr_val & ~(1ull << 61)) | ((u64)GetXER_SO() << 61);
+
+ PowerPC::ppcState.cr_val[idx] = cr_val;
}
u32 Interpreter::Helper_Carry(u32 _uValue1, u32 _uValue2)
@@ -377,10 +377,8 @@ void Jit64::Trace()
}
#endif
- DEBUG_LOG(DYNA_REC, "JIT64 PC: %08x SRR0: %08x SRR1: %08x CRfast: %02x%02x%02x%02x%02x%02x%02x%02x FPSCR: %08x MSR: %08x LR: %08x %s %s",
- PC, SRR0, SRR1, PowerPC::ppcState.cr_fast[0], PowerPC::ppcState.cr_fast[1], PowerPC::ppcState.cr_fast[2], PowerPC::ppcState.cr_fast[3],
- PowerPC::ppcState.cr_fast[4], PowerPC::ppcState.cr_fast[5], PowerPC::ppcState.cr_fast[6], PowerPC::ppcState.cr_fast[7], PowerPC::ppcState.fpscr,
- PowerPC::ppcState.msr, PowerPC::ppcState.spr[8], regs.c_str(), fregs.c_str());
+ DEBUG_LOG(DYNA_REC, "JIT64 PC: %08x SRR0: %08x SRR1: %08x FPSCR: %08x MSR: %08x LR: %08x %s %s",
+ PC, SRR0, SRR1, PowerPC::ppcState.fpscr, PowerPC::ppcState.msr, PowerPC::ppcState.spr[8], regs.c_str(), fregs.c_str());
}
void STACKALIGN Jit64::Jit(u32 em_address)
@@ -106,6 +106,16 @@ class Jit64 : public Jitx86Base
void GenerateRC();
void ComputeRC(const Gen::OpArg & arg);
+ // Reads a given bit of a given CR register part. Clobbers ABI_PARAM1, so
+ // don't forget to xlock it beforehand.
+ void GetCRFieldBit(int field, int bit, Gen::X64Reg out);
+ // Clobbers ABI_PARAM1 and ABI_PARAM2; xlock them beforehand.
+ void SetCRFieldBit(int field, int bit, Gen::X64Reg in);
+
+ // Generates a branch that will check if a given bit of a CR register part
+ // is set or not.
+ FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
+
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
@@ -117,11 +117,8 @@ void Jit64::bcx(UGeckoInstruction inst)
FixupBranch pConditionDontBranch;
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0) // Test a CR bit
{
- TEST(8, M(&PowerPC::ppcState.cr_fast[inst.BI >> 2]), Imm8(8 >> (inst.BI & 3)));
- if (inst.BO & BO_BRANCH_IF_TRUE) // Conditional branch
- pConditionDontBranch = J_CC(CC_Z, true);
- else
- pConditionDontBranch = J_CC(CC_NZ, true);
+ pConditionDontBranch = JumpIfCRFieldBit(inst.BI >> 2, 3 - (inst.BI & 3),
+ !(inst.BO_2 & BO_BRANCH_IF_TRUE));
}
if (inst.LK)
@@ -179,14 +176,8 @@ void Jit64::bcctrx(UGeckoInstruction inst)
// BO_2 == 001zy -> b if false
// BO_2 == 011zy -> b if true
- // Ripped from bclrx
- TEST(8, M(&PowerPC::ppcState.cr_fast[inst.BI >> 2]), Imm8(8 >> (inst.BI & 3)));
- Gen::CCFlags branch;
- if (inst.BO_2 & BO_BRANCH_IF_TRUE)
- branch = CC_Z;
- else
- branch = CC_NZ;
- FixupBranch b = J_CC(branch, true);
+ FixupBranch b = JumpIfCRFieldBit(inst.BI >> 2, 3 - (inst.BI & 3),
+ !(inst.BO_2 & BO_BRANCH_IF_TRUE));
MOV(32, R(EAX), M(&CTR));
AND(32, R(EAX), Imm32(0xFFFFFFFC));
//MOV(32, M(&PC), R(EAX)); => Already done in WriteExitDestInEAX()
@@ -222,11 +213,8 @@ void Jit64::bclrx(UGeckoInstruction inst)
FixupBranch pConditionDontBranch;
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0) // Test a CR bit
{
- TEST(8, M(&PowerPC::ppcState.cr_fast[inst.BI >> 2]), Imm8(8 >> (inst.BI & 3)));
- if (inst.BO & BO_BRANCH_IF_TRUE) // Conditional branch
- pConditionDontBranch = J_CC(CC_Z, true);
- else
- pConditionDontBranch = J_CC(CC_NZ, true);
+ pConditionDontBranch = JumpIfCRFieldBit(inst.BI >> 2, 3 - (inst.BI & 3),
+ !(inst.BO_2 & BO_BRANCH_IF_TRUE));
}
// This below line can be used to prove that blr "eats flags" in practice.
@@ -237,26 +237,33 @@ void Jit64::fcmpx(UGeckoInstruction inst)
pGreater = J_CC(CC_B);
}
- // Equal
- MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
+ // Read the documentation about cr_val in PowerPC.h to understand these
+ // magic values.
+
+ // Equal: !GT (bit 63 set), !LT (bit 62 not set), !SO (bit 61 not set), EQ
+ // (bits 31-0 not set).
+ MOV(64, R(RAX), Imm64(0x8000000000000000));
continue1 = J();
- // NAN
+ // NAN: !GT (bit 63 set), !LT (bit 62 not set), SO (bit 61 set), !EQ (bit 0
+ // set).
SetJumpTarget(pNaN);
- MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x1));
+ MOV(64, R(RAX), Imm64(0xA000000000000001));
if (a != b)
{
continue2 = J();
- // Greater Than
+ // Greater Than: GT (bit 63 not set), !LT (bit 62 not set), !SO (bit 61
+ // not set), !EQ (bit 0 set).
SetJumpTarget(pGreater);
- MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
+ MOV(64, R(RAX), Imm64(0x0000000000000001));
continue3 = J();
- // Less Than
+ // Less Than: !GT (bit 63 set), LT (bit 62 set), !SO (bit 61 not set),
+ // !EQ (bit 0 set).
SetJumpTarget(pLesser);
- MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
+ MOV(64, R(RAX), Imm64(0xC000000000000001));
}
SetJumpTarget(continue1);
@@ -266,6 +273,7 @@ void Jit64::fcmpx(UGeckoInstruction inst)
SetJumpTarget(continue3);
}
+ MOV(64, M(&PowerPC::ppcState.cr_val[crf]), R(RAX));
fpr.UnlockAll();
}
Oops, something went wrong.

0 comments on commit 0ff1481

Please sign in to comment.