From 5d80145dc0de8d742f19995310bd04e442f521e1 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sat, 6 Sep 2014 19:33:34 -0700 Subject: [PATCH 1/6] JIT: unify subfex/subfmex/subfzex/addex/addmex/addzex code Shorter, plus should make future optimizations easier. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 7 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 14 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 147 +++--------------- 3 files changed, 30 insertions(+), 138 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 821df102d168..dc3eae8a4cae 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -147,9 +147,7 @@ class Jit64 : public Jitx86Base void divwx(UGeckoInstruction inst); void srawix(UGeckoInstruction inst); void srawx(UGeckoInstruction inst); - void addex(UGeckoInstruction inst); - void addmex(UGeckoInstruction inst); - void addzex(UGeckoInstruction inst); + void arithXex(UGeckoInstruction inst); void extsXx(UGeckoInstruction inst); @@ -219,9 +217,6 @@ class Jit64 : public Jitx86Base void subfic(UGeckoInstruction inst); void subfcx(UGeckoInstruction inst); void subfx(UGeckoInstruction inst); - void subfex(UGeckoInstruction inst); - void subfmex(UGeckoInstruction inst); - void subfzex(UGeckoInstruction inst); void twx(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 927e83353f65..a285c815c2d0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -296,10 +296,10 @@ static GekkoOPTemplate table31_2[] = {778, &Jit64::addx}, //"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, {10, &Jit64::addcx}, //"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, {522, &Jit64::addcx}, //"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, - {138, &Jit64::addex}, //"addex", 
OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, - {650, &Jit64::addex}, //"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, - {234, &Jit64::addmex}, //"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, - {202, &Jit64::addzex}, //"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, + {138, &Jit64::arithXex}, //"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, + {650, &Jit64::arithXex}, //"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, + {234, &Jit64::arithXex}, //"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, + {202, &Jit64::arithXex}, //"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, {491, &Jit64::divwx}, //"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}}, {1003, &Jit64::divwx}, //"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}}, {459, &Jit64::divwux}, //"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}}, @@ -313,9 +313,9 @@ static GekkoOPTemplate table31_2[] = {552, &Jit64::subfx}, //"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, {8, &Jit64::subfcx}, //"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, {520, &Jit64::subfcx}, //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, - {136, &Jit64::subfex}, //"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, - {232, &Jit64::subfmex}, //"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, - {200, &Jit64::subfzex}, //"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, + {136, &Jit64::arithXex}, //"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, + {232, &Jit64::arithXex}, //"subfmex", OPTYPE_INTEGER, FL_OUT_D 
| FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, + {200, &Jit64::arithXex}, //"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, }; static GekkoOPTemplate table59[] = diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 10ea9f844876..cf1e226c7c02 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -781,84 +781,6 @@ void Jit64::subfcx(UGeckoInstruction inst) gpr.UnlockAll(); } -void Jit64::subfex(UGeckoInstruction inst) -{ - INSTRUCTION_START; - JITDISABLE(bJITIntegerOff); - int a = inst.RA, b = inst.RB, d = inst.RD; - gpr.Lock(a, b, d); - gpr.BindToRegister(d, (d == a || d == b), true); - - JitGetAndClearCAOV(inst.OE); - - bool invertedCarry = false; - if (d == b) - { - // Convert carry to borrow - CMC(); - SBB(32, gpr.R(d), gpr.R(a)); - invertedCarry = true; - } - else if (d == a) - { - NOT(32, gpr.R(d)); - ADC(32, gpr.R(d), gpr.R(b)); - } - else - { - MOV(32, gpr.R(d), gpr.R(a)); - NOT(32, gpr.R(d)); - ADC(32, gpr.R(d), gpr.R(b)); - } - FinalizeCarryOverflow(inst.OE, invertedCarry); - if (inst.Rc) - ComputeRC(gpr.R(d)); - - gpr.UnlockAll(); -} - -void Jit64::subfmex(UGeckoInstruction inst) -{ - // USES_XER - INSTRUCTION_START - JITDISABLE(bJITIntegerOff); - int a = inst.RA, d = inst.RD; - gpr.Lock(a, d); - gpr.BindToRegister(d, d == a); - - JitGetAndClearCAOV(inst.OE); - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - NOT(32, gpr.R(d)); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryOverflow(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); -} - -void Jit64::subfzex(UGeckoInstruction inst) -{ - // USES_XER - INSTRUCTION_START - JITDISABLE(bJITIntegerOff); - int a = inst.RA, d = inst.RD; - - gpr.Lock(a, d); - gpr.BindToRegister(d, d == a); - - JitGetAndClearCAOV(inst.OE); - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - NOT(32, gpr.R(d)); - ADC(32, gpr.R(d), Imm8(0)); - 
FinalizeCarryOverflow(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - - gpr.UnlockAll(); -} - void Jit64::subfx(UGeckoInstruction inst) { INSTRUCTION_START @@ -1325,26 +1247,39 @@ void Jit64::addx(UGeckoInstruction inst) } } -void Jit64::addex(UGeckoInstruction inst) +void Jit64::arithXex(UGeckoInstruction inst) { - // USES_XER INSTRUCTION_START JITDISABLE(bJITIntegerOff); - int a = inst.RA, b = inst.RB, d = inst.RD; + bool regsource = !(inst.SUBOP10 & 64); // addex or subfex + bool mex = !!(inst.SUBOP10 & 32); // addmex/subfmex or addzex/subfzex + bool add = !!(inst.SUBOP10 & 2); // add or sub + int a = inst.RA; + int b = regsource ? inst.RB : a; + int d = inst.RD; gpr.Lock(a, b, d); - gpr.BindToRegister(d, (d == a) || (d == b)); + gpr.BindToRegister(d, d == a || d == b); JitGetAndClearCAOV(inst.OE); - if ((d == a) || (d == b)) + + bool invertedCarry = false; + if (!add && regsource && d == b) { - ADC(32, gpr.R(d), gpr.R((d == a) ? b : a)); + // Convert carry to borrow + CMC(); + SBB(32, gpr.R(d), gpr.R(a)); + invertedCarry = true; } else { - MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), gpr.R(b)); + OpArg source = regsource ? gpr.R(d == b ? a : b) : Imm32(mex ? 
0xFFFFFFFF : 0); + if (d != a && d != b) + MOV(32, gpr.R(d), gpr.R(a)); + if (!add) + NOT(32, gpr.R(d)); + ADC(32, gpr.R(d), source); } - FinalizeCarryOverflow(inst.OE); + FinalizeCarryOverflow(inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1382,44 +1317,6 @@ void Jit64::addcx(UGeckoInstruction inst) } } -void Jit64::addmex(UGeckoInstruction inst) -{ - // USES_XER - INSTRUCTION_START - JITDISABLE(bJITIntegerOff); - int a = inst.RA, d = inst.RD; - - gpr.Lock(d); - gpr.BindToRegister(d, d == a); - JitGetAndClearCAOV(inst.OE); - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryOverflow(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); -} - -void Jit64::addzex(UGeckoInstruction inst) -{ - // USES_XER - INSTRUCTION_START - JITDISABLE(bJITIntegerOff); - int a = inst.RA, d = inst.RD; - - gpr.Lock(d); - gpr.BindToRegister(d, d == a); - JitGetAndClearCAOV(inst.OE); - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryOverflow(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); -} - void Jit64::rlwinmx(UGeckoInstruction inst) { INSTRUCTION_START From a6c9515b00de392e3075d01287dd5ab38cab0bad Mon Sep 17 00:00:00 2001 From: Fiora Date: Sat, 6 Sep 2014 22:34:40 -0700 Subject: [PATCH 2/6] JIT: unify subfcx and addcx code Again, shorter and should make future optimizations easier. 
--- Source/Core/Core/PowerPC/Jit64/Jit.h | 3 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 8 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 77 +++++++------------ 3 files changed, 33 insertions(+), 55 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index dc3eae8a4cae..dd5af56673c8 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -139,7 +139,7 @@ class Jit64 : public Jitx86Base void DynaRunTable63(UGeckoInstruction _inst); void addx(UGeckoInstruction inst); - void addcx(UGeckoInstruction inst); + void arithcx(UGeckoInstruction inst); void mulli(UGeckoInstruction inst); void mulhwXx(UGeckoInstruction inst); void mullwx(UGeckoInstruction inst); @@ -215,7 +215,6 @@ class Jit64 : public Jitx86Base void dcbz(UGeckoInstruction inst); void subfic(UGeckoInstruction inst); - void subfcx(UGeckoInstruction inst); void subfx(UGeckoInstruction inst); void twx(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index a285c815c2d0..98c986a2f080 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -294,8 +294,8 @@ static GekkoOPTemplate table31_2[] = { {266, &Jit64::addx}, //"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, {778, &Jit64::addx}, //"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, - {10, &Jit64::addcx}, //"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, - {522, &Jit64::addcx}, //"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, + {10, &Jit64::arithcx}, //"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, + {522, &Jit64::arithcx}, //"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, {138, &Jit64::arithXex}, //"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, {650, 
&Jit64::arithXex}, //"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, {234, &Jit64::arithXex}, //"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, @@ -311,8 +311,8 @@ static GekkoOPTemplate table31_2[] = {104, &Jit64::negx}, //"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, {40, &Jit64::subfx}, //"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, {552, &Jit64::subfx}, //"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, - {8, &Jit64::subfcx}, //"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, - {520, &Jit64::subfcx}, //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, + {8, &Jit64::arithcx}, //"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, + {520, &Jit64::arithcx}, //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, {136, &Jit64::arithXex}, //"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, {232, &Jit64::arithXex}, //"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, {200, &Jit64::arithXex}, //"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index cf1e226c7c02..0b6f0dac4f58 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -750,36 +750,7 @@ void Jit64::subfic(UGeckoInstruction inst) // This instruction has no RC flag } -void Jit64::subfcx(UGeckoInstruction inst) -{ - INSTRUCTION_START; - JITDISABLE(bJITIntegerOff); - int a = inst.RA, b = inst.RB, d = inst.RD; - gpr.Lock(a, b, d); - gpr.BindToRegister(d, (d == a || d == b), true); - JitClearCAOV(inst.OE); - if (d == b) - { - SUB(32, gpr.R(d), gpr.R(a)); - } - else if (d == a) - { - MOV(32, R(RSCRATCH), gpr.R(a)); - MOV(32, gpr.R(d), 
gpr.R(b)); - SUB(32, gpr.R(d), R(RSCRATCH)); - } - else - { - MOV(32, gpr.R(d), gpr.R(b)); - SUB(32, gpr.R(d), gpr.R(a)); - } - if (inst.Rc) - ComputeRC(gpr.R(d)); - FinalizeCarryOverflow(inst.OE, true); - - gpr.UnlockAll(); -} void Jit64::subfx(UGeckoInstruction inst) { @@ -1285,36 +1256,44 @@ void Jit64::arithXex(UGeckoInstruction inst) gpr.UnlockAll(); } -void Jit64::addcx(UGeckoInstruction inst) +void Jit64::arithcx(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITIntegerOff); + bool add = !!(inst.SUBOP10 & 2); // add or sub int a = inst.RA, b = inst.RB, d = inst.RD; + gpr.Lock(a, b, d); + gpr.BindToRegister(d, d == a || d == b, true); + JitClearCAOV(inst.OE); - if ((d == a) || (d == b)) + if (d == a && d != b) { - int operand = ((d == a) ? b : a); - gpr.Lock(a, b, d); - gpr.BindToRegister(d, true); - JitClearCAOV(inst.OE); - ADD(32, gpr.R(d), gpr.R(operand)); - FinalizeCarryOverflow(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); + if (add) + { + ADD(32, gpr.R(d), gpr.R(b)); + } + else + { + // special case, because sub isn't reversible + MOV(32, R(RSCRATCH), gpr.R(a)); + MOV(32, gpr.R(d), gpr.R(b)); + SUB(32, gpr.R(d), R(RSCRATCH)); + } } else { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - JitClearCAOV(inst.OE); - MOV(32, gpr.R(d), gpr.R(a)); - ADD(32, gpr.R(d), gpr.R(b)); - FinalizeCarryOverflow(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); + if (d != b) + MOV(32, gpr.R(d), gpr.R(b)); + if (add) + ADD(32, gpr.R(d), gpr.R(a)); + else + SUB(32, gpr.R(d), gpr.R(a)); } + + FinalizeCarryOverflow(inst.OE, !add); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::rlwinmx(UGeckoInstruction inst) From bea2504a5111b00a0ce618f28c0777fb299307f3 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 21 Aug 2014 13:56:18 -0700 Subject: [PATCH 3/6] JIT64: optimize carry calculations Omit carry calculations that get overwritten later in the block before they're used. 
Very common in the case of srawix and friends. --- .../Interpreter/Interpreter_Tables.cpp | 8 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 4 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 123 ++++++++++-------- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 10 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 2 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 44 ++++--- Source/Core/Core/PowerPC/PPCAnalyst.h | 4 +- 9 files changed, 116 insertions(+), 83 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index 2bf66ae99b12..317132266dc8 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] = {10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}}, - {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}}, + {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}}, {14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, {15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, @@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] = {922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, - {792, Interpreter::srawx, {"srawx", 
OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, - {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, + {792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}}, @@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] = {339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}}, {467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}}, {371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}}, - {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, 0, 1, 0, 0, 0}}, + {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}}, {595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, {659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index dd5af56673c8..498d833dd7e5 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -100,7 +100,7 @@ class Jit64 : public Jitx86Base void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); void GenerateOverflow(); - void FinalizeCarryOverflow(bool oe, bool inv = false); + void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false); void ComputeRC(const Gen::OpArg & arg); // Use to extract bytes from a register using the regcache. offset is in bytes. 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 98c986a2f080..f9e0ac97d50f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] = {922, &Jit64::extsXx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, &Jit64::extsXx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, - {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, - {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, + {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}}, + {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}}, {24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 0b6f0dac4f58..974ae1569d3d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -45,7 +45,7 @@ void Jit64::GenerateOverflow() } // Assumes CA,OV are clear -void Jit64::FinalizeCarryOverflow(bool oe, bool inv) +void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv) { // USES_XER if (oe) @@ -53,15 +53,17 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv) // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both // sides of the branch. FixupBranch jno = J_CC(CC_NO); - JitSetCAIf(inv ? CC_NC : CC_C); + if (ca) + JitSetCAIf(inv ? 
CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); FixupBranch exit = J(); SetJumpTarget(jno); - JitSetCAIf(inv ? CC_NC : CC_C); + if (ca) + JitSetCAIf(inv ? CC_NC : CC_C); SetJumpTarget(exit); } - else + else if (ca) { // Do carry JitSetCAIf(inv ? CC_NC : CC_C); @@ -129,10 +131,11 @@ static u32 Xor(u32 a, u32 b) void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry) { gpr.Lock(d, a); - if (a || binary || carry) // yeh nasty special case addic + // Be careful; addic treats r0 as r0, but addi treats r0 as zero. + if (a || binary || carry) { - if (carry) - JitClearCAOV(false); + carry &= js.op->wantsCA; + JitClearCAOV(carry, false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); @@ -717,41 +720,43 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - JitSetCAIf(CC_NC); + if (js.op->wantsCA) + JitSetCAIf(CC_NC); } else if (imm == -1) { // CA is always set in this case - JitSetCA(); + if (js.op->wantsCA) + JitSetCA(); NOT(32, gpr.R(d)); } else { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - JitSetCAIf(CC_C); + if (js.op->wantsCA) + JitSetCAIf(CC_C); } } else { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - JitSetCAIf(CC_NC); + if (js.op->wantsCA) + JitSetCAIf(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag } - - void Jit64::subfx(UGeckoInstruction inst) { INSTRUCTION_START @@ -1250,7 +1255,7 @@ void Jit64::arithXex(UGeckoInstruction inst) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), source); } - 
FinalizeCarryOverflow(inst.OE, invertedCarry); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1264,7 +1269,7 @@ void Jit64::arithcx(UGeckoInstruction inst) int a = inst.RA, b = inst.RB, d = inst.RD; gpr.Lock(a, b, d); gpr.BindToRegister(d, d == a || d == b, true); - JitClearCAOV(inst.OE); + JitClearCAOV(js.op->wantsCA, inst.OE); if (d == a && d != b) { @@ -1290,7 +1295,7 @@ void Jit64::arithcx(UGeckoInstruction inst) SUB(32, gpr.R(d), gpr.R(a)); } - FinalizeCarryOverflow(inst.OE, !add); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE, !add); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1683,16 +1688,23 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.FlushLockX(ECX); gpr.Lock(a, s, b); gpr.BindToRegister(a, (a == s || a == b), true); - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); MOV(32, R(ECX), gpr.R(b)); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); SHL(64, gpr.R(a), Imm8(32)); SAR(64, gpr.R(a), R(ECX)); - MOV(32, R(RSCRATCH), gpr.R(a)); - SHR(64, gpr.R(a), Imm8(32)); - TEST(32, gpr.R(a), R(RSCRATCH)); - JitSetCAIf(CC_NZ); + if (js.op->wantsCA) + { + MOV(32, R(RSCRATCH), gpr.R(a)); + SHR(64, gpr.R(a), Imm8(32)); + TEST(32, gpr.R(a), R(RSCRATCH)); + JitSetCAIf(CC_NZ); + } + else + { + SHR(64, gpr.R(a), Imm8(32)); + } gpr.UnlockAll(); gpr.UnlockAllX(); if (inst.Rc) @@ -1710,41 +1722,50 @@ void Jit64::srawix(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - MOV(32, R(RSCRATCH), gpr.R(s)); - if (a != s) - MOV(32, gpr.R(a), R(RSCRATCH)); - // some optimized common cases that can be done in slightly fewer ops - if (amount == 31) - { - JitSetCA(); - SAR(32, gpr.R(a), Imm8(31)); - NEG(32, R(RSCRATCH)); // RSCRATCH = input == INT_MIN ? INT_MIN : -input; - AND(32, R(RSCRATCH), Imm32(0x80000000)); // RSCRATCH = input < 0 && input != INT_MIN ? 
0 : 0x80000000 - SHR(32, R(RSCRATCH), Imm8(31 - XER_CA_SHIFT)); - XOR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = (input < 0 && input != INT_MIN) - } - else if (amount == 1) - { - JitClearCAOV(false); - SHR(32, R(RSCRATCH), Imm8(31)); // sign - AND(32, R(RSCRATCH), gpr.R(a)); // (sign && carry) - SAR(32, gpr.R(a), Imm8(1)); - SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT)); - OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001 + if (!js.op->wantsCA) + { + if (a != s) + MOV(32, gpr.R(a), gpr.R(s)); + SAR(32, gpr.R(a), Imm8(amount)); } else { - JitClearCAOV(false); - SAR(32, gpr.R(a), Imm8(amount)); - SHL(32, R(RSCRATCH), Imm8(32 - amount)); - TEST(32, R(RSCRATCH), gpr.R(a)); - JitSetCAIf(CC_NZ); + MOV(32, R(RSCRATCH), gpr.R(s)); + if (a != s) + MOV(32, gpr.R(a), R(RSCRATCH)); + // some optimized common cases that can be done in slightly fewer ops + if (amount == 31) + { + JitSetCA(); + SAR(32, gpr.R(a), Imm8(31)); + NEG(32, R(RSCRATCH)); // RSCRATCH = input == INT_MIN ? INT_MIN : -input; + AND(32, R(RSCRATCH), Imm32(0x80000000)); // RSCRATCH = input < 0 && input != INT_MIN ? 
0 : 0x80000000 + SHR(32, R(RSCRATCH), Imm8(31 - XER_CA_SHIFT)); + XOR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = (input < 0 && input != INT_MIN) + } + else if (amount == 1) + { + JitClearCAOV(true, false); + SHR(32, R(RSCRATCH), Imm8(31)); // sign + AND(32, R(RSCRATCH), gpr.R(a)); // (sign && carry) + SAR(32, gpr.R(a), Imm8(1)); + SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT)); + OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001 + } + else + { + JitClearCAOV(true, false); + SAR(32, gpr.R(a), Imm8(amount)); + SHL(32, R(RSCRATCH), Imm8(32 - amount)); + TEST(32, R(RSCRATCH), gpr.R(a)); + JitSetCAIf(CC_NZ); + } } } else { gpr.Lock(a, s); - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); gpr.BindToRegister(a, a == s, true); if (a != s) diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index 695dab795edf..0305e22ac532 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1106,7 +1106,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) Jit->JitSetCA(); FixupBranch cont = Jit->J(); Jit->SetJumpTarget(nocarry); - Jit->JitClearCAOV(false); + Jit->JitClearCAOV(true, false); Jit->SetJumpTarget(cont); regNormalRegClear(RI, I); break; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index c12b9fedcfc4..2b1a0ef7c114 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -825,10 +825,10 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode) OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1 } -void EmuCodeBlock::JitClearCAOV(bool oe) +void EmuCodeBlock::JitClearCAOV(bool ca, bool oe) { - if (oe) - AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0 - else - AND(32, PPCSTATE(spr[SPR_XER]), 
Imm32(~XER_CA_MASK)); //XER.CA = 0 + u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF); + if (mask == 0xFFFFFFFF) + return; + AND(32, PPCSTATE(spr[SPR_XER]), Imm32(mask)); } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index e50eedf08fa7..221ce455ab67 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -74,7 +74,7 @@ class EmuCodeBlock : public Gen::X64CodeBlock void JitGetAndClearCAOV(bool oe); void JitSetCA(); void JitSetCAIf(Gen::CCFlags conditionCode); - void JitClearCAOV(bool oe); + void JitClearCAOV(bool ca, bool oe); void ForceSinglePrecisionS(Gen::X64Reg xmm); void ForceSinglePrecisionP(Gen::X64Reg xmm); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 55adc3f17252..e7c06a200937 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -430,7 +430,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf { code->wantsCR0 = false; code->wantsCR1 = false; - code->wantsPS1 = false; if (opinfo->flags & FL_USE_FPU) block->m_fpa->any = true; @@ -458,6 +457,15 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false; code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false; + code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false; + code->outputCA = (opinfo->flags & FL_SET_CA) ? 
true : false; + + // mfspr/mtspr can affect/use XER, so be super careful here + if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr + code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; + if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr + code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; + int numOut = 0; int numIn = 0; if (opinfo->flags & FL_OUT_A) @@ -715,26 +723,30 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 block->m_broken = true; } - // Scan for CR0 dependency - // assume next block wants flags to be safe + // Scan for flag dependencies; assume the next block (or any branch that can leave the block) + // wants flags, to be safe. bool wantsCR0 = true; bool wantsCR1 = true; - bool wantsPS1 = true; bool wantsFPRF = true; + bool wantsCA = true; for (int i = block->m_num_instructions - 1; i >= 0; i--) { - wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock; - wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock; - wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock; - wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock; - code[i].wantsCR0 = wantsCR0; - code[i].wantsCR1 = wantsCR1; - code[i].wantsPS1 = wantsPS1; - code[i].wantsFPRF = wantsFPRF; - wantsCR0 &= !code[i].outputCR0; - wantsCR1 &= !code[i].outputCR1; - wantsPS1 &= !code[i].outputPS1; - wantsFPRF &= !code[i].outputFPRF; + bool opWantsCR0 = code[i].wantsCR0; + bool opWantsCR1 = code[i].wantsCR1; + bool opWantsFPRF = code[i].wantsFPRF; + bool opWantsCA = code[i].wantsCA; + code[i].wantsCR0 = wantsCR0 || code[i].canEndBlock; + code[i].wantsCR1 = wantsCR1 || code[i].canEndBlock; + code[i].wantsFPRF = wantsFPRF || code[i].canEndBlock; + code[i].wantsCA = wantsCA || code[i].canEndBlock; + wantsCR0 |= opWantsCR0 || code[i].canEndBlock; + wantsCR1 |= opWantsCR1 || code[i].canEndBlock; + wantsFPRF |= opWantsFPRF || code[i].canEndBlock; + wantsCA |= opWantsCA || code[i].canEndBlock; + 
wantsCR0 &= !code[i].outputCR0 || opWantsCR0; + wantsCR1 &= !code[i].outputCR1 || opWantsCR1; + wantsFPRF &= !code[i].outputFPRF || opWantsFPRF; + wantsCA &= !code[i].outputCA || opWantsCA; } return address; } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 4a9058112a6e..774129a5d0fe 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -33,12 +33,12 @@ struct CodeOp //16B bool isBranchTarget; bool wantsCR0; bool wantsCR1; - bool wantsPS1; bool wantsFPRF; + bool wantsCA; bool outputCR0; bool outputCR1; - bool outputPS1; bool outputFPRF; + bool outputCA; bool canEndBlock; bool skip; // followed BL-s for example }; From 45d84605a9a2a0115976af061d318345985ac422 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 7 Sep 2014 00:37:47 -0700 Subject: [PATCH 4/6] JIT64: optimize carry calculations further Keep carry flags in the x86 flags register if used in the next instruction. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 3 + Source/Core/Core/PowerPC/Jit64/Jit.h | 4 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 2 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 129 ++++++++++++------ Source/Core/Core/PowerPC/JitCommon/JitBase.h | 5 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 1 + Source/Core/Core/PowerPC/PPCAnalyst.cpp | 6 + Source/Core/Core/PowerPC/PPCAnalyst.h | 1 + 8 files changed, 109 insertions(+), 42 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 7412489948a5..619217e4b69b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -457,6 +457,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.downcountAmount += PatchEngine::GetSpeedhackCycles(code_block.m_address); js.skipnext = false; + js.carryFlagSet = false; + js.carryFlagInverted = false; js.compilerPC = nextPC; // Translate instructions for (u32 i = 0; i < 
code_block.m_num_instructions; i++) @@ -488,6 +490,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc // help peephole optimizations js.next_inst = ops[i + 1].inst; js.next_compilerPC = ops[i + 1].address; + js.next_op = &ops[i + 1]; } if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 498d833dd7e5..53a846237f02 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -100,7 +100,9 @@ class Jit64 : public Jitx86Base void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); void GenerateOverflow(); - void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false); + void FinalizeCarryOverflow(bool oe, bool inv = false); + void FinalizeCarry(Gen::CCFlags cond); + void FinalizeCarry(bool ca); void ComputeRC(const Gen::OpArg & arg); // Use to extract bytes from a register using the regcache. offset is in bytes. 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index f9e0ac97d50f..88f686023a9c 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -273,7 +273,7 @@ static GekkoOPTemplate table31[] = {339, &Jit64::mfspr}, //"mfspr", OPTYPE_SPR, FL_OUT_D}}, {467, &Jit64::mtspr}, //"mtspr", OPTYPE_SPR, 0, 2}}, {371, &Jit64::mftb}, //"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER}}, - {512, &Jit64::mcrxr}, //"mcrxr", OPTYPE_SYSTEM, 0}}, + {512, &Jit64::mcrxr}, //"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA}}, {595, &Jit64::FallBackToInterpreter}, //"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 2}}, {659, &Jit64::FallBackToInterpreter}, //"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 2}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 974ae1569d3d..203a5fae00ad 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -44,30 +44,76 @@ void Jit64::GenerateOverflow() SetJumpTarget(exit); } +void Jit64::FinalizeCarry(CCFlags cond) +{ + js.carryFlagSet = false; + js.carryFlagInverted = false; + if (js.op->wantsCA) + { + if (js.next_op->wantsCAInFlags) + { + if (cond == CC_C || cond == CC_NC) + { + js.carryFlagInverted = cond == CC_NC; + } + else + { + // convert the condition to a carry flag (is there a better way?) 
+ SETcc(cond, R(RSCRATCH)); + BT(8, R(RSCRATCH), Imm8(0)); + } + js.carryFlagSet = true; + } + else + { + JitSetCAIf(cond); + } + } +} + +// Unconditional version +void Jit64::FinalizeCarry(bool ca) +{ + js.carryFlagSet = false; + js.carryFlagInverted = false; + if (js.op->wantsCA) + { + if (js.next_op->wantsCAInFlags) + { + if (ca) + STC(); + else + CLC(); + js.carryFlagSet = true; + } + else if (ca) + { + JitSetCA(); + } + else + { + JitClearCAOV(true, false); + } + } +} + // Assumes CA,OV are clear -void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv) +void Jit64::FinalizeCarryOverflow(bool oe, bool inv) { // USES_XER if (oe) { - // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both - // sides of the branch. + // Make sure not to lose the carry flags (not a big deal, this path is rare). + PUSHF(); + AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~(XER_SO_MASK | XER_OV_MASK))); FixupBranch jno = J_CC(CC_NO); - if (ca) - JitSetCAIf(inv ? CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); - FixupBranch exit = J(); SetJumpTarget(jno); - if (ca) - JitSetCAIf(inv ? CC_NC : CC_C); - SetJumpTarget(exit); - } - else if (ca) - { - // Do carry - JitSetCAIf(inv ? CC_NC : CC_C); + POPF(); } + // Do carry + FinalizeCarry(inv ? 
CC_NC : CC_C); } void Jit64::ComputeRC(const Gen::OpArg & arg) @@ -135,7 +181,6 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void if (a || binary || carry) { carry &= js.op->wantsCA; - JitClearCAOV(carry, false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); @@ -159,7 +204,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void } } if (carry) - JitSetCAIf(CC_C); + FinalizeCarry(CC_C); if (Rc) ComputeRC(gpr.R(d)); } @@ -720,38 +765,31 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCAOV(js.op->wantsCA, false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - if (js.op->wantsCA) - JitSetCAIf(CC_NC); + FinalizeCarry(CC_NC); } else if (imm == -1) { - // CA is always set in this case - if (js.op->wantsCA) - JitSetCA(); NOT(32, gpr.R(d)); + // CA is always set in this case + FinalizeCarry(true); } else { - JitClearCAOV(js.op->wantsCA, false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - if (js.op->wantsCA) - JitSetCAIf(CC_C); + FinalizeCarry(CC_C); } } else { - JitClearCAOV(js.op->wantsCA, false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - if (js.op->wantsCA) - JitSetCAIf(CC_NC); + FinalizeCarry(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag @@ -1233,29 +1271,44 @@ void Jit64::arithXex(UGeckoInstruction inst) int a = inst.RA; int b = regsource ? 
inst.RB : a; int d = inst.RD; + bool same_input_sub = !add && regsource && a == b; gpr.Lock(a, b, d); - gpr.BindToRegister(d, d == a || d == b); - JitGetAndClearCAOV(inst.OE); + gpr.BindToRegister(d, !same_input_sub && (d == a || d == b)); + if (!js.carryFlagSet) + JitGetAndClearCAOV(inst.OE); bool invertedCarry = false; - if (!add && regsource && d == b) + // Special case: subfe A, B, B is a common compiler idiom + if (same_input_sub) { // Convert carry to borrow - CMC(); + if (!js.carryFlagInverted) + CMC(); + SBB(32, gpr.R(d), gpr.R(d)); + invertedCarry = true; + } + else if (!add && regsource && d == b) + { + if (!js.carryFlagInverted) + CMC(); + if (d != b) + MOV(32, gpr.R(d), gpr.R(b)); SBB(32, gpr.R(d), gpr.R(a)); invertedCarry = true; } else { OpArg source = regsource ? gpr.R(d == b ? a : b) : Imm32(mex ? 0xFFFFFFFF : 0); + if (js.carryFlagInverted) + CMC(); if (d != a && d != b) MOV(32, gpr.R(d), gpr.R(a)); if (!add) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), source); } - FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry); + FinalizeCarryOverflow(inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1269,7 +1322,6 @@ void Jit64::arithcx(UGeckoInstruction inst) int a = inst.RA, b = inst.RB, d = inst.RD; gpr.Lock(a, b, d); gpr.BindToRegister(d, d == a || d == b, true); - JitClearCAOV(js.op->wantsCA, inst.OE); if (d == a && d != b) { @@ -1295,7 +1347,7 @@ void Jit64::arithcx(UGeckoInstruction inst) SUB(32, gpr.R(d), gpr.R(a)); } - FinalizeCarryOverflow(js.op->wantsCA, inst.OE, !add); + FinalizeCarryOverflow(inst.OE, !add); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1688,7 +1740,6 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.FlushLockX(ECX); gpr.Lock(a, s, b); gpr.BindToRegister(a, (a == s || a == b), true); - JitClearCAOV(js.op->wantsCA, false); MOV(32, R(ECX), gpr.R(b)); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); @@ -1699,12 +1750,12 @@ void Jit64::srawx(UGeckoInstruction inst) MOV(32, R(RSCRATCH), gpr.R(a)); 
SHR(64, gpr.R(a), Imm8(32)); TEST(32, gpr.R(a), R(RSCRATCH)); - JitSetCAIf(CC_NZ); } else { SHR(64, gpr.R(a), Imm8(32)); } + FinalizeCarry(CC_NZ); gpr.UnlockAll(); gpr.UnlockAllX(); if (inst.Rc) @@ -1758,14 +1809,14 @@ void Jit64::srawix(UGeckoInstruction inst) SAR(32, gpr.R(a), Imm8(amount)); SHL(32, R(RSCRATCH), Imm8(32 - amount)); TEST(32, R(RSCRATCH), gpr.R(a)); - JitSetCAIf(CC_NZ); + FinalizeCarry(CC_NZ); } } } else { gpr.Lock(a, s); - JitClearCAOV(js.op->wantsCA, false); + FinalizeCarry(false); gpr.BindToRegister(a, a == s, true); if (a != s) diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 816bfeae13bb..c6ff6e496747 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -81,13 +81,16 @@ class JitBase : public CPUCoreBase bool isLastInstruction; bool memcheck; bool skipnext; + bool carryFlagSet; + bool carryFlagInverted; int fifoBytesThisBlock; PPCAnalyst::BlockStats st; PPCAnalyst::BlockRegStats gpa; PPCAnalyst::BlockRegStats fpa; - PPCAnalyst::CodeOp *op; + PPCAnalyst::CodeOp* op; + PPCAnalyst::CodeOp* next_op; u8* rewriteStart; JitBlock *curBlock; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 2b1a0ef7c114..ee7441a607ba 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -822,6 +822,7 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode) SETcc(conditionCode, R(RSCRATCH)); MOVZX(32, 8, RSCRATCH, R(RSCRATCH)); SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT)); + AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK)); OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1 } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index e7c06a200937..36f2ecd91dee 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -460,7 +460,13 
@@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false; code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false; + // We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag! + // If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't + // leave it in flags. + code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512; + // mfspr/mtspr can affect/use XER, so be super careful here + // we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 774129a5d0fe..aa1a00abeb30 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -35,6 +35,7 @@ struct CodeOp //16B bool wantsCR1; bool wantsFPRF; bool wantsCA; + bool wantsCAInFlags; bool outputCR0; bool outputCR1; bool outputFPRF; From 54129a8ca5bebb7d540132f061ab80d07adaadb4 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 7 Sep 2014 08:30:11 -0700 Subject: [PATCH 5/6] PPCAnalyst: refactor, add carry op reordering and non-cmp reordering Tries as hard as possible to push carry-using operations (like addc and adde) next to each other. Refactor the instruction reordering to be more flexible and allow multiple passes. 353 -> 192 x86 instructions on a carry-heavy code block in Pokemon Puzzle. 12% faster overall in Pokemon Puzzle; probably less in typical games (Virtual Console games seem to be carry-heavy for some reason; maybe a different compiler?) 
--- .../Interpreter/Interpreter_Tables.cpp | 16 +-- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 2 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 106 +++++++++++++----- Source/Core/Core/PowerPC/PPCAnalyst.h | 7 ++ Source/Core/Core/PowerPC/PPCTables.h | 1 + 5 files changed, 93 insertions(+), 39 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index 317132266dc8..3dfc55ce2394 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -280,26 +280,26 @@ static GekkoOPTemplate table31[] = static GekkoOPTemplate table31_2[] = { {266, Interpreter::addx, {"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}}, - {778, Interpreter::addx, {"addox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}}, + {778, Interpreter::addx, {"addox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}}, {10, Interpreter::addcx, {"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, - {522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, - {650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}}, {234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {491, 
Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}}, - {1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}}, + {1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}}, {459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}}, - {971, Interpreter::divwux, {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}}, + {971, Interpreter::divwux, {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}}, {75, Interpreter::mulhwx, {"mulhwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}}, {11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}}, {235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}}, - {747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}}, + {747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}}, {104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}}, {40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}}, - {552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}}, + {552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}}, {8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, - {520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}}, {136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB 
| FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 88f686023a9c..a9c932b8ca0d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -48,7 +48,7 @@ static GekkoOPTemplate primarytable[] = {10, &Jit64::cmpXX}, //"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, {11, &Jit64::cmpXX}, //"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, {12, &Jit64::reg_imm}, //"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}}, - {13, &Jit64::reg_imm}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}}, + {13, &Jit64::reg_imm}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0}}, {14, &Jit64::reg_imm}, //"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}}, {15, &Jit64::reg_imm}, //"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}}, diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 36f2ecd91dee..2840b52fc993 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -213,14 +213,17 @@ static void AnalyzeFunction2(Symbol *func) func->flags = flags; } -// IMPORTANT - CURRENTLY ASSUMES THAT A IS A COMPARE static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) { + const GekkoOPInfo *a_info = a.opinfo; const GekkoOPInfo *b_info = b.opinfo; + int a_flags = a_info->flags; int b_flags = b_info->flags; - if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL)) + if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) return false; - if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.hex & 1)) + if ((b_flags 
& (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc)) + return false; + if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA))) return false; switch (b.inst.OPCD) @@ -250,20 +253,16 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) { int regInA = a.regsIn[j]; int regInB = b.regsIn[j]; - if (regInA >= 0 && - (b.regsOut[0] == regInA || - b.regsOut[1] == regInA)) - { - // reg collision! don't swap + // register collision: b outputs to one of a's inputs + if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA)) return false; - } - if (regInB >= 0 && - (a.regsOut[0] == regInB || - a.regsOut[1] == regInB)) - { - // reg collision! don't swap + // register collision: a outputs to one of b's inputs + if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB)) return false; - } + // register collision: b outputs to one of a's outputs (overwriting it) + for (int k = 0; k < 2; k++) + if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1])) + return false; } return true; @@ -403,29 +402,76 @@ void FindFunctions(u32 startAddr, u32 endAddr, PPCSymbolDB *func_db) leafSize, niceSize, unniceSize); } -void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code) +static bool isCmp(const CodeOp& a) { - // Instruction Reordering Pass - // Bubble down compares towards branches, so that they can be merged. - // -2: -1 for the pair, -1 for not swapping with the final instruction which is probably the branch. 
- for (u32 i = 0; i < (instructions - 2); ++i) + return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32)); +} + +static bool isRlwinm_rc(const CodeOp& a) +{ + return a.inst.OPCD == 21 && a.inst.Rc; +} + +static bool isCarryOp(const CodeOp& a) +{ + return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER; +} + +void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type) +{ + // Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do + // multiple passes. + while (true) { - CodeOp &a = code[i]; - CodeOp &b = code[i + 1]; - // All integer compares can be reordered. - if ((a.inst.OPCD == 10 || a.inst.OPCD == 11) || - (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32))) + // Instruction Reordering Pass + // Carry pass: bubble carry-using instructions as close to each other as possible, so we can avoid + // storing the carry flag. + // Compare pass: bubble compare instructions next to branches, so they can be merged. + bool swapped = false; + int increment = reverse ? -1 : 1; + int start = reverse ? instructions - 1 : 0; + int end = reverse ? 0 : instructions - 1; + for (int i = start; i != end; i += increment) { - // Got a compare instruction. - if (CanSwapAdjacentOps(a, b)) + CodeOp &a = code[i]; + CodeOp &b = code[i + increment]; + // Reorder integer compares, rlwinm., and carry-affecting ops + // (if we add more merged branch instructions, add them here!) + if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || isRlwinm_rc(a)))) { - // Alright, let's bubble it down! - std::swap(a, b); + // once we're next to a carry instruction, don't move away! + if (type == REORDER_CARRY && i != start) + { + // if we read the CA flag, and the previous instruction sets it, don't move away. 
+ if (!reverse && (a.opinfo->flags & FL_READ_CA) && (code[i - increment].opinfo->flags & FL_SET_CA)) + continue; + // if we set the CA flag, and the next instruction reads it, don't move away. + if (reverse && (a.opinfo->flags & FL_SET_CA) && (code[i - increment].opinfo->flags & FL_READ_CA)) + continue; + } + + if (CanSwapAdjacentOps(a, b)) + { + // Alright, let's bubble it! + std::swap(a, b); + swapped = true; + } } } + if (!swapped) + return; } } +void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code) +{ + // For carry, bubble instructions *towards* each other; one direction often isn't enough + // to get pairs like addc/adde next to each other. + ReorderInstructionsCore(instructions, code, true, REORDER_CARRY); + ReorderInstructionsCore(instructions, code, false, REORDER_CARRY); + ReorderInstructionsCore(instructions, code, false, REORDER_CMP); +} + void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index) { code->wantsCR0 = false; @@ -463,7 +509,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf // We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag! // If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't // leave it in flags. 
- code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512; + code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER; // mfspr/mtspr can affect/use XER, so be super careful here // we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index aa1a00abeb30..6be495f8f69b 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -144,6 +144,13 @@ class PPCAnalyzer { private: + enum ReorderType + { + REORDER_CARRY, + REORDER_CMP + }; + + void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type); void ReorderInstructions(u32 instructions, CodeOp *code); void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index); diff --git a/Source/Core/Core/PowerPC/PPCTables.h b/Source/Core/Core/PowerPC/PPCTables.h index f535817c9492..a3bb892d4587 100644 --- a/Source/Core/Core/PowerPC/PPCTables.h +++ b/Source/Core/Core/PowerPC/PPCTables.h @@ -38,6 +38,7 @@ enum FL_LOADSTORE = (1<<19), FL_SET_FPRF = (1<<20), FL_READ_FPRF = (1<<21), + FL_SET_OE = (1<<22), }; enum From 08ac10d00a06eb8d1f7fe7ad8249245b7883f9c4 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 11 Sep 2014 03:59:40 -0700 Subject: [PATCH 6/6] PPCAnalyst/JIT: add ability to easily toggle branch and carry merging --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 2 ++ Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 3 +++ Source/Core/Core/PowerPC/PPCAnalyst.cpp | 15 +++++++++++---- Source/Core/Core/PowerPC/PPCAnalyst.h | 8 ++++++++ 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 619217e4b69b..4324e63ba9b0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -177,6 +177,8 @@ void Jit64::Init() code_block.m_gpa = 
&js.gpa; code_block.m_fpa = &js.fpa; analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE); + analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE); + analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE); } void Jit64::ClearCache() diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 203a5fae00ad..e86847ffc0ad 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -287,6 +287,9 @@ void Jit64::reg_imm(UGeckoInstruction inst) bool Jit64::CheckMergedBranch(int crf) { + if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE)) + return false; + const UGeckoInstruction& next = js.next_inst; return (((next.OPCD == 16 /* bcx */) || ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) || diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 2840b52fc993..d5eb12841885 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -467,9 +467,13 @@ void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code) { // For carry, bubble instructions *towards* each other; one direction often isn't enough // to get pairs like addc/adde next to each other. 
- ReorderInstructionsCore(instructions, code, true, REORDER_CARRY); - ReorderInstructionsCore(instructions, code, false, REORDER_CARRY); - ReorderInstructionsCore(instructions, code, false, REORDER_CMP); + if (HasOption(OPTION_CARRY_MERGE)) + { + ReorderInstructionsCore(instructions, code, true, REORDER_CARRY); + ReorderInstructionsCore(instructions, code, false, REORDER_CARRY); + } + if (HasOption(OPTION_BRANCH_MERGE)) + ReorderInstructionsCore(instructions, code, false, REORDER_CMP); } void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index) @@ -509,7 +513,10 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf // We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag! // If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't // leave it in flags. - code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER; + if (HasOption(OPTION_CARRY_MERGE)) + code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER; + else + code->wantsCAInFlags = false; // mfspr/mtspr can affect/use XER, so be super careful here // we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 6be495f8f69b..2238337710ca 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -183,6 +183,14 @@ class PPCAnalyzer // Requires JIT support to work. // XXX: NOT COMPLETE OPTION_FORWARD_JUMP = (1 << 3), + + // Reorder compare/Rc instructions next to their associated branches and + // merge in the JIT (for common cases, anyway). + OPTION_BRANCH_MERGE = (1 << 4), + + // Reorder carry-producing instructions next to the instructions that consume + // their carry, and pass the carry in x86 flags between them instead of in XER. 
+ OPTION_CARRY_MERGE = (1 << 5), };