From 5d80145dc0de8d742f19995310bd04e442f521e1 Mon Sep 17 00:00:00 2001
From: Fiora <fioraaeterna@gmail.com>
Date: Sat, 6 Sep 2014 19:33:34 -0700
Subject: [PATCH 1/6] JIT: unify subfex/subfmex/subfzex/addex/addmex/addzex code

The unified code is shorter and should make future optimizations easier.
---
 Source/Core/Core/PowerPC/Jit64/Jit.h          |   7 +-
 .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp  |  14 +-
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   | 147 +++---------------
 3 files changed, 30 insertions(+), 138 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 821df102d168..dc3eae8a4cae 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -147,9 +147,7 @@ class Jit64 : public Jitx86Base
 	void divwx(UGeckoInstruction inst);
 	void srawix(UGeckoInstruction inst);
 	void srawx(UGeckoInstruction inst);
-	void addex(UGeckoInstruction inst);
-	void addmex(UGeckoInstruction inst);
-	void addzex(UGeckoInstruction inst);
+	void arithXex(UGeckoInstruction inst);
 
 	void extsXx(UGeckoInstruction inst);
 
@@ -219,9 +217,6 @@ class Jit64 : public Jitx86Base
 	void subfic(UGeckoInstruction inst);
 	void subfcx(UGeckoInstruction inst);
 	void subfx(UGeckoInstruction inst);
-	void subfex(UGeckoInstruction inst);
-	void subfmex(UGeckoInstruction inst);
-	void subfzex(UGeckoInstruction inst);
 
 	void twx(UGeckoInstruction inst);
 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index 927e83353f65..a285c815c2d0 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -296,10 +296,10 @@ static GekkoOPTemplate table31_2[] =
 	{778,  &Jit64::addx},                  //"addx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
 	{10,   &Jit64::addcx},                 //"addcx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
 	{522,  &Jit64::addcx},                 //"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
-	{138,  &Jit64::addex},                 //"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{650,  &Jit64::addex},                 //"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{234,  &Jit64::addmex},                //"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{202,  &Jit64::addzex},                //"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{138,  &Jit64::arithXex},              //"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{650,  &Jit64::arithXex},              //"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{234,  &Jit64::arithXex},              //"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{202,  &Jit64::arithXex},              //"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
 	{491,  &Jit64::divwx},                 //"divwx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}},
 	{1003, &Jit64::divwx},                 //"divwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}},
 	{459,  &Jit64::divwux},                //"divwux",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}},
@@ -313,9 +313,9 @@ static GekkoOPTemplate table31_2[] =
 	{552,  &Jit64::subfx},                 //"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
 	{8,    &Jit64::subfcx},                //"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
 	{520,  &Jit64::subfcx},                //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
-	{136,  &Jit64::subfex},                //"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{232,  &Jit64::subfmex},               //"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
-	{200,  &Jit64::subfzex},               //"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{136,  &Jit64::arithXex},              //"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{232,  &Jit64::arithXex},              //"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
+	{200,  &Jit64::arithXex},              //"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
 };
 
 static GekkoOPTemplate table59[] =
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 10ea9f844876..cf1e226c7c02 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -781,84 +781,6 @@ void Jit64::subfcx(UGeckoInstruction inst)
 	gpr.UnlockAll();
 }
 
-void Jit64::subfex(UGeckoInstruction inst)
-{
-	INSTRUCTION_START;
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, b = inst.RB, d = inst.RD;
-	gpr.Lock(a, b, d);
-	gpr.BindToRegister(d, (d == a || d == b), true);
-
-	JitGetAndClearCAOV(inst.OE);
-
-	bool invertedCarry = false;
-	if (d == b)
-	{
-		// Convert carry to borrow
-		CMC();
-		SBB(32, gpr.R(d), gpr.R(a));
-		invertedCarry = true;
-	}
-	else if (d == a)
-	{
-		NOT(32, gpr.R(d));
-		ADC(32, gpr.R(d), gpr.R(b));
-	}
-	else
-	{
-		MOV(32, gpr.R(d), gpr.R(a));
-		NOT(32, gpr.R(d));
-		ADC(32, gpr.R(d), gpr.R(b));
-	}
-	FinalizeCarryOverflow(inst.OE, invertedCarry);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-
-	gpr.UnlockAll();
-}
-
-void Jit64::subfmex(UGeckoInstruction inst)
-{
-	// USES_XER
-	INSTRUCTION_START
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, d = inst.RD;
-	gpr.Lock(a, d);
-	gpr.BindToRegister(d, d == a);
-
-	JitGetAndClearCAOV(inst.OE);
-	if (d != a)
-		MOV(32, gpr.R(d), gpr.R(a));
-	NOT(32, gpr.R(d));
-	ADC(32, gpr.R(d), Imm32(0xFFFFFFFF));
-	FinalizeCarryOverflow(inst.OE);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-	gpr.UnlockAll();
-}
-
-void Jit64::subfzex(UGeckoInstruction inst)
-{
-	// USES_XER
-	INSTRUCTION_START
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, d = inst.RD;
-
-	gpr.Lock(a, d);
-	gpr.BindToRegister(d, d == a);
-
-	JitGetAndClearCAOV(inst.OE);
-	if (d != a)
-		MOV(32, gpr.R(d), gpr.R(a));
-	NOT(32, gpr.R(d));
-	ADC(32, gpr.R(d), Imm8(0));
-	FinalizeCarryOverflow(inst.OE);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-
-	gpr.UnlockAll();
-}
-
 void Jit64::subfx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
@@ -1325,26 +1247,39 @@ void Jit64::addx(UGeckoInstruction inst)
 	}
 }
 
-void Jit64::addex(UGeckoInstruction inst)
+void Jit64::arithXex(UGeckoInstruction inst)
 {
-	// USES_XER
 	INSTRUCTION_START
 	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, b = inst.RB, d = inst.RD;
+	bool regsource = !(inst.SUBOP10 & 64); // addex or subfex
+	bool mex = !!(inst.SUBOP10 & 32);      // addmex/subfmex or addzex/subfzex
+	bool add = !!(inst.SUBOP10 & 2);       // add or sub
+	int a = inst.RA;
+	int b = regsource ? inst.RB : a;
+	int d = inst.RD;
 
 	gpr.Lock(a, b, d);
-	gpr.BindToRegister(d, (d == a) || (d == b));
+	gpr.BindToRegister(d, d == a || d == b);
 	JitGetAndClearCAOV(inst.OE);
-	if ((d == a) || (d == b))
+
+	bool invertedCarry = false;
+	if (!add && regsource && d == b)
 	{
-		ADC(32, gpr.R(d), gpr.R((d == a) ? b : a));
+		// Convert carry to borrow
+		CMC();
+		SBB(32, gpr.R(d), gpr.R(a));
+		invertedCarry = true;
 	}
 	else
 	{
-		MOV(32, gpr.R(d), gpr.R(a));
-		ADC(32, gpr.R(d), gpr.R(b));
+		OpArg source = regsource ? gpr.R(d == b ? a : b) : Imm32(mex ? 0xFFFFFFFF : 0);
+		if (d != a && d != b)
+			MOV(32, gpr.R(d), gpr.R(a));
+		if (!add)
+			NOT(32, gpr.R(d));
+		ADC(32, gpr.R(d), source);
 	}
-	FinalizeCarryOverflow(inst.OE);
+	FinalizeCarryOverflow(inst.OE, invertedCarry);
 	if (inst.Rc)
 		ComputeRC(gpr.R(d));
 	gpr.UnlockAll();
@@ -1382,44 +1317,6 @@ void Jit64::addcx(UGeckoInstruction inst)
 	}
 }
 
-void Jit64::addmex(UGeckoInstruction inst)
-{
-	// USES_XER
-	INSTRUCTION_START
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, d = inst.RD;
-
-	gpr.Lock(d);
-	gpr.BindToRegister(d, d == a);
-	JitGetAndClearCAOV(inst.OE);
-	if (d != a)
-		MOV(32, gpr.R(d), gpr.R(a));
-	ADC(32, gpr.R(d), Imm32(0xFFFFFFFF));
-	FinalizeCarryOverflow(inst.OE);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-	gpr.UnlockAll();
-}
-
-void Jit64::addzex(UGeckoInstruction inst)
-{
-	// USES_XER
-	INSTRUCTION_START
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, d = inst.RD;
-
-	gpr.Lock(d);
-	gpr.BindToRegister(d, d == a);
-	JitGetAndClearCAOV(inst.OE);
-	if (d != a)
-		MOV(32, gpr.R(d), gpr.R(a));
-	ADC(32, gpr.R(d), Imm8(0));
-	FinalizeCarryOverflow(inst.OE);
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-	gpr.UnlockAll();
-}
-
 void Jit64::rlwinmx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START

From a6c9515b00de392e3075d01287dd5ab38cab0bad Mon Sep 17 00:00:00 2001
From: Fiora <fioraaeterna@gmail.com>
Date: Sat, 6 Sep 2014 22:34:40 -0700
Subject: [PATCH 2/6] JIT: unify subfcx and addcx code

Again, the unified code is shorter and should make future optimizations easier.
---
 Source/Core/Core/PowerPC/Jit64/Jit.h          |  3 +-
 .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp  |  8 +-
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   | 77 +++++++------------
 3 files changed, 33 insertions(+), 55 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index dc3eae8a4cae..dd5af56673c8 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -139,7 +139,7 @@ class Jit64 : public Jitx86Base
 	void DynaRunTable63(UGeckoInstruction _inst);
 
 	void addx(UGeckoInstruction inst);
-	void addcx(UGeckoInstruction inst);
+	void arithcx(UGeckoInstruction inst);
 	void mulli(UGeckoInstruction inst);
 	void mulhwXx(UGeckoInstruction inst);
 	void mullwx(UGeckoInstruction inst);
@@ -215,7 +215,6 @@ class Jit64 : public Jitx86Base
 	void dcbz(UGeckoInstruction inst);
 
 	void subfic(UGeckoInstruction inst);
-	void subfcx(UGeckoInstruction inst);
 	void subfx(UGeckoInstruction inst);
 
 	void twx(UGeckoInstruction inst);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index a285c815c2d0..98c986a2f080 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -294,8 +294,8 @@ static GekkoOPTemplate table31_2[] =
 {
 	{266,  &Jit64::addx},                  //"addx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
 	{778,  &Jit64::addx},                  //"addx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
-	{10,   &Jit64::addcx},                 //"addcx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
-	{522,  &Jit64::addcx},                 //"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
+	{10,   &Jit64::arithcx},               //"addcx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
+	{522,  &Jit64::arithcx},               //"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
 	{138,  &Jit64::arithXex},              //"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
 	{650,  &Jit64::arithXex},              //"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
 	{234,  &Jit64::arithXex},              //"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
@@ -311,8 +311,8 @@ static GekkoOPTemplate table31_2[] =
 	{104,  &Jit64::negx},                  //"negx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
 	{40,   &Jit64::subfx},                 //"subfx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
 	{552,  &Jit64::subfx},                 //"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}},
-	{8,    &Jit64::subfcx},                //"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
-	{520,  &Jit64::subfcx},                //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
+	{8,    &Jit64::arithcx},               //"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
+	{520,  &Jit64::arithcx},               //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
 	{136,  &Jit64::arithXex},              //"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
 	{232,  &Jit64::arithXex},              //"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
 	{200,  &Jit64::arithXex},              //"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index cf1e226c7c02..0b6f0dac4f58 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -750,36 +750,7 @@ void Jit64::subfic(UGeckoInstruction inst)
 	// This instruction has no RC flag
 }
 
-void Jit64::subfcx(UGeckoInstruction inst)
-{
-	INSTRUCTION_START;
-	JITDISABLE(bJITIntegerOff);
-	int a = inst.RA, b = inst.RB, d = inst.RD;
-	gpr.Lock(a, b, d);
-	gpr.BindToRegister(d, (d == a || d == b), true);
 
-	JitClearCAOV(inst.OE);
-	if (d == b)
-	{
-		SUB(32, gpr.R(d), gpr.R(a));
-	}
-	else if (d == a)
-	{
-		MOV(32, R(RSCRATCH), gpr.R(a));
-		MOV(32, gpr.R(d), gpr.R(b));
-		SUB(32, gpr.R(d), R(RSCRATCH));
-	}
-	else
-	{
-		MOV(32, gpr.R(d), gpr.R(b));
-		SUB(32, gpr.R(d), gpr.R(a));
-	}
-	if (inst.Rc)
-		ComputeRC(gpr.R(d));
-	FinalizeCarryOverflow(inst.OE, true);
-
-	gpr.UnlockAll();
-}
 
 void Jit64::subfx(UGeckoInstruction inst)
 {
@@ -1285,36 +1256,44 @@ void Jit64::arithXex(UGeckoInstruction inst)
 	gpr.UnlockAll();
 }
 
-void Jit64::addcx(UGeckoInstruction inst)
+void Jit64::arithcx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITIntegerOff);
+	bool add = !!(inst.SUBOP10 & 2); // add or sub
 	int a = inst.RA, b = inst.RB, d = inst.RD;
+	gpr.Lock(a, b, d);
+	gpr.BindToRegister(d, d == a || d == b, true);
+	JitClearCAOV(inst.OE);
 
-	if ((d == a) || (d == b))
+	if (d == a && d != b)
 	{
-		int operand = ((d == a) ? b : a);
-		gpr.Lock(a, b, d);
-		gpr.BindToRegister(d, true);
-		JitClearCAOV(inst.OE);
-		ADD(32, gpr.R(d), gpr.R(operand));
-		FinalizeCarryOverflow(inst.OE);
-		if (inst.Rc)
-			ComputeRC(gpr.R(d));
-		gpr.UnlockAll();
+		if (add)
+		{
+			ADD(32, gpr.R(d), gpr.R(b));
+		}
+		else
+		{
+			// special case, because sub isn't reversible
+			MOV(32, R(RSCRATCH), gpr.R(a));
+			MOV(32, gpr.R(d), gpr.R(b));
+			SUB(32, gpr.R(d), R(RSCRATCH));
+		}
 	}
 	else
 	{
-		gpr.Lock(a, b, d);
-		gpr.BindToRegister(d, false);
-		JitClearCAOV(inst.OE);
-		MOV(32, gpr.R(d), gpr.R(a));
-		ADD(32, gpr.R(d), gpr.R(b));
-		FinalizeCarryOverflow(inst.OE);
-		if (inst.Rc)
-			ComputeRC(gpr.R(d));
-		gpr.UnlockAll();
+		if (d != b)
+			MOV(32, gpr.R(d), gpr.R(b));
+		if (add)
+			ADD(32, gpr.R(d), gpr.R(a));
+		else
+			SUB(32, gpr.R(d), gpr.R(a));
 	}
+
+	FinalizeCarryOverflow(inst.OE, !add);
+	if (inst.Rc)
+		ComputeRC(gpr.R(d));
+	gpr.UnlockAll();
 }
 
 void Jit64::rlwinmx(UGeckoInstruction inst)

From bea2504a5111b00a0ce618f28c0777fb299307f3 Mon Sep 17 00:00:00 2001
From: Fiora <fioraaeterna@gmail.com>
Date: Thu, 21 Aug 2014 13:56:18 -0700
Subject: [PATCH 3/6] JIT64: optimize carry calculations

Omit carry calculations that get overwritten later in the block before they're
used. Very common in the case of srawix and friends.
---
 .../Interpreter/Interpreter_Tables.cpp        |   8 +-
 Source/Core/Core/PowerPC/Jit64/Jit.h          |   2 +-
 .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp  |   4 +-
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   | 123 ++++++++++--------
 Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp   |   2 +-
 .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp  |  10 +-
 Source/Core/Core/PowerPC/JitCommon/Jit_Util.h |   2 +-
 Source/Core/Core/PowerPC/PPCAnalyst.cpp       |  44 ++++---
 Source/Core/Core/PowerPC/PPCAnalyst.h         |   4 +-
 9 files changed, 116 insertions(+), 83 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
index 2bf66ae99b12..317132266dc8 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] =
 	{10, Interpreter::cmpli,        {"cmpli",    OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
 	{11, Interpreter::cmpi,         {"cmpi",     OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
 	{12, Interpreter::addic,        {"addic",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}},
-	{13, Interpreter::addic_rc,     {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}},
+	{13, Interpreter::addic_rc,     {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}},
 	{14, Interpreter::addi,         {"addi",     OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
 	{15, Interpreter::addis,        {"addis",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
 
@@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] =
 	{922, Interpreter::extshx,      {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
 	{954, Interpreter::extsbx,      {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
 	{536, Interpreter::srwx,        {"srwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
-	{792, Interpreter::srawx,       {"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
-	{824, Interpreter::srawix,      {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
+	{792, Interpreter::srawx,       {"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{824, Interpreter::srawix,      {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{24,  Interpreter::slwx,        {"slwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
 
 	{54,   Interpreter::dcbst,      {"dcbst",  OPTYPE_DCACHE, 0, 5, 0, 0, 0}},
@@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] =
 	{339, Interpreter::mfspr,       {"mfspr",  OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}},
 	{467, Interpreter::mtspr,       {"mtspr",  OPTYPE_SPR, 0, 2, 0, 0, 0}},
 	{371, Interpreter::mftb,        {"mftb",   OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}},
-	{512, Interpreter::mcrxr,       {"mcrxr",  OPTYPE_SYSTEM, 0, 1, 0, 0, 0}},
+	{512, Interpreter::mcrxr,       {"mcrxr",  OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}},
 	{595, Interpreter::mfsr,        {"mfsr",   OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
 	{659, Interpreter::mfsrin,      {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index dd5af56673c8..498d833dd7e5 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -100,7 +100,7 @@ class Jit64 : public Jitx86Base
 	void GenerateConstantOverflow(bool overflow);
 	void GenerateConstantOverflow(s64 val);
 	void GenerateOverflow();
-	void FinalizeCarryOverflow(bool oe, bool inv = false);
+	void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false);
 	void ComputeRC(const Gen::OpArg & arg);
 
 	// Use to extract bytes from a register using the regcache. offset is in bytes.
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index 98c986a2f080..f9e0ac97d50f 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] =
 	{922, &Jit64::extsXx},                 //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
 	{954, &Jit64::extsXx},                 //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
 	{536, &Jit64::srwx},                   //"srwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
-	{792, &Jit64::srawx},                  //"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
-	{824, &Jit64::srawix},                 //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
+	{792, &Jit64::srawx},                  //"srawx",  OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
+	{824, &Jit64::srawix},                 //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
 	{24,  &Jit64::slwx},                   //"slwx",   OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
 
 	{54,   &Jit64::dcbst},                 //"dcbst",  OPTYPE_DCACHE, 0, 4}},
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 0b6f0dac4f58..974ae1569d3d 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -45,7 +45,7 @@ void Jit64::GenerateOverflow()
 }
 
 // Assumes CA,OV are clear
-void Jit64::FinalizeCarryOverflow(bool oe, bool inv)
+void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv)
 {
 	// USES_XER
 	if (oe)
@@ -53,15 +53,17 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv)
 		// this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both
 		// sides of the branch.
 		FixupBranch jno = J_CC(CC_NO);
-		JitSetCAIf(inv ? CC_NC : CC_C);
+		if (ca)
+			JitSetCAIf(inv ? CC_NC : CC_C);
 		//XER[OV/SO] = 1
 		OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK));
 		FixupBranch exit = J();
 		SetJumpTarget(jno);
-		JitSetCAIf(inv ? CC_NC : CC_C);
+		if (ca)
+			JitSetCAIf(inv ? CC_NC : CC_C);
 		SetJumpTarget(exit);
 	}
-	else
+	else if (ca)
 	{
 		// Do carry
 		JitSetCAIf(inv ? CC_NC : CC_C);
@@ -129,10 +131,11 @@ static u32 Xor(u32 a, u32 b)
 void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
 {
 	gpr.Lock(d, a);
-	if (a || binary || carry)  // yeh nasty special case addic
+	// Be careful; addic treats r0 as r0, but addi treats r0 as zero.
+	if (a || binary || carry)
 	{
-		if (carry)
-			JitClearCAOV(false);
+		carry &= js.op->wantsCA;
+		JitClearCAOV(carry, false);
 		if (gpr.R(a).IsImm() && !carry)
 		{
 			gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value));
@@ -717,41 +720,43 @@ void Jit64::subfic(UGeckoInstruction inst)
 	{
 		if (imm == 0)
 		{
-			JitClearCAOV(false);
+			JitClearCAOV(js.op->wantsCA, false);
 			// Flags act exactly like subtracting from 0
 			NEG(32, gpr.R(d));
 			// Output carry is inverted
-			JitSetCAIf(CC_NC);
+			if (js.op->wantsCA)
+				JitSetCAIf(CC_NC);
 		}
 		else if (imm == -1)
 		{
 			// CA is always set in this case
-			JitSetCA();
+			if (js.op->wantsCA)
+				JitSetCA();
 			NOT(32, gpr.R(d));
 		}
 		else
 		{
-			JitClearCAOV(false);
+			JitClearCAOV(js.op->wantsCA, false);
 			NOT(32, gpr.R(d));
 			ADD(32, gpr.R(d), Imm32(imm+1));
 			// Output carry is normal
-			JitSetCAIf(CC_C);
+			if (js.op->wantsCA)
+				JitSetCAIf(CC_C);
 		}
 	}
 	else
 	{
-		JitClearCAOV(false);
+		JitClearCAOV(js.op->wantsCA, false);
 		MOV(32, gpr.R(d), Imm32(imm));
 		SUB(32, gpr.R(d), gpr.R(a));
 		// Output carry is inverted
-		JitSetCAIf(CC_NC);
+		if (js.op->wantsCA)
+			JitSetCAIf(CC_NC);
 	}
 	gpr.UnlockAll();
 	// This instruction has no RC flag
 }
 
-
-
 void Jit64::subfx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
@@ -1250,7 +1255,7 @@ void Jit64::arithXex(UGeckoInstruction inst)
 			NOT(32, gpr.R(d));
 		ADC(32, gpr.R(d), source);
 	}
-	FinalizeCarryOverflow(inst.OE, invertedCarry);
+	FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry);
 	if (inst.Rc)
 		ComputeRC(gpr.R(d));
 	gpr.UnlockAll();
@@ -1264,7 +1269,7 @@ void Jit64::arithcx(UGeckoInstruction inst)
 	int a = inst.RA, b = inst.RB, d = inst.RD;
 	gpr.Lock(a, b, d);
 	gpr.BindToRegister(d, d == a || d == b, true);
-	JitClearCAOV(inst.OE);
+	JitClearCAOV(js.op->wantsCA, inst.OE);
 
 	if (d == a && d != b)
 	{
@@ -1290,7 +1295,7 @@ void Jit64::arithcx(UGeckoInstruction inst)
 			SUB(32, gpr.R(d), gpr.R(a));
 	}
 
-	FinalizeCarryOverflow(inst.OE, !add);
+	FinalizeCarryOverflow(js.op->wantsCA, inst.OE, !add);
 	if (inst.Rc)
 		ComputeRC(gpr.R(d));
 	gpr.UnlockAll();
@@ -1683,16 +1688,23 @@ void Jit64::srawx(UGeckoInstruction inst)
 	gpr.FlushLockX(ECX);
 	gpr.Lock(a, s, b);
 	gpr.BindToRegister(a, (a == s || a == b), true);
-	JitClearCAOV(false);
+	JitClearCAOV(js.op->wantsCA, false);
 	MOV(32, R(ECX), gpr.R(b));
 	if (a != s)
 		MOV(32, gpr.R(a), gpr.R(s));
 	SHL(64, gpr.R(a), Imm8(32));
 	SAR(64, gpr.R(a), R(ECX));
-	MOV(32, R(RSCRATCH), gpr.R(a));
-	SHR(64, gpr.R(a), Imm8(32));
-	TEST(32, gpr.R(a), R(RSCRATCH));
-	JitSetCAIf(CC_NZ);
+	if (js.op->wantsCA)
+	{
+		MOV(32, R(RSCRATCH), gpr.R(a));
+		SHR(64, gpr.R(a), Imm8(32));
+		TEST(32, gpr.R(a), R(RSCRATCH));
+		JitSetCAIf(CC_NZ);
+	}
+	else
+	{
+		SHR(64, gpr.R(a), Imm8(32));
+	}
 	gpr.UnlockAll();
 	gpr.UnlockAllX();
 	if (inst.Rc)
@@ -1710,41 +1722,50 @@ void Jit64::srawix(UGeckoInstruction inst)
 	{
 		gpr.Lock(a, s);
 		gpr.BindToRegister(a, a == s, true);
-		MOV(32, R(RSCRATCH), gpr.R(s));
-		if (a != s)
-			MOV(32, gpr.R(a), R(RSCRATCH));
-		// some optimized common cases that can be done in slightly fewer ops
-		if (amount == 31)
-		{
-			JitSetCA();
-			SAR(32, gpr.R(a), Imm8(31));
-			NEG(32, R(RSCRATCH));                                     // RSCRATCH = input == INT_MIN ? INT_MIN : -input;
-			AND(32, R(RSCRATCH), Imm32(0x80000000));                  // RSCRATCH = input < 0 && input != INT_MIN ? 0 : 0x80000000
-			SHR(32, R(RSCRATCH), Imm8(31 - XER_CA_SHIFT));
-			XOR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = (input < 0 && input != INT_MIN)
-		}
-		else if (amount == 1)
-		{
-			JitClearCAOV(false);
-			SHR(32, R(RSCRATCH), Imm8(31));                          // sign
-			AND(32, R(RSCRATCH), gpr.R(a));                          // (sign && carry)
-			SAR(32, gpr.R(a), Imm8(1));
-			SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT));
-			OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
+		if (!js.op->wantsCA)
+		{
+			if (a != s)
+				MOV(32, gpr.R(a), gpr.R(s));
+			SAR(32, gpr.R(a), Imm8(amount));
 		}
 		else
 		{
-			JitClearCAOV(false);
-			SAR(32, gpr.R(a), Imm8(amount));
-			SHL(32, R(RSCRATCH), Imm8(32 - amount));
-			TEST(32, R(RSCRATCH), gpr.R(a));
-			JitSetCAIf(CC_NZ);
+			MOV(32, R(RSCRATCH), gpr.R(s));
+			if (a != s)
+				MOV(32, gpr.R(a), R(RSCRATCH));
+			// some optimized common cases that can be done in slightly fewer ops
+			if (amount == 31)
+			{
+				JitSetCA();
+				SAR(32, gpr.R(a), Imm8(31));
+				NEG(32, R(RSCRATCH));                                     // RSCRATCH = input == INT_MIN ? INT_MIN : -input;
+				AND(32, R(RSCRATCH), Imm32(0x80000000));                  // RSCRATCH = input < 0 && input != INT_MIN ? 0 : 0x80000000
+				SHR(32, R(RSCRATCH), Imm8(31 - XER_CA_SHIFT));
+				XOR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = (input < 0 && input != INT_MIN)
+			}
+			else if (amount == 1)
+			{
+				JitClearCAOV(true, false);
+				SHR(32, R(RSCRATCH), Imm8(31));                          // sign
+				AND(32, R(RSCRATCH), gpr.R(a));                          // (sign && carry)
+				SAR(32, gpr.R(a), Imm8(1));
+				SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT));
+				OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
+			}
+			else
+			{
+				JitClearCAOV(true, false);
+				SAR(32, gpr.R(a), Imm8(amount));
+				SHL(32, R(RSCRATCH), Imm8(32 - amount));
+				TEST(32, R(RSCRATCH), gpr.R(a));
+				JitSetCAIf(CC_NZ);
+			}
 		}
 	}
 	else
 	{
 		gpr.Lock(a, s);
-		JitClearCAOV(false);
+		JitClearCAOV(js.op->wantsCA, false);
 		gpr.BindToRegister(a, a == s, true);
 
 		if (a != s)
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index 695dab795edf..0305e22ac532 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1106,7 +1106,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
 			Jit->JitSetCA();
 			FixupBranch cont = Jit->J();
 			Jit->SetJumpTarget(nocarry);
-			Jit->JitClearCAOV(false);
+			Jit->JitClearCAOV(true, false);
 			Jit->SetJumpTarget(cont);
 			regNormalRegClear(RI, I);
 			break;
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index c12b9fedcfc4..2b1a0ef7c114 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -825,10 +825,10 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
 	OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1
 }
 
-void EmuCodeBlock::JitClearCAOV(bool oe)
+void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
 {
-	if (oe)
-		AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0
-	else
-		AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
+	u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
+	if (mask == 0xFFFFFFFF)
+		return;
+	AND(32, PPCSTATE(spr[SPR_XER]), Imm32(mask));
 }
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index e50eedf08fa7..221ce455ab67 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -74,7 +74,7 @@ class EmuCodeBlock : public Gen::X64CodeBlock
 	void JitGetAndClearCAOV(bool oe);
 	void JitSetCA();
 	void JitSetCAIf(Gen::CCFlags conditionCode);
-	void JitClearCAOV(bool oe);
+	void JitClearCAOV(bool ca, bool oe);
 
 	void ForceSinglePrecisionS(Gen::X64Reg xmm);
 	void ForceSinglePrecisionP(Gen::X64Reg xmm);
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 55adc3f17252..e7c06a200937 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -430,7 +430,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 {
 	code->wantsCR0 = false;
 	code->wantsCR1 = false;
-	code->wantsPS1 = false;
 
 	if (opinfo->flags & FL_USE_FPU)
 		block->m_fpa->any = true;
@@ -458,6 +457,15 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
 	code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
 
+	code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
+	code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
+
+	// mfspr/mtspr can affect/use XER, so be super careful here
+	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr
+		code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
+	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
+		code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
+
 	int numOut = 0;
 	int numIn = 0;
 	if (opinfo->flags & FL_OUT_A)
@@ -715,26 +723,30 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 		block->m_broken = true;
 	}
 
-	// Scan for CR0 dependency
-	// assume next block wants flags to be safe
+	// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
+	// wants flags, to be safe.
 	bool wantsCR0 = true;
 	bool wantsCR1 = true;
-	bool wantsPS1 = true;
 	bool wantsFPRF = true;
+	bool wantsCA = true;
 	for (int i = block->m_num_instructions - 1; i >= 0; i--)
 	{
-		wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
-		wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
-		wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
-		wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
-		code[i].wantsCR0 = wantsCR0;
-		code[i].wantsCR1 = wantsCR1;
-		code[i].wantsPS1 = wantsPS1;
-		code[i].wantsFPRF = wantsFPRF;
-		wantsCR0 &= !code[i].outputCR0;
-		wantsCR1 &= !code[i].outputCR1;
-		wantsPS1 &= !code[i].outputPS1;
-		wantsFPRF &= !code[i].outputFPRF;
+		bool opWantsCR0  = code[i].wantsCR0;
+		bool opWantsCR1  = code[i].wantsCR1;
+		bool opWantsFPRF = code[i].wantsFPRF;
+		bool opWantsCA   = code[i].wantsCA;
+		code[i].wantsCR0  = wantsCR0  || code[i].canEndBlock;
+		code[i].wantsCR1  = wantsCR1  || code[i].canEndBlock;
+		code[i].wantsFPRF = wantsFPRF || code[i].canEndBlock;
+		code[i].wantsCA   = wantsCA   || code[i].canEndBlock;
+		wantsCR0  |= opWantsCR0  || code[i].canEndBlock;
+		wantsCR1  |= opWantsCR1  || code[i].canEndBlock;
+		wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
+		wantsCA   |= opWantsCA   || code[i].canEndBlock;
+		wantsCR0  &= !code[i].outputCR0  || opWantsCR0;
+		wantsCR1  &= !code[i].outputCR1  || opWantsCR1;
+		wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
+		wantsCA   &= !code[i].outputCA   || opWantsCA;
 	}
 	return address;
 }
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 4a9058112a6e..774129a5d0fe 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -33,12 +33,12 @@ struct CodeOp //16B
 	bool isBranchTarget;
 	bool wantsCR0;
 	bool wantsCR1;
-	bool wantsPS1;
 	bool wantsFPRF;
+	bool wantsCA;
 	bool outputCR0;
 	bool outputCR1;
-	bool outputPS1;
 	bool outputFPRF;
+	bool outputCA;
 	bool canEndBlock;
 	bool skip;  // followed BL-s for example
 };

From 45d84605a9a2a0115976af061d318345985ac422 Mon Sep 17 00:00:00 2001
From: Fiora <fioraaeterna@gmail.com>
Date: Sun, 7 Sep 2014 00:37:47 -0700
Subject: [PATCH 4/6] JIT64: optimize carry calculations further

Keep carry flags in the x86 flags register if used in the next instruction.
---
 Source/Core/Core/PowerPC/Jit64/Jit.cpp        |   3 +
 Source/Core/Core/PowerPC/Jit64/Jit.h          |   4 +-
 .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp  |   2 +-
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   | 129 ++++++++++++------
 Source/Core/Core/PowerPC/JitCommon/JitBase.h  |   5 +-
 .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp  |   1 +
 Source/Core/Core/PowerPC/PPCAnalyst.cpp       |   6 +
 Source/Core/Core/PowerPC/PPCAnalyst.h         |   1 +
 8 files changed, 109 insertions(+), 42 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 7412489948a5..619217e4b69b 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -457,6 +457,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 		js.downcountAmount += PatchEngine::GetSpeedhackCycles(code_block.m_address);
 
 	js.skipnext = false;
+	js.carryFlagSet = false;
+	js.carryFlagInverted = false;
 	js.compilerPC = nextPC;
 	// Translate instructions
 	for (u32 i = 0; i < code_block.m_num_instructions; i++)
@@ -488,6 +490,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 			// help peephole optimizations
 			js.next_inst = ops[i + 1].inst;
 			js.next_compilerPC = ops[i + 1].address;
+			js.next_op = &ops[i + 1];
 		}
 
 		if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 498d833dd7e5..53a846237f02 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -100,7 +100,9 @@ class Jit64 : public Jitx86Base
 	void GenerateConstantOverflow(bool overflow);
 	void GenerateConstantOverflow(s64 val);
 	void GenerateOverflow();
-	void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false);
+	void FinalizeCarryOverflow(bool oe, bool inv = false);
+	void FinalizeCarry(Gen::CCFlags cond);
+	void FinalizeCarry(bool ca);
 	void ComputeRC(const Gen::OpArg & arg);
 
 	// Use to extract bytes from a register using the regcache. offset is in bytes.
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index f9e0ac97d50f..88f686023a9c 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -273,7 +273,7 @@ static GekkoOPTemplate table31[] =
 	{339, &Jit64::mfspr},                  //"mfspr",  OPTYPE_SPR, FL_OUT_D}},
 	{467, &Jit64::mtspr},                  //"mtspr",  OPTYPE_SPR, 0, 2}},
 	{371, &Jit64::mftb},                   //"mftb",   OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER}},
-	{512, &Jit64::mcrxr},                  //"mcrxr",  OPTYPE_SYSTEM, 0}},
+	{512, &Jit64::mcrxr},                  //"mcrxr",  OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA}},
 	{595, &Jit64::FallBackToInterpreter},  //"mfsr",   OPTYPE_SYSTEM, FL_OUT_D, 2}},
 	{659, &Jit64::FallBackToInterpreter},  //"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 2}},
 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 974ae1569d3d..203a5fae00ad 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -44,30 +44,76 @@ void Jit64::GenerateOverflow()
 	SetJumpTarget(exit);
 }
 
+void Jit64::FinalizeCarry(CCFlags cond)
+{
+	js.carryFlagSet = false;
+	js.carryFlagInverted = false;
+	if (js.op->wantsCA)
+	{
+		if (js.next_op->wantsCAInFlags)
+		{
+			if (cond == CC_C || cond == CC_NC)
+			{
+				js.carryFlagInverted = cond == CC_NC;
+			}
+			else
+			{
+				// convert the condition to a carry flag (is there a better way?)
+				SETcc(cond, R(RSCRATCH));
+				BT(8, R(RSCRATCH), Imm8(0));
+			}
+			js.carryFlagSet = true;
+		}
+		else
+		{
+			JitSetCAIf(cond);
+		}
+	}
+}
+
+// Unconditional version
+void Jit64::FinalizeCarry(bool ca)
+{
+	js.carryFlagSet = false;
+	js.carryFlagInverted = false;
+	if (js.op->wantsCA)
+	{
+		if (js.next_op->wantsCAInFlags)
+		{
+			if (ca)
+				STC();
+			else
+				CLC();
+			js.carryFlagSet = true;
+		}
+		else if (ca)
+		{
+			JitSetCA();
+		}
+		else
+		{
+			JitClearCAOV(true, false);
+		}
+	}
+}
+
 // Assumes CA,OV are clear
-void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv)
+void Jit64::FinalizeCarryOverflow(bool oe, bool inv)
 {
 	// USES_XER
 	if (oe)
 	{
-		// this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both
-		// sides of the branch.
+		// Make sure not to lose the carry flags (not a big deal, this path is rare).
+		PUSHF();
+		AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~(XER_SO_MASK | XER_OV_MASK)));
 		FixupBranch jno = J_CC(CC_NO);
-		if (ca)
-			JitSetCAIf(inv ? CC_NC : CC_C);
 		//XER[OV/SO] = 1
 		OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK));
-		FixupBranch exit = J();
 		SetJumpTarget(jno);
-		if (ca)
-			JitSetCAIf(inv ? CC_NC : CC_C);
-		SetJumpTarget(exit);
-	}
-	else if (ca)
-	{
-		// Do carry
-		JitSetCAIf(inv ? CC_NC : CC_C);
+		POPF();
 	}
+	// Do carry
+	FinalizeCarry(inv ? CC_NC : CC_C);
 }
 
 void Jit64::ComputeRC(const Gen::OpArg & arg)
@@ -135,7 +181,6 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void
 	if (a || binary || carry)
 	{
 		carry &= js.op->wantsCA;
-		JitClearCAOV(carry, false);
 		if (gpr.R(a).IsImm() && !carry)
 		{
 			gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value));
@@ -159,7 +204,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void
 			}
 		}
 		if (carry)
-			JitSetCAIf(CC_C);
+			FinalizeCarry(CC_C);
 		if (Rc)
 			ComputeRC(gpr.R(d));
 	}
@@ -720,38 +765,31 @@ void Jit64::subfic(UGeckoInstruction inst)
 	{
 		if (imm == 0)
 		{
-			JitClearCAOV(js.op->wantsCA, false);
 			// Flags act exactly like subtracting from 0
 			NEG(32, gpr.R(d));
 			// Output carry is inverted
-			if (js.op->wantsCA)
-				JitSetCAIf(CC_NC);
+			FinalizeCarry(CC_NC);
 		}
 		else if (imm == -1)
 		{
-			// CA is always set in this case
-			if (js.op->wantsCA)
-				JitSetCA();
 			NOT(32, gpr.R(d));
+			// CA is always set in this case
+			FinalizeCarry(true);
 		}
 		else
 		{
-			JitClearCAOV(js.op->wantsCA, false);
 			NOT(32, gpr.R(d));
 			ADD(32, gpr.R(d), Imm32(imm+1));
 			// Output carry is normal
-			if (js.op->wantsCA)
-				JitSetCAIf(CC_C);
+			FinalizeCarry(CC_C);
 		}
 	}
 	else
 	{
-		JitClearCAOV(js.op->wantsCA, false);
 		MOV(32, gpr.R(d), Imm32(imm));
 		SUB(32, gpr.R(d), gpr.R(a));
 		// Output carry is inverted
-		if (js.op->wantsCA)
-			JitSetCAIf(CC_NC);
+		FinalizeCarry(CC_NC);
 	}
 	gpr.UnlockAll();
 	// This instruction has no RC flag
@@ -1233,29 +1271,44 @@ void Jit64::arithXex(UGeckoInstruction inst)
 	int a = inst.RA;
 	int b = regsource ? inst.RB : a;
 	int d = inst.RD;
+	bool same_input_sub = !add && regsource && a == b;
 
 	gpr.Lock(a, b, d);
-	gpr.BindToRegister(d, d == a || d == b);
-	JitGetAndClearCAOV(inst.OE);
+	gpr.BindToRegister(d, !same_input_sub && (d == a || d == b));
+	if (!js.carryFlagSet)
+		JitGetAndClearCAOV(inst.OE);
 
 	bool invertedCarry = false;
-	if (!add && regsource && d == b)
+	// Special case: subfe A, B, B is a common compiler idiom
+	if (same_input_sub)
 	{
 		// Convert carry to borrow
-		CMC();
+		if (!js.carryFlagInverted)
+			CMC();
+		SBB(32, gpr.R(d), gpr.R(d));
+		invertedCarry = true;
+	}
+	else if (!add && regsource && d == b)
+	{
+		if (!js.carryFlagInverted)
+			CMC();
+		if (d != b)
+			MOV(32, gpr.R(d), gpr.R(b));
 		SBB(32, gpr.R(d), gpr.R(a));
 		invertedCarry = true;
 	}
 	else
 	{
 		OpArg source = regsource ? gpr.R(d == b ? a : b) : Imm32(mex ? 0xFFFFFFFF : 0);
+		if (js.carryFlagInverted)
+			CMC();
 		if (d != a && d != b)
 			MOV(32, gpr.R(d), gpr.R(a));
 		if (!add)
 			NOT(32, gpr.R(d));
 		ADC(32, gpr.R(d), source);
 	}
-	FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry);
+	FinalizeCarryOverflow(inst.OE, invertedCarry);
 	if (inst.Rc)
 		ComputeRC(gpr.R(d));
 	gpr.UnlockAll();
@@ -1269,7 +1322,6 @@ void Jit64::arithcx(UGeckoInstruction inst)
 	int a = inst.RA, b = inst.RB, d = inst.RD;
 	gpr.Lock(a, b, d);
 	gpr.BindToRegister(d, d == a || d == b, true);
-	JitClearCAOV(js.op->wantsCA, inst.OE);
 
 	if (d == a && d != b)
 	{
@@ -1295,7 +1347,7 @@ void Jit64::arithcx(UGeckoInstruction inst)
 			SUB(32, gpr.R(d), gpr.R(a));
 	}
 
-	FinalizeCarryOverflow(js.op->wantsCA, inst.OE, !add);
+	FinalizeCarryOverflow(inst.OE, !add);
 	if (inst.Rc)
 		ComputeRC(gpr.R(d));
 	gpr.UnlockAll();
@@ -1688,7 +1740,6 @@ void Jit64::srawx(UGeckoInstruction inst)
 	gpr.FlushLockX(ECX);
 	gpr.Lock(a, s, b);
 	gpr.BindToRegister(a, (a == s || a == b), true);
-	JitClearCAOV(js.op->wantsCA, false);
 	MOV(32, R(ECX), gpr.R(b));
 	if (a != s)
 		MOV(32, gpr.R(a), gpr.R(s));
@@ -1699,12 +1750,12 @@ void Jit64::srawx(UGeckoInstruction inst)
 		MOV(32, R(RSCRATCH), gpr.R(a));
 		SHR(64, gpr.R(a), Imm8(32));
 		TEST(32, gpr.R(a), R(RSCRATCH));
-		JitSetCAIf(CC_NZ);
 	}
 	else
 	{
 		SHR(64, gpr.R(a), Imm8(32));
 	}
+	FinalizeCarry(CC_NZ);
 	gpr.UnlockAll();
 	gpr.UnlockAllX();
 	if (inst.Rc)
@@ -1758,14 +1809,14 @@ void Jit64::srawix(UGeckoInstruction inst)
 				SAR(32, gpr.R(a), Imm8(amount));
 				SHL(32, R(RSCRATCH), Imm8(32 - amount));
 				TEST(32, R(RSCRATCH), gpr.R(a));
-				JitSetCAIf(CC_NZ);
+				FinalizeCarry(CC_NZ);
 			}
 		}
 	}
 	else
 	{
 		gpr.Lock(a, s);
-		JitClearCAOV(js.op->wantsCA, false);
+		FinalizeCarry(false);
 		gpr.BindToRegister(a, a == s, true);
 
 		if (a != s)
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
index 816bfeae13bb..c6ff6e496747 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
@@ -81,13 +81,16 @@ class JitBase : public CPUCoreBase
 		bool isLastInstruction;
 		bool memcheck;
 		bool skipnext;
+		bool carryFlagSet;
+		bool carryFlagInverted;
 
 		int fifoBytesThisBlock;
 
 		PPCAnalyst::BlockStats st;
 		PPCAnalyst::BlockRegStats gpa;
 		PPCAnalyst::BlockRegStats fpa;
-		PPCAnalyst::CodeOp *op;
+		PPCAnalyst::CodeOp* op;
+		PPCAnalyst::CodeOp* next_op;
 		u8* rewriteStart;
 
 		JitBlock *curBlock;
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 2b1a0ef7c114..ee7441a607ba 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -822,6 +822,7 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
 	SETcc(conditionCode, R(RSCRATCH));
 	MOVZX(32, 8, RSCRATCH, R(RSCRATCH));
 	SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT));
+	AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK));
 	OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1
 }
 
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index e7c06a200937..36f2ecd91dee 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -460,7 +460,13 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
 	code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
 
+	// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
+	// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
+	// leave it in flags.
+	code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512;
+
 	// mfspr/mtspr can affect/use XER, so be super careful here
+	// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag
 	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr
 		code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
 	if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 774129a5d0fe..aa1a00abeb30 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -35,6 +35,7 @@ struct CodeOp //16B
 	bool wantsCR1;
 	bool wantsFPRF;
 	bool wantsCA;
+	bool wantsCAInFlags;
 	bool outputCR0;
 	bool outputCR1;
 	bool outputFPRF;

From 54129a8ca5bebb7d540132f061ab80d07adaadb4 Mon Sep 17 00:00:00 2001
From: Fiora <fioraaeterna@gmail.com>
Date: Sun, 7 Sep 2014 08:30:11 -0700
Subject: [PATCH 5/6] PPCAnalyst: refactor, add carry op reordering and non-cmp
 reordering

Tries as hard as possible to push carry-using operations (like addc and adde)
next to each other. Refactor the instruction reordering to be more flexible
and allow multiple passes.

353 -> 192 x86 instructions on a carry-heavy code block in Pokemon Puzzle.
12% faster overall in Pokemon Puzzle; probably less in typical games (Virtual
Console games seem to be carry-heavy for some reason; maybe a different
compiler?)
---
 .../Interpreter/Interpreter_Tables.cpp        |  16 +--
 .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp  |   2 +-
 Source/Core/Core/PowerPC/PPCAnalyst.cpp       | 106 +++++++++++++-----
 Source/Core/Core/PowerPC/PPCAnalyst.h         |   7 ++
 Source/Core/Core/PowerPC/PPCTables.h          |   1 +
 5 files changed, 93 insertions(+), 39 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
index 317132266dc8..3dfc55ce2394 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@@ -280,26 +280,26 @@ static GekkoOPTemplate table31[] =
 static GekkoOPTemplate table31_2[] =
 {
 	{266,  Interpreter::addx,        {"addx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
-	{778,  Interpreter::addx,        {"addox",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
+	{778,  Interpreter::addx,        {"addox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
 	{10,   Interpreter::addcx,       {"addcx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{522,  Interpreter::addcx,       {"addcox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{522,  Interpreter::addcx,       {"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{138,  Interpreter::addex,       {"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{650,  Interpreter::addex,       {"addeox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{650,  Interpreter::addex,       {"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{234,  Interpreter::addmex,      {"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{202,  Interpreter::addzex,      {"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{491,  Interpreter::divwx,       {"divwx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
-	{1003, Interpreter::divwx,       {"divwox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
+	{1003, Interpreter::divwx,       {"divwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
 	{459,  Interpreter::divwux,      {"divwux",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
-	{971,  Interpreter::divwux,      {"divwuox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
+	{971,  Interpreter::divwux,      {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
 	{75,   Interpreter::mulhwx,      {"mulhwx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
 	{11,   Interpreter::mulhwux,     {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
 	{235,  Interpreter::mullwx,      {"mullwx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
-	{747,  Interpreter::mullwx,      {"mullwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
+	{747,  Interpreter::mullwx,      {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
 	{104,  Interpreter::negx,        {"negx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
 	{40,   Interpreter::subfx,       {"subfx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
-	{552,  Interpreter::subfx,       {"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
+	{552,  Interpreter::subfx,       {"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{8,    Interpreter::subfcx,      {"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{520,  Interpreter::subfcx,      {"subfcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{520,  Interpreter::subfcx,      {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{136,  Interpreter::subfex,      {"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{232,  Interpreter::subfmex,     {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{200,  Interpreter::subfzex,     {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index 88f686023a9c..a9c932b8ca0d 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -48,7 +48,7 @@ static GekkoOPTemplate primarytable[] =
 	{10, &Jit64::cmpXX},                 //"cmpli",    OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
 	{11, &Jit64::cmpXX},                 //"cmpi",     OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
 	{12, &Jit64::reg_imm},               //"addic",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
-	{13, &Jit64::reg_imm},               //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}},
+	{13, &Jit64::reg_imm},               //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0}},
 	{14, &Jit64::reg_imm},               //"addi",     OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
 	{15, &Jit64::reg_imm},               //"addis",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
 
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 36f2ecd91dee..2840b52fc993 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -213,14 +213,17 @@ static void AnalyzeFunction2(Symbol *func)
 	func->flags = flags;
 }
 
-// IMPORTANT - CURRENTLY ASSUMES THAT A IS A COMPARE
 static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
 {
+	const GekkoOPInfo *a_info = a.opinfo;
 	const GekkoOPInfo *b_info = b.opinfo;
+	int a_flags = a_info->flags;
 	int b_flags = b_info->flags;
-	if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL))
+	if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE))
 		return false;
-	if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.hex & 1))
+	if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc))
+		return false;
+	if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA)))
 		return false;
 
 	switch (b.inst.OPCD)
@@ -250,20 +253,16 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
 	{
 		int regInA = a.regsIn[j];
 		int regInB = b.regsIn[j];
-		if (regInA >= 0 &&
-			(b.regsOut[0] == regInA ||
-			 b.regsOut[1] == regInA))
-		{
-			// reg collision! don't swap
+		// register collision: b outputs to one of a's inputs
+		if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA))
 			return false;
-		}
-		if (regInB >= 0 &&
-			(a.regsOut[0] == regInB ||
-			 a.regsOut[1] == regInB))
-		{
-			// reg collision! don't swap
+		// register collision: a outputs to one of b's inputs
+		if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB))
 			return false;
-		}
+		// register collision: b outputs to one of a's outputs (overwriting it)
+		for (int k = 0; k < 2; k++)
+			if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
+				return false;
 	}
 
 	return true;
@@ -403,29 +402,76 @@ void FindFunctions(u32 startAddr, u32 endAddr, PPCSymbolDB *func_db)
 		leafSize, niceSize, unniceSize);
 }
 
-void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
+static bool isCmp(const CodeOp& a)
 {
-	// Instruction Reordering Pass
-	// Bubble down compares towards branches, so that they can be merged.
-	// -2: -1 for the pair, -1 for not swapping with the final instruction which is probably the branch.
-	for (u32 i = 0; i < (instructions - 2); ++i)
+	return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32));
+}
+
+static bool isRlwinm_rc(const CodeOp& a)
+{
+	return a.inst.OPCD == 21 && a.inst.Rc;
+}
+
+static bool isCarryOp(const CodeOp& a)
+{
+	return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
+}
+
+void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
+{
+	// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
+	// multiple passes.
+	while (true)
 	{
-		CodeOp &a = code[i];
-		CodeOp &b = code[i + 1];
-		// All integer compares can be reordered.
-		if ((a.inst.OPCD == 10 || a.inst.OPCD == 11) ||
-			(a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32)))
+		// Instruction Reordering Pass
+		// Carry pass: bubble carry-using instructions as close to each other as possible, so we can avoid
+		// storing the carry flag.
+		// Compare pass: bubble compare instructions next to branches, so they can be merged.
+		bool swapped = false;
+		int increment = reverse ? -1 : 1;
+		int start = reverse ? instructions - 1 : 0;
+		int end = reverse ? 0 : instructions - 1;
+		for (int i = start; i != end; i += increment)
 		{
-			// Got a compare instruction.
-			if (CanSwapAdjacentOps(a, b))
+			CodeOp &a = code[i];
+			CodeOp &b = code[i + increment];
+			// Reorder integer compares, rlwinm., and carry-affecting ops
+			// (if we add more merged branch instructions, add them here!)
+			if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || isRlwinm_rc(a))))
 			{
-				// Alright, let's bubble it down!
-				std::swap(a, b);
+				// once we're next to a carry instruction, don't move away!
+				if (type == REORDER_CARRY && i != start)
+				{
+					// if we read the CA flag, and the previous instruction sets it, don't move away.
+					if (!reverse && (a.opinfo->flags & FL_READ_CA) && (code[i - increment].opinfo->flags & FL_SET_CA))
+						continue;
+					// if we set the CA flag, and the next instruction reads it, don't move away.
+					if (reverse && (a.opinfo->flags & FL_SET_CA) && (code[i - increment].opinfo->flags & FL_READ_CA))
+						continue;
+				}
+
+				if (CanSwapAdjacentOps(a, b))
+				{
+					// Alright, let's bubble it!
+					std::swap(a, b);
+					swapped = true;
+				}
 			}
 		}
+		if (!swapped)
+			return;
 	}
 }
 
+void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
+{
+	// For carry, bubble instructions *towards* each other; one direction often isn't enough
+	// to get pairs like addc/adde next to each other.
+	ReorderInstructionsCore(instructions, code, true, REORDER_CARRY);
+	ReorderInstructionsCore(instructions, code, false, REORDER_CARRY);
+	ReorderInstructionsCore(instructions, code, false, REORDER_CMP);
+}
+
 void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index)
 {
 	code->wantsCR0 = false;
@@ -463,7 +509,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
 	// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
 	// leave it in flags.
-	code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512;
+	code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER;
 
 	// mfspr/mtspr can affect/use XER, so be super careful here
 	// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index aa1a00abeb30..6be495f8f69b 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -144,6 +144,13 @@ class PPCAnalyzer
 {
 private:
 
+	enum ReorderType
+	{
+		REORDER_CARRY,
+		REORDER_CMP
+	};
+
+	void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
 	void ReorderInstructions(u32 instructions, CodeOp *code);
 	void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index);
 
diff --git a/Source/Core/Core/PowerPC/PPCTables.h b/Source/Core/Core/PowerPC/PPCTables.h
index f535817c9492..a3bb892d4587 100644
--- a/Source/Core/Core/PowerPC/PPCTables.h
+++ b/Source/Core/Core/PowerPC/PPCTables.h
@@ -38,6 +38,7 @@ enum
 	FL_LOADSTORE       = (1<<19),
 	FL_SET_FPRF        = (1<<20),
 	FL_READ_FPRF       = (1<<21),
+	FL_SET_OE          = (1<<22),
 };
 
 enum

From 08ac10d00a06eb8d1f7fe7ad8249245b7883f9c4 Mon Sep 17 00:00:00 2001
From: Fiora <fioraaeterna@gmail.com>
Date: Thu, 11 Sep 2014 03:59:40 -0700
Subject: [PATCH 6/6] PPCAnalyst/JIT: add ability to easily toggle branch and
 carry merging

---
 Source/Core/Core/PowerPC/Jit64/Jit.cpp         |  2 ++
 Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp |  3 +++
 Source/Core/Core/PowerPC/PPCAnalyst.cpp        | 15 +++++++++++----
 Source/Core/Core/PowerPC/PPCAnalyst.h          |  8 ++++++++
 4 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 619217e4b69b..4324e63ba9b0 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -177,6 +177,8 @@ void Jit64::Init()
 	code_block.m_gpa = &js.gpa;
 	code_block.m_fpa = &js.fpa;
 	analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
+	analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
+	analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
 }
 
 void Jit64::ClearCache()
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index 203a5fae00ad..e86847ffc0ad 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -287,6 +287,9 @@ void Jit64::reg_imm(UGeckoInstruction inst)
 
 bool Jit64::CheckMergedBranch(int crf)
 {
+	if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE))
+		return false;
+
 	const UGeckoInstruction& next = js.next_inst;
 	return (((next.OPCD == 16 /* bcx */) ||
 	        ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) ||
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 2840b52fc993..d5eb12841885 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -467,9 +467,13 @@ void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
 {
 	// For carry, bubble instructions *towards* each other; one direction often isn't enough
 	// to get pairs like addc/adde next to each other.
-	ReorderInstructionsCore(instructions, code, true, REORDER_CARRY);
-	ReorderInstructionsCore(instructions, code, false, REORDER_CARRY);
-	ReorderInstructionsCore(instructions, code, false, REORDER_CMP);
+	if (HasOption(OPTION_CARRY_MERGE))
+	{
+		ReorderInstructionsCore(instructions, code, true, REORDER_CARRY);
+		ReorderInstructionsCore(instructions, code, false, REORDER_CARRY);
+	}
+	if (HasOption(OPTION_BRANCH_MERGE))
+		ReorderInstructionsCore(instructions, code, false, REORDER_CMP);
 }
 
 void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index)
@@ -509,7 +513,10 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
 	// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
 	// leave it in flags.
-	code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER;
+	if (HasOption(OPTION_CARRY_MERGE))
+		code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER;
+	else
+		code->wantsCAInFlags = false;
 
 	// mfspr/mtspr can affect/use XER, so be super careful here
 	// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 6be495f8f69b..2238337710ca 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -183,6 +183,14 @@ class PPCAnalyzer
 		// Requires JIT support to work.
 		// XXX: NOT COMPLETE
 		OPTION_FORWARD_JUMP = (1 << 3),
+
+		// Reorder compare/Rc instructions next to their associated branches and
+		// merge in the JIT (for common cases, anyway).
+		OPTION_BRANCH_MERGE = (1 << 4),
+
+		// Reorder carry instructions next to each other and pass carry flags
+		// in the x86 flags register between them, instead of in XER.
+		OPTION_CARRY_MERGE = (1 << 5),
 	};