From 7ce4c3138e8efc57b9870792c121ad630dc7ea2f Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 12 Aug 2015 13:02:00 -0500 Subject: [PATCH] [AArch64] Optimize cases when an FPR is only used for non-paired ops. --- .../JitArm64/JitArm64_FloatingPoint.cpp | 209 ++++++++++++------ .../JitArm64/JitArm64_LoadStoreFloating.cpp | 4 +- .../JitArm64/JitArm64_LoadStorePaired.cpp | 6 +- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 200 ++++++++--------- .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 81 ++++++- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 22 +- 6 files changed, 322 insertions(+), 200 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 479e5c06c84b..c5c58835d0a8 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -26,12 +26,18 @@ void JitArm64::fabsx(UGeckoInstruction inst) fpr.BindToRegister(d, true); ARM64Reg VB = fpr.R(b); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - - m_float_emit.FABS(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 0, V0, 0); - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FABS(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::faddsx(UGeckoInstruction inst) @@ -41,11 +47,11 @@ void JitArm64::faddsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 1, VD, 0); @@ 
-63,12 +69,18 @@ void JitArm64::faddx(UGeckoInstruction inst) ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - - m_float_emit.FADD(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 0, V0, 0); - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FADD(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fmaddsx(UGeckoInstruction inst) @@ -78,12 +90,12 @@ void JitArm64::fmaddsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); @@ -105,12 +117,18 @@ void JitArm64::fmaddx(UGeckoInstruction inst) ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); - m_float_emit.FADD(EncodeRegToDouble(V0), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 0, V0, 0); - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FMADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FMADD(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fmrx(UGeckoInstruction inst) @@ -135,12 
+153,12 @@ void JitArm64::fmsubsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); @@ -162,12 +180,18 @@ void JitArm64::fmsubx(UGeckoInstruction inst) ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); - m_float_emit.FSUB(EncodeRegToDouble(V0), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 0, V0, 0); - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FNMSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FNMSUB(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fmulsx(UGeckoInstruction inst) @@ -177,11 +201,11 @@ void JitArm64::fmulsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == c); + fpr.BindToRegister(d, d == a || d == c, false); ARM64Reg VA = fpr.R(a); ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FMUL(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); m_float_emit.INS(64, VD, 1, VD, 0); @@ -199,12 +223,18 @@ void JitArm64::fmulx(UGeckoInstruction inst) ARM64Reg VA = fpr.R(a); ARM64Reg VC = fpr.R(c); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - - m_float_emit.FMUL(EncodeRegToDouble(V0), 
EncodeRegToDouble(VA), EncodeRegToDouble(VC)); - m_float_emit.INS(64, VD, 0, V0, 0); - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FMUL(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fnabsx(UGeckoInstruction inst) @@ -218,13 +248,20 @@ void JitArm64::fnabsx(UGeckoInstruction inst) ARM64Reg VB = fpr.R(b); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FABS(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(V0)); - m_float_emit.INS(64, VD, 0, V0, 0); - - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); + m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VD)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FABS(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(V0)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fnegx(UGeckoInstruction inst) @@ -238,12 +275,18 @@ void JitArm64::fnegx(UGeckoInstruction inst) ARM64Reg VB = fpr.R(b); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 0, V0, 0); - - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fnmaddsx(UGeckoInstruction inst) @@ -253,12 +296,12 @@ void JitArm64::fnmaddsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - 
fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); @@ -281,13 +324,18 @@ void JitArm64::fnmaddx(UGeckoInstruction inst) ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); - m_float_emit.FADD(EncodeRegToDouble(V0), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(V0)); - m_float_emit.INS(64, VD, 0, V0, 0); - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FNMADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FNMADD(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fnmsubsx(UGeckoInstruction inst) @@ -297,12 +345,12 @@ void JitArm64::fnmsubsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); @@ -325,13 +373,18 @@ void JitArm64::fnmsubx(UGeckoInstruction inst) ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), 
EncodeRegToDouble(VC)); - m_float_emit.FSUB(EncodeRegToDouble(V0), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(V0)); - m_float_emit.INS(64, VD, 0, V0, 0); - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FMSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FMSUB(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fselx(UGeckoInstruction inst) @@ -347,13 +400,19 @@ void JitArm64::fselx(UGeckoInstruction inst) ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); ARM64Reg VC = fpr.R(c); - ARM64Reg V0 = fpr.GetReg(); m_float_emit.FCMPE(EncodeRegToDouble(VA)); - m_float_emit.FCSEL(EncodeRegToDouble(V0), EncodeRegToDouble(VC), EncodeRegToDouble(VB), CC_GE); - m_float_emit.INS(64, VD, 0, V0, 0); - - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FCSEL(EncodeRegToDouble(VD), EncodeRegToDouble(VC), EncodeRegToDouble(VB), CC_GE); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FCSEL(EncodeRegToDouble(V0), EncodeRegToDouble(VC), EncodeRegToDouble(VB), CC_GE); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } void JitArm64::fsubsx(UGeckoInstruction inst) @@ -363,11 +422,11 @@ void JitArm64::fsubsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 1, VD, 0); @@ -385,10 +444,16 @@ void JitArm64::fsubx(UGeckoInstruction inst) ARM64Reg VA = fpr.R(a); ARM64Reg VB = fpr.R(b); ARM64Reg VD = fpr.R(d); - ARM64Reg V0 = 
fpr.GetReg(); - m_float_emit.FSUB(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 0, V0, 0); - - fpr.Unlock(V0); + if (fpr.IsLower(d)) + { + m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); + } + else + { + ARM64Reg V0 = fpr.GetReg(); + m_float_emit.FSUB(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 0, V0, 0); + fpr.Unlock(V0); + } } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index da79ce823b40..41aba9093a8d 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -72,9 +72,9 @@ void JitArm64::lfXX(UGeckoInstruction inst) bool is_immediate = false; // 64 bit loads only load PSR0 - fpr.BindToRegister(inst.FD, flags & BackPatchInfo::FLAG_SIZE_F64); + fpr.BindToRegister(inst.FD, flags & BackPatchInfo::FLAG_SIZE_F64, flags & BackPatchInfo::FLAG_SIZE_F64); - ARM64Reg VD = fpr.R(inst.FD); + ARM64Reg VD = fpr.R(inst.FD, flags & BackPatchInfo::FLAG_SIZE_F64); ARM64Reg addr_reg = W0; gpr.Lock(W0, W30); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index cce98aee07fb..8bf46e17d4a7 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -66,8 +66,8 @@ void JitArm64::psq_l(UGeckoInstruction inst) LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true)); BLR(X30); - fpr.BindToRegister(inst.RS, false); - ARM64Reg VS = fpr.R(inst.RS); + fpr.BindToRegister(inst.RS, false, false); + ARM64Reg VS = fpr.R(inst.RS, false); m_float_emit.FCVTL(64, VS, D0); if (inst.W) { @@ -97,7 +97,7 @@ void JitArm64::psq_st(UGeckoInstruction inst) fpr.Lock(Q0, Q1); ARM64Reg arm_addr = 
gpr.R(inst.RA); - ARM64Reg VS = fpr.R(inst.RS); + ARM64Reg VS = fpr.R(inst.RS, false); ARM64Reg scale_reg = W0; ARM64Reg addr_reg = W1; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 664e29764ef7..ee27a5caf8da 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -23,10 +23,10 @@ void JitArm64::ps_abs(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b); + fpr.BindToRegister(d, d == b, false); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FABS(64, VD, VB); } @@ -38,11 +38,11 @@ void JitArm64::ps_add(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FADD(64, VD, VA, VB); } @@ -54,11 +54,11 @@ void JitArm64::ps_div(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FDIV(64, VD, VA, VB); } @@ -70,12 +70,12 @@ void JitArm64::ps_madd(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); 
+ ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(64, V0, VA, VC); @@ -91,12 +91,12 @@ void JitArm64::ps_madds0(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VC, 0); @@ -113,12 +113,12 @@ void JitArm64::ps_madds1(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VC, 1); @@ -135,11 +135,11 @@ void JitArm64::ps_merge00(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.TRN1(64, VD, VA, VB); } @@ -151,11 +151,11 @@ void JitArm64::ps_merge01(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); - ARM64Reg VA 
= fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.INS(64, VD, 0, VA, 0); m_float_emit.INS(64, VD, 1, VB, 1); @@ -168,11 +168,11 @@ void JitArm64::ps_merge10(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); if (d != a && d != b) { @@ -196,11 +196,11 @@ void JitArm64::ps_merge11(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.TRN2(64, VD, VA, VB); } @@ -216,10 +216,10 @@ void JitArm64::ps_mr(UGeckoInstruction inst) if (d == b) return; - fpr.BindToRegister(d, false); + fpr.BindToRegister(d, false, false); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.ORR(VD, VB, VB); } @@ -231,11 +231,11 @@ void JitArm64::ps_mul(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == c); + fpr.BindToRegister(d, d == a || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FMUL(64, VD, VA, VC); } @@ -247,11 +247,11 @@ void JitArm64::ps_muls0(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, c = inst.FC, d = inst.FD; - 
fpr.BindToRegister(d, d == a || d == c); + fpr.BindToRegister(d, d == a || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VC, 0); @@ -266,11 +266,11 @@ void JitArm64::ps_muls1(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == c); + fpr.BindToRegister(d, d == a || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VC, 1); @@ -285,12 +285,12 @@ void JitArm64::ps_msub(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(64, V0, VA, VC); @@ -306,10 +306,10 @@ void JitArm64::ps_nabs(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b); + fpr.BindToRegister(d, d == b, false); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FABS(64, VD, VB); m_float_emit.FNEG(64, VD, VD); @@ -322,10 +322,10 @@ void JitArm64::ps_neg(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b); + fpr.BindToRegister(d, d == b, false); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VB = fpr.R(b, 
false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FNEG(64, VD, VB); } @@ -337,12 +337,12 @@ void JitArm64::ps_nmadd(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(64, V0, VA, VC); @@ -359,12 +359,12 @@ void JitArm64::ps_nmsub(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(64, V0, VA, VC); @@ -381,10 +381,10 @@ void JitArm64::ps_res(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b); + fpr.BindToRegister(d, d == b, false); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FRSQRTE(64, VD, VB); } @@ -396,12 +396,12 @@ void JitArm64::ps_sel(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, 
false); + ARM64Reg VD = fpr.R(d, false); if (d != a && d != b && d != c) { @@ -425,11 +425,11 @@ void JitArm64::ps_sub(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b); + fpr.BindToRegister(d, d == a || d == b, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VD = fpr.R(d, false); m_float_emit.FSUB(64, VD, VA, VB); } @@ -441,12 +441,12 @@ void JitArm64::ps_sum0(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VB, 1); @@ -471,12 +471,12 @@ void JitArm64::ps_sum1(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c); + fpr.BindToRegister(d, d == a || d == b || d == c, false); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d); + ARM64Reg VA = fpr.R(a, false); + ARM64Reg VB = fpr.R(b, false); + ARM64Reg VC = fpr.R(c, false); + ARM64Reg VD = fpr.R(d, false); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VA, 0); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 2ae260d95a7d..aa8b0ebd48a1 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -65,7 +65,8 @@ void Arm64RegCache::FlushMostStaleRegister() { u32 last_used 
= m_guest_registers[i].GetLastUsed(); if (last_used > most_stale_amount && - m_guest_registers[i].GetType() == REG_REG) + (m_guest_registers[i].GetType() != REG_NOTLOADED && + m_guest_registers[i].GetType() != REG_IMM)) { most_stale_preg = i; most_stale_amount = last_used; @@ -261,7 +262,8 @@ void Arm64FPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) { for (int i = 0; i < 32; ++i) { - if (m_guest_registers[i].GetType() == REG_REG) + if (m_guest_registers[i].GetType() != REG_NOTLOADED && + m_guest_registers[i].GetType() != REG_IMM) { // XXX: Determine if we can keep a register in the lower 64bits // Which will allow it to be callee saved. @@ -270,7 +272,7 @@ void Arm64FPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) } } -ARM64Reg Arm64FPRCache::R(u32 preg) +ARM64Reg Arm64FPRCache::R(u32 preg, bool only_lower) { OpArg& reg = m_guest_registers[preg]; IncrementAllUsed(); @@ -279,14 +281,25 @@ ARM64Reg Arm64FPRCache::R(u32 preg) switch (reg.GetType()) { case REG_REG: // already in a reg + case REG_LOWER_PAIR: return reg.GetReg(); break; case REG_NOTLOADED: // Register isn't loaded at /all/ { ARM64Reg host_reg = GetReg(); - reg.LoadToReg(host_reg); + u32 load_size; + if (only_lower) + { + load_size = 64; + reg.LoadLowerReg(host_reg); + } + else + { + load_size = 128; + reg.LoadToReg(host_reg); + } reg.SetDirty(false); - m_float_emit->LDR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + m_float_emit->LDR(load_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); return host_reg; } break; @@ -298,17 +311,55 @@ ARM64Reg Arm64FPRCache::R(u32 preg) return INVALID_REG; } -void Arm64FPRCache::BindToRegister(u32 preg, bool do_load) +void Arm64FPRCache::BindToRegister(u32 preg, bool do_load, bool only_lower) { OpArg& reg = m_guest_registers[preg]; reg.SetDirty(true); - if (reg.GetType() == REG_NOTLOADED) + switch (reg.GetType()) + { + case REG_NOTLOADED: { ARM64Reg host_reg = GetReg(); - reg.LoadToReg(host_reg); + u32 load_size; + if 
(only_lower) + { + // We only want the lower 64bits + load_size = 64; + reg.LoadLowerReg(host_reg); + } + else + { + // We want the full 128bit register + load_size = 128; + reg.LoadToReg(host_reg); + } if (do_load) - m_float_emit->LDR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + m_float_emit->LDR(load_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + } + break; + case REG_LOWER_PAIR: + { + if (!only_lower) + { + // Okay, we've got the lower reg loaded and we really wanted the full register + if (do_load) + { + // Load the high 64bits from the file and insert them into the high 64bits of the host register + ARM64Reg tmp_reg = GetReg(); + m_float_emit->LDR(64, INDEX_UNSIGNED, tmp_reg, X29, PPCSTATE_OFF(ps[preg][1])); + m_float_emit->INS(64, reg.GetReg(), 1, tmp_reg, 0); + UnlockRegister(tmp_reg); + } + + // Change it over to a full 128bit register + reg.LoadToReg(reg.GetReg()); + } + } + break; + default: + // Do nothing + break; } } @@ -334,7 +385,7 @@ void Arm64FPRCache::FlushByHost(ARM64Reg host_reg) for (int i = 0; i < 32; ++i) { OpArg& reg = m_guest_registers[i]; - if (reg.GetType() == REG_REG && reg.GetReg() == host_reg) + if ((reg.GetType() != REG_NOTLOADED && reg.GetType() != REG_IMM) && reg.GetReg() == host_reg) { FlushRegister(i, false); return; @@ -355,12 +406,18 @@ bool Arm64FPRCache::IsCalleeSaved(ARM64Reg reg) void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state) { OpArg& reg = m_guest_registers[preg]; - if (reg.GetType() == REG_REG) + if (reg.GetType() == REG_REG || + reg.GetType() == REG_LOWER_PAIR) { ARM64Reg host_reg = reg.GetReg(); + u32 store_size; + if (reg.GetType() == REG_REG) + store_size = 128; + else + store_size = 64; if (reg.IsDirty()) - m_float_emit->STR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + m_float_emit->STR(store_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); if (!maintain_state) { diff --git 
a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index d4abffb523a0..433f0dad8f27 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -19,11 +19,7 @@ enum RegType REG_NOTLOADED = 0, REG_REG, // Reg type is register REG_IMM, // Reg is really a IMM -}; -enum RegLocation -{ - REG_LOW = 0, - REG_HIGH, + REG_LOWER_PAIR, // Only the lower pair of a paired register }; enum FlushMode @@ -64,6 +60,11 @@ class OpArg m_type = REG_REG; m_reg = reg; } + void LoadLowerReg(ARM64Reg reg) + { + m_type = REG_LOWER_PAIR; + m_reg = reg; + } void LoadToImm(u32 imm) { m_type = REG_IMM; @@ -134,10 +135,6 @@ class Arm64RegCache // Flushes the register cache in different ways depending on the mode virtual void Flush(FlushMode mode, PPCAnalyst::CodeOp* op) = 0; - // Returns a guest register inside of a host register - // Will dump an immediate to the host register as well - virtual ARM64Reg R(u32 reg) = 0; - virtual BitSet32 GetCallerSavedUsed() = 0; // Returns a temporary register for use @@ -265,9 +262,12 @@ class Arm64FPRCache : public Arm64RegCache // Returns a guest register inside of a host register // Will dump an immediate to the host register as well - ARM64Reg R(u32 preg); + ARM64Reg R(u32 preg, bool only_lower = true); - void BindToRegister(u32 preg, bool do_load); + void BindToRegister(u32 preg, bool do_load, bool only_lower = true); + + // Returns whether the register is only the lower 64bit register + bool IsLower(u32 preg) const { return m_guest_registers[preg].GetType() == REG_LOWER_PAIR; } BitSet32 GetCallerSavedUsed() override;