Skip to content
Permalink
Browse files
Merge pull request #11147 from JosJuice/jitarm64-arith-org
JitArm64: Merge ps_mulsX, ps_maddXX, and parts of fp_arith
  • Loading branch information
JMC47 committed Oct 22, 2022
2 parents b7310a1 + 812067a commit 5b69c67
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 196 deletions.
@@ -152,9 +152,8 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA
void frsqrtex(UGeckoInstruction inst);

// Paired
void ps_maddXX(UGeckoInstruction inst);
void ps_mergeXX(UGeckoInstruction inst);
void ps_mulsX(UGeckoInstruction inst);
void ps_arith(UGeckoInstruction inst);
void ps_sel(UGeckoInstruction inst);
void ps_sumX(UGeckoInstruction inst);
void ps_res(UGeckoInstruction inst);
@@ -69,170 +69,118 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
u32 op5 = inst.SUBOP5;

bool single = inst.OPCD == 59;
bool packed = inst.OPCD == 4;

const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
const bool use_b = op5 != 25; // fmul uses no B

const bool outputs_are_singles = single || packed;
const bool round_c = use_c && outputs_are_singles && !js.op->fprIsSingle[inst.FC];
const bool output_is_single = inst.OPCD == 59;
const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];

const auto inputs_are_singles_func = [&] {
return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
(!use_c || fpr.IsSingle(c, !packed));
return fpr.IsSingle(a, true) && (!use_b || fpr.IsSingle(b, true)) &&
(!use_c || fpr.IsSingle(c, true));
};
const bool inputs_are_singles = inputs_are_singles_func();

ARM64Reg VA{}, VB{}, VC{}, VD{};
const RegType type =
(inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair;
const RegType type_out =
output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
RegType::LowerPair;
const auto reg_encoder =
(inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble;

const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out));

ARM64Reg V0Q = ARM64Reg::INVALID_REG;
ARM64Reg V1Q = ARM64Reg::INVALID_REG;

if (packed)
if (round_c)
{
const RegType type = inputs_are_singles ? RegType::Single : RegType::Register;
const u8 size = inputs_are_singles ? 32 : 64;
const auto reg_encoder = inputs_are_singles ? EncodeRegToDouble : EncodeRegToQuad;

VA = reg_encoder(fpr.R(a, type));
if (use_b)
VB = reg_encoder(fpr.R(b, type));
if (use_c)
VC = reg_encoder(fpr.R(c, type));
VD = reg_encoder(fpr.RW(d, type));

if (round_c)
{
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");

V0Q = fpr.GetReg();
V1Q = fpr.GetReg();

Force25BitPrecision(reg_encoder(V0Q), VC);
VC = reg_encoder(V0Q);
}

switch (op5)
{
case 18:
m_float_emit.FDIV(size, VD, VA, VB);
break;
case 20:
m_float_emit.FSUB(size, VD, VA, VB);
break;
case 21:
m_float_emit.FADD(size, VD, VA, VB);
break;
case 25:
m_float_emit.FMUL(size, VD, VA, VC);
break;
default:
ASSERT_MSG(DYNA_REC, 0, "fp_arith");
break;
}
Force25BitPrecision(reg_encoder(V1Q), VC);
VC = reg_encoder(V1Q);
}
else

ARM64Reg inaccurate_fma_temp_reg = VD;
if (inaccurate_fma && d == b)
{
const RegType type =
(inputs_are_singles && single) ? RegType::LowerPairSingle : RegType::LowerPair;
const RegType type_out =
single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
RegType::LowerPair;
const auto reg_encoder = (inputs_are_singles && single) ? EncodeRegToSingle : EncodeRegToDouble;

VA = reg_encoder(fpr.R(a, type));
if (use_b)
VB = reg_encoder(fpr.R(b, type));
if (use_c)
VC = reg_encoder(fpr.R(c, type));
VD = reg_encoder(fpr.RW(d, type_out));

const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);

if (round_c)
{
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetReg();

V1Q = fpr.GetReg();
inaccurate_fma_temp_reg = reg_encoder(V0Q);
}

Force25BitPrecision(reg_encoder(V1Q), VC);
VC = reg_encoder(V1Q);
switch (op5)
{
case 18:
m_float_emit.FDIV(VD, VA, VB);
break;
case 20:
m_float_emit.FSUB(VD, VA, VB);
break;
case 21:
m_float_emit.FADD(VD, VA, VB);
break;
case 25:
m_float_emit.FMUL(VD, VA, VC);
break;
// While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
// the subtly different definitions affect how signed zeroes are handled.
// Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
// So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
if (inaccurate_fma)
{
m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB);
}

ARM64Reg inaccurate_fma_temp_reg = VD;
if (inaccurate_fma && d == b)
else
{
V0Q = fpr.GetReg();

inaccurate_fma_temp_reg = reg_encoder(V0Q);
m_float_emit.FNMSUB(VD, VA, VC, VB);
}

switch (op5)
if (op5 == 30)
m_float_emit.FNEG(VD, VD);
break;
case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
if (inaccurate_fma)
{
case 18:
m_float_emit.FDIV(VD, VA, VB);
break;
case 20:
m_float_emit.FSUB(VD, VA, VB);
break;
case 21:
m_float_emit.FADD(VD, VA, VB);
break;
case 25:
m_float_emit.FMUL(VD, VA, VC);
break;
// While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
// the subtly different definitions affect how signed zeroes are handled.
// Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
// So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
if (inaccurate_fma)
{
m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB);
}
else
{
m_float_emit.FNMSUB(VD, VA, VC, VB);
}
if (op5 == 30)
m_float_emit.FNEG(VD, VD);
break;
case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
if (inaccurate_fma)
{
m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB);
}
else
{
m_float_emit.FMADD(VD, VA, VC, VB);
}
if (op5 == 31)
m_float_emit.FNEG(VD, VD);
break;
default:
ASSERT_MSG(DYNA_REC, 0, "fp_arith");
break;
m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB);
}
else
{
m_float_emit.FMADD(VD, VA, VC, VB);
}
if (op5 == 31)
m_float_emit.FNEG(VD, VD);
break;
default:
ASSERT_MSG(DYNA_REC, 0, "fp_arith");
break;
}

if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q);
if (V1Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V1Q);

if (outputs_are_singles)
if (output_is_single)
{
ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(),
"Register allocation turned singles into doubles in the middle of fp_arith");

fpr.FixSinglePrecision(d);
}

SetFPRFIfNeeded(outputs_are_singles, VD);
SetFPRFIfNeeded(output_is_single, VD);
}

void JitArm64::fp_logic(UGeckoInstruction inst)
@@ -73,55 +73,7 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst)
"Register allocation turned singles into doubles in the middle of ps_mergeXX");
}

// Emits AArch64 code for PowerPC ps_muls0 / ps_muls1 (paired-single multiply by
// a single scalar lane of C): d = a * c.ps0 (SUBOP5 == 12) or d = a * c.ps1
// (SUBOP5 == 13), using FMUL-by-element on the NEON register holding C.
void JitArm64::ps_mulsX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITPairedOff);
// Record-form (Rc) updates of CR1 and precise FP exceptions are not handled
// here; fall back to the interpreter for those cases.
FALLBACK_IF(inst.Rc);
FALLBACK_IF(jo.fp_exceptions);

const u32 a = inst.FA;
const u32 c = inst.FC;
const u32 d = inst.FD;

// SUBOP5 13 is ps_muls1 (multiply by the upper lane, c.ps1); 12 is ps_muls0.
const bool upper = inst.SUBOP5 == 13;

// If both inputs are already tracked as singles, operate on 32-bit lanes;
// otherwise operate on 64-bit lanes.
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
// C must be rounded to 25-bit mantissa precision (hardware ps_muls behavior)
// unless it is already known to be single precision.
const bool round_c = !js.op->fprIsSingle[inst.FC];
const RegType type = singles ? RegType::Single : RegType::Register;
const u8 size = singles ? 32 : 64;
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;

const ARM64Reg VA = fpr.R(a, type);
ARM64Reg VC = fpr.R(c, type);
const ARM64Reg VD = fpr.RW(d, type);

// Scratch register, allocated only when C needs the 25-bit rounding pass.
ARM64Reg V0Q = ARM64Reg::INVALID_REG;

if (round_c)
{
// round_c implies the inputs were not singles; rounding a single would be a bug.
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");

V0Q = fpr.GetReg();

// Write the rounded copy of C into the scratch register and use it as the
// multiplier from here on; the original C register is left untouched.
Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC));
VC = reg_encoder(V0Q);
}

// FMUL by element: multiplies both lanes of VA by lane 0 or lane 1 of VC.
m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);

if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q);

// Sanity check: the register allocation above must not have changed the
// single/double tracking state mid-instruction.
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)),
"Register allocation turned singles into doubles in the middle of ps_mulsX");

// Paired-single results are always rounded to single precision.
fpr.FixSinglePrecision(d);

// Paired-single ops always produce single-precision results, hence `true`.
SetFPRFIfNeeded(true, VD);
}

void JitArm64::ps_maddXX(UGeckoInstruction inst)
void JitArm64::ps_arith(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITPairedOff);
@@ -134,16 +86,23 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
const u32 d = inst.FD;
const u32 op5 = inst.SUBOP5;

const bool use_c = op5 == 25 || (op5 & ~0x13) == 12; // mul, muls, and all kinds of maddXX
const bool use_b = op5 != 25 && (op5 & ~0x1) != 12; // mul and muls don't use B

const auto singles_func = [&] {
return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
};
const bool singles = singles_func();

const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA);
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
const bool round_c = !js.op->fprIsSingle[inst.FC];
const bool round_c = use_c && !js.op->fprIsSingle[inst.FC];
const RegType type = singles ? RegType::Single : RegType::Register;
const u8 size = singles ? 32 : 64;
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;

const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = reg_encoder(fpr.R(b, type));
ARM64Reg VC = reg_encoder(fpr.R(c, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VD = reg_encoder(fpr.RW(d, type));

ARM64Reg V0Q = ARM64Reg::INVALID_REG;
@@ -178,6 +137,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
ARM64Reg result_reg = VD;
switch (op5)
{
case 12: // ps_muls0: d = a * c.ps0
m_float_emit.FMUL(size, VD, VA, VC, 0);
break;
case 13: // ps_muls1: d = a * c.ps1
m_float_emit.FMUL(size, VD, VA, VC, 1);
break;
case 14: // ps_madds0: d = a * c.ps0 + b
if (inaccurate_fma)
{
@@ -224,6 +189,18 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
result_reg = V0;
}
break;
case 18: // ps_div
m_float_emit.FDIV(size, VD, VA, VB);
break;
case 20: // ps_sub
m_float_emit.FSUB(size, VD, VA, VB);
break;
case 21: // ps_add
m_float_emit.FADD(size, VD, VA, VB);
break;
case 25: // ps_mul
m_float_emit.FMUL(size, VD, VA, VC);
break;
case 28: // ps_msub: d = a * c - b
case 30: // ps_nmsub: d = -(a * c - b)
if (inaccurate_fma)
@@ -269,7 +246,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
}
break;
default:
ASSERT_MSG(DYNA_REC, 0, "ps_madd - invalid op");
ASSERT_MSG(DYNA_REC, 0, "ps_arith - invalid op");
break;
}

@@ -292,8 +269,8 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
if (V1Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V1Q);

ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
"Register allocation turned singles into doubles in the middle of ps_maddXX");
ASSERT_MSG(DYNA_REC, singles == singles_func(),
"Register allocation turned singles into doubles in the middle of ps_arith");

fpr.FixSinglePrecision(d);

0 comments on commit 5b69c67

Please sign in to comment.