Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arm64Emitter: Simplify LogicalImm logic #10807

Merged
merged 4 commits into from
Jul 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
180 changes: 21 additions & 159 deletions Source/Core/Common/Arm64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -509,8 +509,6 @@ struct LogicalImm

constexpr LogicalImm(u64 value, u32 width)
{
bool negate = false;

// Logical immediates are encoded using parameters n, imm_s and imm_r using
// the following table:
//
Expand All @@ -526,28 +524,6 @@ struct LogicalImm
// A pattern is constructed of size bits, where the least significant S+1 bits
// are set. The pattern is rotated right by R, and repeated across a 32 or
// 64-bit value, depending on destination register width.
//
// Put another way: the basic format of a logical immediate is a single
// contiguous stretch of 1 bits, repeated across the whole word at intervals
// given by a power of 2. To identify them quickly, we first locate the
// lowest stretch of 1 bits, then the next 1 bit above that; that combination
// is different for every logical immediate, so it gives us all the
// information we need to identify the only logical immediate that our input
// could be, and then we simply check if that's the value we actually have.
//
// (The rotation parameter does give the possibility of the stretch of 1 bits
// going 'round the end' of the word. To deal with that, we observe that in
// any situation where that happens the bitwise NOT of the value is also a
// valid logical immediate. So we simply invert the input whenever its low bit
// is set, and then we know that the rotated case can't arise.)

if (value & 1)
{
// If the low bit is 1, negate the value, and set a flag to remember that we
// did (so that we can adjust the return values appropriately).
negate = true;
value = ~value;
}

constexpr int kWRegSizeInBits = 32;

Expand All @@ -558,156 +534,42 @@ struct LogicalImm
// as a logical immediate will also be the correct encoding of the 32-bit
// value.

// The most-significant 32 bits may not be zero (ie. negate is true) so
// shift the value left before duplicating it.
value <<= kWRegSizeInBits;
value |= value >> kWRegSizeInBits;
}

// The basic analysis idea: imagine our input word looks like this.
//
// 0011111000111110001111100011111000111110001111100011111000111110
// c b a
// |<--d-->|
//
// We find the lowest set bit (as an actual power-of-2 value, not its index)
// and call it a. Then we add a to our original number, which wipes out the
// bottommost stretch of set bits and replaces it with a 1 carried into the
// next zero bit. Then we look for the new lowest set bit, which is in
// position b, and subtract it, so now our number is just like the original
// but with the lowest stretch of set bits completely gone. Now we find the
// lowest set bit again, which is position c in the diagram above. Then we'll
// measure the distance d between bit positions a and c (using CLZ), and that
// tells us that the only valid logical immediate that could possibly be equal
// to this number is the one in which a stretch of bits running from a to just
// below b is replicated every d bits.
u64 a = Common::LargestPowerOf2Divisor(value);
u64 value_plus_a = value + a;
u64 b = Common::LargestPowerOf2Divisor(value_plus_a);
u64 value_plus_a_minus_b = value_plus_a - b;
u64 c = Common::LargestPowerOf2Divisor(value_plus_a_minus_b);

int d = 0, clz_a = 0, out_n = 0;
u64 mask = 0;

if (c != 0)
{
// The general case, in which there is more than one stretch of set bits.
// Compute the repeat distance d, and set up a bitmask covering the basic
// unit of repetition (i.e. a word with the bottom d bits set). Also, in all
// of these cases the N bit of the output will be zero.
clz_a = Common::CountLeadingZeros(a);
int clz_c = Common::CountLeadingZeros(c);
d = clz_a - clz_c;
mask = ((UINT64_C(1) << d) - 1);
out_n = 0;
}
else
if (value == 0 || (~value) == 0)
{
// Handle degenerate cases.
//
// If any of those 'find lowest set bit' operations didn't find a set bit at
// all, then the word will have been zero thereafter, so in particular the
// last lowest_set_bit operation will have returned zero. So we can test for
// all the special case conditions in one go by seeing if c is zero.
if (a == 0)
{
// The input was zero (or all 1 bits, which will come to here too after we
// inverted it at the start of the function), which is invalid.
return;
}
else
{
// Otherwise, if c was zero but a was not, then there's just one stretch
// of set bits in our word, meaning that we have the trivial case of
// d == 64 and only one 'repetition'. Set up all the same variables as in
// the general case above, and set the N bit in the output.
clz_a = Common::CountLeadingZeros(a);
d = 64;
mask = ~UINT64_C(0);
out_n = 1;
}
}

// If the repeat period d is not a power of two, it can't be encoded.
if (!MathUtil::IsPow2<u64>(d))
valid = false;
return;
}

// If the bit stretch (b - a) does not fit within the mask derived from the
// repeat period, then fail.
if (((b - a) & ~mask) != 0)
return;
// Normalize value, rotating it such that the LSB is 1:
// If LSB is already one, we mask away the trailing sequence of ones and
// pick the next sequence of ones. This ensures we get a complete element
// that has not been cut-in-half due to rotation across the word boundary.

// The only possible option is b - a repeated every d bits. Now we're going to
// actually construct the valid logical immediate derived from that
// specification, and see if it equals our original input.
//
// To repeat a value every d bits, we multiply it by a number of the form
// (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can
// be derived using a table lookup on CLZ(d).
constexpr std::array<u64, 6> multipliers = {{
0x0000000000000001UL,
0x0000000100000001UL,
0x0001000100010001UL,
0x0101010101010101UL,
0x1111111111111111UL,
0x5555555555555555UL,
}};

const int multiplier_idx = Common::CountLeadingZeros((u64)d) - 57;

// Ensure that the index to the multipliers array is within bounds.
DEBUG_ASSERT((multiplier_idx >= 0) &&
(static_cast<size_t>(multiplier_idx) < multipliers.size()));

const u64 multiplier = multipliers[multiplier_idx];
const u64 candidate = (b - a) * multiplier;

// The candidate pattern doesn't match our input value, so fail.
if (value != candidate)
return;
const size_t rotation = Common::CountTrailingZeros(value & (value + 1));
const u64 normalized = Common::RotateRight(value, rotation);

// We have a match! This is a valid logical immediate, so now we have to
// construct the bits and pieces of the instruction encoding that generates
// it.
n = out_n;
const size_t element_size = Common::CountTrailingZeros(normalized & (normalized + 1));
const size_t ones = Common::CountTrailingZeros(~normalized);

// Count the set bits in our basic stretch. The special case of clz(0) == -1
// makes the answer come out right for stretches that reach the very top of
// the word (e.g. numbers like 0xffffc00000000000).
const int clz_b = (b == 0) ? -1 : Common::CountLeadingZeros(b);
s = clz_a - clz_b;
// Check the value is repeating; also ensures element size is a power of two.

// Decide how many bits to rotate right by, to put the low bit of that basic
// stretch in position a.
if (negate)
{
// If we inverted the input right at the start of this function, here's
// where we compensate: the number of set bits becomes the number of clear
// bits, and the rotation count is based on position b rather than position
// a (since b is the location of the 'lowest' 1 bit after inversion).
s = d - s;
r = (clz_b + 1) & (d - 1);
}
else
if (Common::RotateRight(value, element_size) != value)
{
r = (clz_a + 1) & (d - 1);
valid = false;
return;
}

// Now we're done, except for having to encode the S output in such a way that
// Now we're done. We just have to encode the S output in such a way that
// it gives both the number of set bits and the length of the repeated
// segment. The s field is encoded like this:
//
// imms size S
// ssssss 64 UInt(ssssss)
// 0sssss 32 UInt(sssss)
// 10ssss 16 UInt(ssss)
// 110sss 8 UInt(sss)
// 1110ss 4 UInt(ss)
// 11110s 2 UInt(s)
//
// So we 'or' (-d << 1) with our computed s to form imms.
s = ((-d << 1) | (s - 1)) & 0x3f;
// segment.

r = static_cast<u8>((element_size - rotation) & (element_size - 1));
s = (((~element_size + 1) << 1) | (ones - 1)) & 0x3f;
n = (element_size >> 6) & 1;

valid = true;
}
Expand Down
50 changes: 50 additions & 0 deletions Source/Core/Common/BitUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,56 @@ constexpr int CountLeadingZeros(uint32_t value)
#endif
}

template <typename T>
constexpr int CountTrailingZerosConst(T value)
{
int result = sizeof(T) * 8;
while (value)
{
result--;
value <<= 1;
}
return result;
}

constexpr int CountTrailingZeros(uint64_t value)
{
#if defined(__GNUC__)
return value ? __builtin_ctzll(value) : 64;
#elif defined(_MSC_VER)
if (std::is_constant_evaluated())
{
return CountTrailingZerosConst(value);
}
else
{
unsigned long index = 0;
return _BitScanForward64(&index, value) ? index : 64;
}
#else
return CountTrailingZerosConst(value);
#endif
}

constexpr int CountTrailingZeros(uint32_t value)
{
#if defined(__GNUC__)
return value ? __builtin_ctz(value) : 32;
#elif defined(_MSC_VER)
if (std::is_constant_evaluated())
{
return CountTrailingZerosConst(value);
}
else
{
unsigned long index = 0;
return _BitScanForward(&index, value) ? index : 32;
}
#else
return CountLeadingZerosConst(value);
#endif
}

#undef CONSTEXPR_FROM_INTRINSIC

template <typename T>
Expand Down
31 changes: 31 additions & 0 deletions Source/UnitTests/Core/PowerPC/JitArm64/MovI2R.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,37 @@ TEST(JitArm64, MovI2R_Rand)
}
}

// Construct and test every 64-bit logical immediate
TEST(JitArm64, MovI2R_LogImm)
{
TestMovI2R test;

for (unsigned size = 2; size <= 64; size *= 2)
{
for (unsigned length = 1; length < size; ++length)
{
u64 imm = ~u64{0} >> (64 - length);
for (unsigned e = size; e < 64; e *= 2)
{
imm |= imm << e;
}
for (unsigned rotation = 0; rotation < size; ++rotation)
{
test.Check64(imm);
EXPECT_EQ(static_cast<bool>(LogicalImm(imm, 64)), true);

if (size < 64)
{
test.Check32(imm);
EXPECT_EQ(static_cast<bool>(LogicalImm(static_cast<u32>(imm), 32)), true);
}

imm = (imm >> 63) | (imm << 1);
}
}
}
}

TEST(JitArm64, MovI2R_ADP)
{
TestMovI2R test;
Expand Down