Merge pull request #1307 from comex/bitset

Higher level bitset wrapper
dolphin-emu · Oct 29, 2014 · 089e32b · 089e32b
2 parents 7747c9e + c81e3da
commit 089e32b
Show file tree

Hide file tree

Showing 31 changed files with 493 additions and 308 deletions.
diff --git a/Source/Core/Common/BitSet.h b/Source/Core/Common/BitSet.h
@@ -0,0 +1,166 @@
+// This file is under the public domain.
+
+#pragma once
+
+#include <initializer_list>
+#include <type_traits>
+#include "CommonTypes.h"
+
+// Helper functions:
+
+#ifdef _WIN32
+// Returns the number of set bits in v (population count) without relying on
+// a hardware POPCNT instruction.  T is used as an unsigned integer type; the
+// constants below are built from an all-ones T so the same code adapts to
+// the width of T.
+template <typename T>
+static inline int CountSetBits(T v)
+{
+	// from https://graphics.stanford.edu/~seander/bithacks.html
+	// GCC has this built in, but MSVC's intrinsic will only emit the actual
+	// POPCNT instruction, which we're not depending on
+	// Step 1: each 2-bit field becomes the count of bits set in that field.
+	v = v - ((v >> 1) & (T)~(T)0/3);
+	// Step 2: sum adjacent 2-bit counts into 4-bit fields.
+	v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);
+	// Step 3: sum adjacent 4-bit counts into per-byte counts.
+	v = (v + (v >> 4)) & (T)~(T)0/255*15;
+	// Step 4: the multiply accumulates every byte count into the top byte,
+	// which is then shifted down to become the result.
+	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
+}
+// Returns the index of the lowest set bit of a 32-bit value
+// ([0] = least significant bit), using MSVC's _BitScanForward intrinsic.
+// NOTE(review): the intrinsic's return value is discarded, so the result is
+// not meaningful for val == 0 - callers must pass a non-zero value.
+static inline int LeastSignificantSetBit(u32 val)
+{
+	unsigned long lowest_bit;
+	_BitScanForward(&lowest_bit, val);
+	return static_cast<int>(lowest_bit);
+}
+// Returns the index of the lowest set bit of a 64-bit value
+// ([0] = least significant bit), using MSVC's _BitScanForward64 intrinsic.
+// NOTE(review): the intrinsic's return value is discarded, so the result is
+// not meaningful for val == 0 - callers must pass a non-zero value.
+static inline int LeastSignificantSetBit(u64 val)
+{
+	unsigned long lowest_bit;
+	_BitScanForward64(&lowest_bit, val);
+	return static_cast<int>(lowest_bit);
+}
+#else
+// Non-Windows builds: thin wrappers over the compiler builtins, one overload
+// per integer width.
+// CountSetBits returns the number of set bits in val.
+static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
+static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
+// LeastSignificantSetBit returns the index of the lowest set bit of val.
+// NOTE(review): __builtin_ctz/__builtin_ctzll are documented as undefined for
+// an input of 0 - callers must pass a non-zero value.
+static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
+static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
+#endif
+
+// namespace avoids conflict with OS X Carbon; don't use BitSet<T> directly
+namespace BS
+{
+
+// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
+// using the set bits of an integer to represent a set of integers.  Like that
+// class, it acts like an array of bools:
+//     BitSet32 bs;
+//     bs[1] = true;
+// but also like the underlying integer ([0] = least significant bit):
+//     BitSet32 bs2 = ...;
+//     bs = (bs ^ bs2) & BitSet32(0xffff);
+// The following additional functionality is provided:
+// - Construction using an initializer list.
+//     BitSet32 bs { 1, 2, 4, 8 };
+// - Efficiently iterating through the set bits:
+//     for (int i : bs)
+//         [i is the *index* of a set bit]
+//   (This uses the appropriate CPU instruction to find the next set bit in one
+//   operation.)
+// - Counting set bits using .Count() - see comment on that method.
+
+// TODO: use constexpr when MSVC gets out of the Dark Ages
+
+// A set of small non-negative integers stored as the bits of one unsigned
+// integer of type IntTy: integer i is a member iff bit i of m_val is set
+// ([0] = least significant bit).
+template <typename IntTy>
+class BitSet
+{
+	static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
+public:
+	// A reference to a particular bit, returned from operator[].
+	// Reading it converts to bool; assigning a bool sets or clears the masked
+	// bit in the BitSet it was obtained from.
+	class Ref
+	{
+	public:
+		Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
+		Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
+		operator bool() const { return (m_bs->m_val & m_mask) != 0; }
+		// Writes the bit and returns the value that was assigned.
+		bool operator=(bool set)
+		{
+			m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
+			return set;
+		}
+	private:
+		// The bitset being referred to.
+		BitSet* m_bs;
+		// Mask of the referenced bit (operator[] passes a single-bit mask).
+		IntTy m_mask;
+	};
+
+	// A STL-like iterator is required to be able to use range-based for loops.
+	// m_val holds the bits that have not been visited yet and m_bit is the
+	// index the iterator currently points at; m_bit == -1 marks the end.
+	class Iterator
+	{
+	public:
+		Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
+		Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
+		// Member-wise copy; written without placement new so that this
+		// header does not depend on <new>.
+		Iterator& operator=(Iterator other)
+		{
+			m_val = other.m_val;
+			m_bit = other.m_bit;
+			return *this;
+		}
+		// Yields the *index* of the current set bit.
+		int operator*() { return m_bit; }
+		// Advances to the lowest bit still set in m_val and removes it from
+		// m_val, or becomes the end iterator when no bits remain.
+		Iterator& operator++()
+		{
+			if (m_val == 0)
+			{
+				m_bit = -1;
+			}
+			else
+			{
+				int bit = LeastSignificantSetBit(m_val);
+				// Build the mask in IntTy: a plain int `1 << bit` is too
+				// narrow for 64-bit sets (bit 31 would wipe the whole upper
+				// half, bits >= 32 are an undefined shift) and overflows a
+				// signed int for bit 31.
+				m_val &= ~((IntTy)1 << bit);
+				m_bit = bit;
+			}
+			return *this;
+		}
+		Iterator operator++(int _)
+		{
+			Iterator other(*this);
+			++*this;
+			return other;
+		}
+		// Iterators compare by current index only; the remaining bits are
+		// not part of the comparison.
+		bool operator==(Iterator other) const { return m_bit == other.m_bit; }
+		bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
+	private:
+		IntTy m_val;
+		int m_bit;
+	};
+
+	// Empty set.
+	BitSet() : m_val(0) {}
+	// Wraps a raw bit mask.
+	explicit BitSet(IntTy val) : m_val(val) {}
+	// Builds the set from a list of bit *indices*.
+	BitSet(std::initializer_list<int> init)
+	{
+		m_val = 0;
+		for (int bit : init)
+			m_val |= (IntTy)1 << bit;
+	}
+
+	// Returns a set with the lowest `count` bits set.  The full-width case is
+	// handled separately so that no shift by the full width of IntTy occurs.
+	static BitSet AllTrue(size_t count)
+	{
+		return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
+	}
+
+	Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
+	// Forwards to the non-const overload; the const return type prevents
+	// assignment through the returned Ref.
+	const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
+	bool operator==(BitSet other) const { return m_val == other.m_val; }
+	bool operator!=(BitSet other) const { return m_val != other.m_val; }
+	BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
+	BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
+	BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
+	BitSet operator~() const { return BitSet(~m_val); }
+	BitSet& operator|=(BitSet other) { return *this = *this | other; }
+	BitSet& operator&=(BitSet other) { return *this = *this & other; }
+	BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
+	// Deleted so that a BitSet is not silently used as a u32.
+	operator u32() = delete;
+	// True iff at least one bit is set.
+	operator bool() { return m_val != 0; }
+
+	// Warning: Even though on modern CPUs this is a single fast instruction,
+	// Dolphin's official builds do not currently assume POPCNT support on x86,
+	// so slower explicit bit twiddling is generated.  Still should generally
+	// be faster than a loop.
+	unsigned int Count() const { return CountSetBits(m_val); }
+
+	// begin() starts from a placeholder index and immediately advances to the
+	// first set bit (or to end if the set is empty).
+	Iterator begin() const { Iterator it(m_val, 0); return ++it; }
+	Iterator end() const { return Iterator(m_val, -1); }
+
+	// The underlying bit mask.
+	IntTy m_val;
+};
+
+}
+
+typedef BS::BitSet<u32> BitSet32;
+typedef BS::BitSet<u64> BitSet64;
diff --git a/Source/Core/Common/Common.vcxproj b/Source/Core/Common/Common.vcxproj
@@ -39,6 +39,7 @@
     <ClInclude Include="Atomic_GCC.h" />
     <ClInclude Include="Atomic_Win32.h" />
     <ClInclude Include="BitField.h" />
+    <ClInclude Include="BitSet.h" />
     <ClInclude Include="BreakPoints.h" />
     <ClInclude Include="CDUtils.h" />
     <ClInclude Include="ChunkFile.h" />
@@ -137,4 +138,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>
diff --git a/Source/Core/Common/Common.vcxproj.filters b/Source/Core/Common/Common.vcxproj.filters
@@ -13,6 +13,7 @@
     <ClInclude Include="Atomic_GCC.h" />
     <ClInclude Include="Atomic_Win32.h" />
     <ClInclude Include="BitField.h" />
+    <ClInclude Include="BitSet.h" />
     <ClInclude Include="BreakPoints.h" />
     <ClInclude Include="CDUtils.h" />
     <ClInclude Include="ChunkFile.h" />
@@ -118,4 +119,4 @@
   <ItemGroup>
     <Text Include="CMakeLists.txt" />
   </ItemGroup>
-</Project>
+</Project>
diff --git a/Source/Core/Common/x64ABI.cpp b/Source/Core/Common/x64ABI.cpp
@@ -10,31 +10,23 @@ using namespace Gen;
 
 // Shared code between Win64 and Unix64
 
-void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
+void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
 {
 	size_t shadow = 0;
 #if defined(_WIN32)
 	shadow = 0x20;
 #endif
 
-	int count = 0;
-	for (int r = 0; r < 16; r++)
-	{
-		if (mask & (1 << r))
-			count++;
-	}
+	int count = (mask & ABI_ALL_GPRS).Count();
 	rsp_alignment -= count * 8;
 	size_t subtraction = 0;
-	if (mask & 0xffff0000)
+	int fpr_count = (mask & ABI_ALL_FPRS).Count();
+	if (fpr_count)
 	{
 		// If we have any XMMs to save, we must align the stack here.
 		subtraction = rsp_alignment & 0xf;
 	}
-	for (int x = 0; x < 16; x++)
-	{
-		if (mask & (1 << (16 + x)))
-			subtraction += 16;
-	}
+	subtraction += 16 * fpr_count;
 	size_t xmm_base_subtraction = subtraction;
 	subtraction += needed_frame_size;
 	subtraction += shadow;
@@ -47,55 +39,44 @@ void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t nee
 	*xmm_offsetp = subtraction - xmm_base_subtraction;
 }
 
-size_t XEmitter::ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size)
+size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size)
 {
 	size_t shadow, subtraction, xmm_offset;
 	ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
 
-	for (int r = 0; r < 16; r++)
-	{
-		if (mask & (1 << r))
-			PUSH((X64Reg) r);
-	}
+	for (int r : mask & ABI_ALL_GPRS)
+		PUSH((X64Reg) r);
 
 	if (subtraction)
 		SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
 
-	for (int x = 0; x < 16; x++)
+	for (int x : mask & ABI_ALL_FPRS)
 	{
-		if (mask & (1 << (16 + x)))
-		{
-			MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) x);
-			xmm_offset += 16;
-		}
+		MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) (x - 16));
+		xmm_offset += 16;
 	}
 
 	return shadow;
 }
 
-void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size)
+void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size)
 {
 	size_t shadow, subtraction, xmm_offset;
 	ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
 
-	for (int x = 0; x < 16; x++)
+	for (int x : mask & ABI_ALL_FPRS)
 	{
-		if (mask & (1 << (16 + x)))
-		{
-			MOVAPD((X64Reg) x, MDisp(RSP, (int)xmm_offset));
-			xmm_offset += 16;
-		}
+		MOVAPD((X64Reg) (x - 16), MDisp(RSP, (int)xmm_offset));
+		xmm_offset += 16;
 	}
 
 	if (subtraction)
 		ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
 
 	for (int r = 15; r >= 0; r--)
 	{
-		if (mask & (1 << r))
-		{
+		if (mask[r])
 			POP((X64Reg) r);
-		}
 	}
 }
 

diff --git a/Source/Core/Common/x64ABI.h b/Source/Core/Common/x64ABI.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "Common/BitSet.h"
 #include "Common/x64Emitter.h"
 
 // x64 ABI:s, and helpers to help follow them when JIT-ing code.
@@ -23,6 +24,9 @@
 // Callee-save:  RBX RBP R12 R13 R14 R15
 // Parameters:   RDI RSI RDX RCX R8 R9
 
+#define ABI_ALL_FPRS BitSet32(0xffff0000)
+#define ABI_ALL_GPRS BitSet32(0x0000ffff)
+
 #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
 
 #define ABI_PARAM1 RCX
@@ -31,11 +35,9 @@
 #define ABI_PARAM4 R9
 
 // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
-#define ABI_ALL_CALLER_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << R8) | \
-                              (1 << R9) | (1 << R10) | (1 << R11) | \
-                              (1 << (XMM0+16)) | (1 << (XMM1+16)) | (1 << (XMM2+16)) | (1 << (XMM3+16)) | \
-                              (1 << (XMM4+16)) | (1 << (XMM5+16)))
-
+#define ABI_ALL_CALLER_SAVED \
+	(BitSet32 { RAX, RCX, RDX, R8, R9, R10, R11, \
+	            XMM0+16, XMM1+16, XMM2+16, XMM3+16, XMM4+16, XMM5+16 })
 #else  //64-bit Unix / OS X
 
 #define ABI_PARAM1 RDI
@@ -47,13 +49,12 @@
 
 // FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably
 // don't actually clobber them.
-#define ABI_ALL_CALLER_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << RDI) | \
-                              (1 << RSI) | (1 << R8) | (1 << R9) | (1 << R10) | (1 << R11) | \
-                              0xffff0000 /* xmm0..15 */)
-
+#define ABI_ALL_CALLER_SAVED \
+	(BitSet32 { RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 } | \
+	 ABI_ALL_FPRS)
 #endif // WIN32
 
-#define ABI_ALL_CALLEE_SAVED ((u32) ~ABI_ALL_CALLER_SAVED)
+#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED)
 
 #define ABI_RETURN RAX
 
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
@@ -10,6 +10,7 @@
 #include <cstring>
 #include <functional>
 
+#include "Common/BitSet.h"
 #include "Common/CodeBlock.h"
 #include "Common/CommonTypes.h"
 
@@ -302,7 +303,7 @@ class XEmitter
 	void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
 	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
 
-	void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+	void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
 
 protected:
 	inline void Write8(u8 value)   {*code++ = value;}
@@ -883,8 +884,8 @@ class XEmitter
 	// Saves/restores the registers and adjusts the stack to be aligned as
 	// required by the ABI, where the previous alignment was as specified.
 	// Push returns the size of the shadow space, i.e. the offset of the frame.
-	size_t ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
-	void ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+	size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+	void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
 
 	inline int ABI_GetNumXMMRegs() { return 16; }
 

diff --git a/Source/Core/Core/DSP/DSPEmitter.cpp b/Source/Core/Core/DSP/DSPEmitter.cpp
@@ -385,7 +385,7 @@ void DSPEmitter::CompileDispatcher()
 {
 	enterDispatcher = AlignCode16();
 	// We don't use floating point (high 16 bits).
-	u32 registers_used = ABI_ALL_CALLEE_SAVED & 0xffff;
+	BitSet32 registers_used = ABI_ALL_CALLEE_SAVED & BitSet32(0xffff);
 	ABI_PushRegistersAndAdjustStack(registers_used, 8);
 
 	const u8 *dispatcherLoop = GetCodePtr();