Permalink
Browse files

Making the number of bits a compile time constant

  • Loading branch information...
1 parent 944506e commit ac88298df6f14aee8bcc6b84d63a8e952539a872 @grundprinzip committed Mar 12, 2012
Showing with 124 additions and 45 deletions.
  1. +5 −3 Makefile
  2. +44 −36 bcv.h
  3. +8 −0 bcv.sublime-project
  4. +67 −6 main.cpp
View
@@ -1,10 +1,12 @@
SHELL = /bin/bash
BUILD_DIR=build
+CXXFLAGS= -g2 -mtune=native -mssse3 -msse4.1
+
all: gen
- mkdir $(BUILD_DIR)
- g++ -o $(BUILD_DIR)/main main.cpp -g2
- g++ -O3 -o $(BUILD_DIR)/main_opt main.cpp -g2 -DNDEBUG
+ mkdir -p $(BUILD_DIR)
+ g++ -o $(BUILD_DIR)/main main.cpp $(CXXFLAGS)
+ g++ -O3 -o $(BUILD_DIR)/main_opt main.cpp -g2 -DNDEBUG $(CXXFLAGS)
gen:
cat mask_tpl.h > mask.h
View
@@ -1,3 +1,4 @@
+#include <algorithm>
#include <iostream>
#include <memory>
#include <stdexcept>
@@ -10,14 +11,25 @@
#include <stdlib.h>
#include <string.h>
+// SSE requirements
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <smmintrin.h>
+
#ifndef NDEBUG
+
#define DEBUG(msg) std::cout << msg << std::endl;
+#define DEBUG_M128(m) std::cout << (uint64_t) _mm_extract_epi64(m, 0) << " " << (uint64_t) _mm_extract_epi64(m, 1) << std::endl; // debug helper: prints both 64-bit lanes of an __m128i, low lane (0) first, then high lane (1)
+
#else
+
#define DEBUG(msg)
+
#endif
+
BUILD_MASK_HEADER
/*
@@ -27,7 +39,7 @@ BUILD_MASK_HEADER
*/
-template<typename T>
+template<typename T, uint8_t B>
class BitCompressedVector
{
@@ -41,9 +53,9 @@ class BitCompressedVector
/*
* Constructor
*/
- BitCompressedVector(size_t size, unsigned char bits): _bits(bits), _reserved(size)
+ BitCompressedVector(size_t size): _reserved(size)
{
- _allocated_blocks = (size * bits) / (sizeof(data_t) * 8) + 1;
+ _allocated_blocks = (size * B) / (sizeof(data_t) * 8) + 1;
posix_memalign((void**) &_data, 64, _allocated_blocks * sizeof(data_t));
memset(_data, 0, _allocated_blocks * sizeof(data_t));
}
@@ -97,9 +109,9 @@ class BitCompressedVector
struct BitVectorProxy
{
size_t _index;
- BitCompressedVector<T> *_vector;
+ BitCompressedVector<T, B> *_vector;
- BitVectorProxy(size_t idx, BitCompressedVector<T> *v): _index(idx), _vector(v)
+ BitVectorProxy(size_t idx, BitCompressedVector<T, B> *v): _index(idx), _vector(v)
{}
// Implicit conversion operator used for rvalues of T
@@ -148,10 +160,6 @@ class BitCompressedVector
static const uint8_t _width = sizeof(data_t) * 8;
static const uint64_t _num_blocks = CACHE_LINE_SIZE / sizeof(data_t);
-
- // Number of bits to use
- byte _bits;
-
// Pointer to the data
data_t *_data;
@@ -162,20 +170,20 @@ class BitCompressedVector
// get the position of an index inside the list of data values
inline size_t _getPos(size_t index) const
{
- return (index * _bits) / _width;
+ return (index * B) / _width;
}
// get the offset of an index inside a block
inline size_t _getOffset(size_t index, size_t base) const
{
- return (index * _bits) - base;
+ return (index * B) - base;
}
};
-template<typename T>
-void BitCompressedVector<T>::mget(const size_t index, value_type_ptr data, size_t *actual) const
+template<typename T, uint8_t B>
+void BitCompressedVector<T, B>::mget(const size_t index, value_type_ptr data, size_t *actual) const
{
// First get the initial values
data_t pos = _getPos(index);
@@ -187,16 +195,16 @@ void BitCompressedVector<T>::mget(const size_t index, value_type_ptr data, size_
data_t bounds = _width - offset;
// Base Mask
- data_t baseMask = global_bit_masks[_bits];
+ data_t baseMask = global_bit_masks[B];
// Counter and block
size_t counter = 0;
// Align the block according to the offset
data_t block = _data[pos] >> offset;
- size_t left = (_num_blocks * _width) / _bits;
- size_t current = (pos * _width + offset) / _bits;
+ size_t left = (_num_blocks * _width) / B;
+ size_t current = (pos * _width + offset) / B;
size_t upper = left < (_reserved - current) ? left : _reserved - current;
while(counter < upper)
@@ -205,14 +213,14 @@ void BitCompressedVector<T>::mget(const size_t index, value_type_ptr data, size_
// Extract the value
currentValue = (baseMask & block);
- if (bounds > _bits)
+ if (bounds > B)
{
- bounds -= _bits;
- block >>= _bits;
+ bounds -= B;
+ block >>= B;
} else {
- offset = _bits - bounds;
+ offset = B - bounds;
mask = global_bit_masks[offset];
currentValue |= (mask & _data[++pos]) << bounds;
@@ -228,8 +236,8 @@ void BitCompressedVector<T>::mget(const size_t index, value_type_ptr data, size_
*actual = counter;
}
-template<typename T>
-void BitCompressedVector<T>::mget_fixed(const size_t index, value_type_ptr data, size_t *limit) const
+template<typename T, uint8_t B>
+void BitCompressedVector<T, B>::mget_fixed(const size_t index, value_type_ptr data, size_t *limit) const
{
// First get the initial values
data_t pos = _getPos(index);
@@ -241,7 +249,7 @@ void BitCompressedVector<T>::mget_fixed(const size_t index, value_type_ptr data,
data_t bounds = _width - offset;
// Base Mask
- data_t baseMask = global_bit_masks[_bits];
+ data_t baseMask = global_bit_masks[B];
// Align the block according to the offset
data_t block = _data[pos] >> offset;
@@ -254,14 +262,14 @@ void BitCompressedVector<T>::mget_fixed(const size_t index, value_type_ptr data,
// Extract the value
currentValue = (baseMask & block);
- if (bounds > _bits)
+ if (bounds > B)
{
- bounds -= _bits;
- block >>= _bits;
+ bounds -= B;
+ block >>= B;
} else {
- offset = _bits - bounds;
+ offset = B - bounds;
mask = global_bit_masks[offset];
currentValue |= (mask & _data[++pos]) << bounds;
@@ -279,22 +287,22 @@ void BitCompressedVector<T>::mget_fixed(const size_t index, value_type_ptr data,
*limit = counter;
}
-template<typename T>
-void BitCompressedVector<T>::set(const size_t index, const value_type v)
+template<typename T, uint8_t B>
+void BitCompressedVector<T, B>::set(const size_t index, const value_type v)
{
data_t pos = _getPos(index);
data_t offset = _getOffset(index, pos * _width);
data_t bounds = _width - offset;
data_t mask, baseMask;
- baseMask = global_bit_masks[_bits];
+ baseMask = global_bit_masks[B];
mask = ~(baseMask << offset);
_data[pos] &= mask;
_data[pos] = _data[pos] | ((data_t) v << offset);
- if (bounds < _bits)
+ if (bounds < B)
{
mask = ~(baseMask << offset); // we have an overflow here, which is why we do not need to care about the original stuff
@@ -304,8 +312,8 @@ void BitCompressedVector<T>::set(const size_t index, const value_type v)
}
-template<typename T>
-typename BitCompressedVector<T>::value_type BitCompressedVector<T>::get(const size_t index) const
+template<typename T, uint8_t B>
+typename BitCompressedVector<T, B>::value_type BitCompressedVector<T, B>::get(const size_t index) const
{
value_type result;
data_t mask;
@@ -314,14 +322,14 @@ typename BitCompressedVector<T>::value_type BitCompressedVector<T>::get(const si
data_t offset = _getOffset(index, pos * _width);
data_t bounds = _width - offset; // This is almost a static expression that could be handled with a switch case
- mask = global_bit_masks[_bits];
+ mask = global_bit_masks[B];
mask <<= offset;
result = (mask & _data[pos]) >> offset;
- if (bounds < _bits)
+ if (bounds < B)
{
- data_t b = _bits - bounds;
+ data_t b = B - bounds;
mask = global_bit_masks[b];
result |= (mask & _data[pos + 1]) << bounds;
View
@@ -0,0 +1,8 @@
+{
+ "folders":
+ [
+ {
+ "path": "/Users/grund/development/tmp/bcv"
+ }
+ ]
+}
View
@@ -9,10 +9,70 @@
#define BITS 5
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <smmintrin.h>
+
+void pshufb_test(long SIZE) // Experiment: unpack four packed 5-bit values with byte shuffle + multiply + shift and print them; SIZE is not used in the body
+{
+ __m128i input, mask;
+
+ // Low 64 bits hold 100384 (0x18820), i.e. the values 0, 1, 2, 3 packed as consecutive 5-bit fields; the high 64 bits are 0
+ input = _mm_set_epi64((__m64) 0ll, (__m64) 100384ll);
+
+ // Now shuffle the bytes so that every 32-bit lane receives the two source bytes that contain one field
+ // Bit ranges of the packed fields: v1 = 0..4, v2 = 5..9, v3 = 10..14, v4 = 15..19
+
+ // Two bytes is the max copy needed for this element size of 5 bits. NOTE(review): the source bytes in the table below look 1-based, while the mask uses 0-based indices (0x00..0x02)
+ // src 1,2 -> dest 0,1     clear 2,3     field at bit offset 0 inside the copied byte pair, 5-bit mask
+ // src 1,2 -> dest 4,5     clear 6,7     field at bit offset 5 inside the copied byte pair, 5-bit mask
+ // src 2,3 -> dest 8,9     clear 10,11   field at bit offset 2 inside the copied byte pair, 5-bit mask
+ // src 2,3 -> dest 12,13   clear 14,15   field at bit offset 7 inside the copied byte pair, 5-bit mask
+ mask = _mm_set_epi8( // arguments are listed from byte 15 down to byte 0; a control byte of 0x80 (high bit set) zeroes that destination byte
+ 0x80, 0x80,
+ 0x02, 0x01,
+ 0x80, 0x80,
+ 0x02, 0x01,
+ 0x80, 0x80,
+ 0x01, 0x00,
+ 0x80, 0x80,
+ 0x01, 0x00
+ );
+
+ // Shuffle: gather the selected source bytes into the four 32-bit lanes
+ __m128i result = _mm_shuffle_epi8(input, mask);
+
+ std::cout << (uint64_t) _mm_extract_epi64(result,0) << std::endl; // debug output: low 64 bits after the shuffle
+ std::cout << (uint64_t) _mm_extract_epi64(result,1) << std::endl; // debug output: high 64 bits after the shuffle
+
+ // Align by independent shifts: multiplying a lane by a power of two shifts it left (lanes 0..3: x128, x4, x32, x1 = << 7, 2, 5, 0)
+ __m128i mult_mask = _mm_set_epi32(1,32,4,128);
+ __m128i mult = _mm_mullo_epi32(result, mult_mask);
+
+ std::cout << (uint64_t) _mm_extract_epi64(mult,0) << std::endl; // debug output: low 64 bits after the multiply
+ std::cout << (uint64_t) _mm_extract_epi64(mult,1) << std::endl; // debug output: high 64 bits after the multiply
+
+ // Combined shift right by 7 in all lanes: together with the multiply the net shift per lane is >> 0, 5, 2, 7, which moves each field down to bit 0
+ __m128i shifter = _mm_srli_epi32(mult, 7);
+
+ // And mask everything: keep only the low 5 bits (31) of each lane when printing
+ int a,b,c,d;
+ a = _mm_extract_epi32(shifter, 0);
+ b = _mm_extract_epi32(shifter, 1);
+ c = _mm_extract_epi32(shifter, 2);
+ d = _mm_extract_epi32(shifter, 3);
+
+ std::cout << " " << (a & 31) // for the input above this is expected to print the unpacked fields 0 1 2 3
+ << " " << (b & 31)
+ << " " << (c & 31)
+ << " " << (d & 31) << std::endl;
+}
+
+
void test_set(long SIZE)
{
std::cout << "[TEST ] set/get interleaved ..." << std::flush;
- BitCompressedVector<int> v(SIZE, BITS);
+ BitCompressedVector<int, BITS> v(SIZE);
for(size_t i=0; i < SIZE; ++i)
{
int a = i % (1UL << BITS);
@@ -25,7 +85,7 @@ void test_set(long SIZE)
void test_get(long SIZE)
{
std::cout << "[TEST ] set/get separated ..." << std::flush;
- BitCompressedVector<int> v(SIZE, BITS);
+ BitCompressedVector<int, BITS> v(SIZE);
for(size_t i=0; i < SIZE; ++i)
{
int a = i % (1UL << BITS);
@@ -44,7 +104,7 @@ void test_mget(long SIZE)
{
std::cout << "[TEST ] set/mget separated ..." << std::flush;
long sum = 0, sum2 = 0;
- BitCompressedVector<int> v(SIZE, BITS);
+ BitCompressedVector<int, BITS> v(SIZE);
for(size_t i=0; i < SIZE; ++i)
{
int a = i % (1UL << BITS);
@@ -79,7 +139,7 @@ void test_mget_fixed(long SIZE)
{
std::cout << "[TEST ] set/mget_fixed separated ..." << std::flush;
long sum = 0, sum2 = 0;
- BitCompressedVector<int> v(SIZE, BITS);
+ BitCompressedVector<int, BITS> v(SIZE);
for(size_t i=0; i < SIZE; ++i)
{
int a = i % (1UL << BITS);
@@ -114,7 +174,7 @@ void fill(C& v, size_t size)
void performance(size_t size)
{
- BitCompressedVector<int> v(size, BITS);
+ BitCompressedVector<int, BITS> v(size);
std::vector<int> v2(size);
fill(v, size);
@@ -218,11 +278,12 @@ int main(int argc, char* argv[])
// Setting size
long SIZE = atol(argv[1]);
+ //pshufb_test(SIZE);
+
test_set(SIZE);
test_get(SIZE);
test_mget(SIZE);
test_mget_fixed(SIZE);
-
performance(SIZE);
return 0;

0 comments on commit ac88298

Please sign in to comment.