diff --git a/src/main/java/me/lemire/integercompression/BitPacking.java b/src/main/java/me/lemire/integercompression/BitPacking.java index e83c9e0..8652be4 100644 --- a/src/main/java/me/lemire/integercompression/BitPacking.java +++ b/src/main/java/me/lemire/integercompression/BitPacking.java @@ -1690,7 +1690,7 @@ protected static void fastpack9(final int[] in, int inpos, } /** - * Unpack 32 integers + * Pack without mask 32 integers * * @param in * source array @@ -3005,7 +3005,7 @@ protected static void fastpackwithoutmask9(final int[] in, int inpos, } /** - * Pack the 32 integers + * Unpack the 32 integers * * @param in * source array diff --git a/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java b/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java index ca9d0ad..2f8c709 100644 --- a/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java +++ b/src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java @@ -134,6 +134,9 @@ public void uncompress(int[] inBuf, IntWrapper inPos, int inLen, * In case you need a different way to allocate buffers, you can override this method * with a custom behavior. The default implementation allocates a new Java direct * {@link ByteBuffer} on each invocation. + * + * @param sizeInBytes + * @return */ protected ByteBuffer makeBuffer(int sizeInBytes) { return ByteBuffer.allocateDirect(sizeInBytes); diff --git a/src/main/java/me/lemire/integercompression/FastPFOR.java b/src/main/java/me/lemire/integercompression/FastPFOR.java index 36226c0..47969f4 100644 --- a/src/main/java/me/lemire/integercompression/FastPFOR.java +++ b/src/main/java/me/lemire/integercompression/FastPFOR.java @@ -336,6 +336,9 @@ public String toString() { * In case you need a different way to allocate buffers, you can override this method * with a custom behavior. The default implementation allocates a new Java direct * {@link ByteBuffer} on each invocation. + * + * @param sizeInBytes + * @return */ protected ByteBuffer makeBuffer(int sizeInBytes) { return ByteBuffer.allocateDirect(sizeInBytes); diff --git a/src/main/java/me/lemire/integercompression/FastPFOR128.java b/src/main/java/me/lemire/integercompression/FastPFOR128.java index b124072..83a3e1f 100644 --- a/src/main/java/me/lemire/integercompression/FastPFOR128.java +++ b/src/main/java/me/lemire/integercompression/FastPFOR128.java @@ -317,6 +317,9 @@ public String toString() { * In case you need a different way to allocate buffers, you can override this method * with a custom behavior. The default implementation allocates a new Java direct * {@link ByteBuffer} on each invocation. + * + * @param sizeInBytes + * @return */ protected ByteBuffer makeBuffer(int sizeInBytes) { return ByteBuffer.allocateDirect(sizeInBytes); diff --git a/src/main/java/me/lemire/integercompression/IntCompressor.java b/src/main/java/me/lemire/integercompression/IntCompressor.java index 87e7bde..abaeea9 100644 --- a/src/main/java/me/lemire/integercompression/IntCompressor.java +++ b/src/main/java/me/lemire/integercompression/IntCompressor.java @@ -36,7 +36,8 @@ public IntCompressor() { * @throws UncompressibleInputException if the data is too poorly compressible */ public int[] compress(int[] input) { - int [] compressed = new int[input.length + input.length / 100 + 1024]; + int[] compressed = new int[input.length + input.length / 100 + 1024]; + // Store at index=0 the length of the input, hence enabling .headlessCompress compressed[0] = input.length; IntWrapper outpos = new IntWrapper(1); try { @@ -58,6 +59,7 @@ public int[] compress(int[] input) { * @return uncompressed array */ public int[] uncompress(int[] compressed) { + // Read at index=0 the length of the input, hence enabling .headlessUncompress int[] decompressed = new int[compressed[0]]; IntWrapper inpos = new IntWrapper(1); codec.headlessUncompress(compressed, inpos, diff --git a/src/main/java/me/lemire/integercompression/IntegerCODEC.java b/src/main/java/me/lemire/integercompression/IntegerCODEC.java index f2c9c7a..1dd9a4c 100644 --- a/src/main/java/me/lemire/integercompression/IntegerCODEC.java +++ b/src/main/java/me/lemire/integercompression/IntegerCODEC.java @@ -25,7 +25,7 @@ public interface IntegerCODEC { * @param in * input array * @param inpos - * location in the input array + * where to start reading in the array * @param inlength * how many integers to compress * @param out @@ -52,7 +52,7 @@ public void compress(int[] in, IntWrapper inpos, int inlength, * @param out * array where to write the compressed output * @param outpos - * where to write the compressed output in out + * where to start writing the uncompressed output in out */ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos); diff --git a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java index 8b4dd8b..66143b9 100644 --- a/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java +++ b/src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java @@ -13,8 +13,8 @@ * variation on the IntegerCODEC interface meant to be used for random access * (i.e., given a large array, you can segment it and decode just the subarray you need). * - * The main difference is that we must specify the number of integers we wish to - * decode. This information should be stored elsewhere. + * The main difference is that you must specify the number of integers you wish to + * uncompress. This information should be stored elsewhere. * * This interface was designed by the Terrier team for their search engine. * @@ -30,10 +30,13 @@ public interface SkippableIntegerCODEC { * inpos will be incremented by 12 while outpos will be incremented by 3. We * use IntWrapper to pass the values by reference. * + * Implementation note: contrary to {@link IntegerCODEC#compress}, + * this may skip writing information about the number of encoded integers. + * * @param in * input array * @param inpos - * location in the input array + * where to start reading in the array * @param inlength * how many integers to compress * @param out @@ -57,11 +60,11 @@ public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out * @param inlength * length of the compressed data (ignored by some schemes) * @param out - * array where to write the compressed output + * array where to write the uncompressed output * @param outpos - * where to write the compressed output in out + * where to start writing the uncompressed output in out * @param num - * number of integers we want to decode, the actual number of integers decoded can be less + * number of integers we want to decode. May be less than the actual number of compressed integers */ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num); diff --git a/src/main/java/me/lemire/integercompression/VariableByte.java b/src/main/java/me/lemire/integercompression/VariableByte.java index 09e479b..92cfaeb 100644 --- a/src/main/java/me/lemire/integercompression/VariableByte.java +++ b/src/main/java/me/lemire/integercompression/VariableByte.java @@ -214,6 +214,9 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o * In case you need a different way to allocate buffers, you can override this method * with a custom behavior. The default implementation allocates a new Java direct * {@link ByteBuffer} on each invocation. + * + * @param sizeInBytes + * @return */ protected ByteBuffer makeBuffer(int sizeInBytes) { return ByteBuffer.allocateDirect(sizeInBytes); diff --git a/src/main/java/me/lemire/longcompression/ByteLongCODEC.java b/src/main/java/me/lemire/longcompression/ByteLongCODEC.java index e405370..dbc6864 100644 --- a/src/main/java/me/lemire/longcompression/ByteLongCODEC.java +++ b/src/main/java/me/lemire/longcompression/ByteLongCODEC.java @@ -57,6 +57,6 @@ public void compress(long[] in, IntWrapper inpos, int inlength, * where to write the compressed output in out */ public void uncompress(byte[] in, IntWrapper inpos, int inlength, - long[] out, IntWrapper outpos); + long[] out, IntWrapper outpos); } diff --git a/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java b/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java index 3b2bc76..35c1166 100644 --- a/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java +++ b/src/main/java/me/lemire/longcompression/LongAs2IntsCodec.java @@ -16,174 +16,174 @@ * */ public class LongAs2IntsCodec implements LongCODEC { - final IntegerCODEC highPartsCodec; - final IntegerCODEC lowPartsCodec; - - public LongAs2IntsCodec(IntegerCODEC highPartsCodec, IntegerCODEC lowPartsCodec) { - this.highPartsCodec = highPartsCodec; - this.lowPartsCodec = lowPartsCodec; - } - - /** - * By default, we expect longs to be slightly above Integer.MAX_VALUE. Hence highParts to be small and positive - * integers. For lowParts, we rely on {@link IntCompressor} default IntegerCODEC - */ - public LongAs2IntsCodec() { - this(new VariableByte(), new Composition(new BinaryPacking(), new VariableByte())); - } - - @Override - public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) { - if (inlength == 0) { - return; - } - - int[] highParts = new int[inlength]; - int[] lowParts = new int[inlength]; - - for (int i = 0; i < inlength; i++) { - int inPosition = inpos.get() + i; - - highParts[i] = RoaringIntPacking.high(in[inPosition]); - lowParts[i] = RoaringIntPacking.low(in[inPosition]); - } - - // TODO What would be a relevant buffer size? - int[] buffer = new int[inlength * 16]; - - int outPosition = outpos.get(); - - boolean hasLeftover; - { - // The first integer is reserved to hold the number of compressed ints - IntWrapper highPartsOutPosition = new IntWrapper(1); - - highPartsCodec.compress(highParts, new IntWrapper(), inlength, buffer, highPartsOutPosition); - - // Record the compressedHighparts length - buffer[0] = highPartsOutPosition.get() - 1; - - for (int i = 0; i < highPartsOutPosition.get() / 2; i++) { - long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]); - out[outPosition++] = pack; - } - - if (1 == highPartsOutPosition.get() % 2) { - // Shift the trailing integer as first in the buffer - hasLeftover = true; - buffer[0] = buffer[highPartsOutPosition.get() - 1]; - } else { - hasLeftover = false; - } - } - - { - // The first integer is reserved to hold the number of compressed ints - IntWrapper lowPartsOutPosition = new IntWrapper(1); - if (hasLeftover) { - // Keep the trailing int from highParts before the reserved int from lowParts compressed length - lowPartsOutPosition.set(2); - } - - lowPartsCodec.compress(lowParts, new IntWrapper(0), inlength, buffer, lowPartsOutPosition); - - // Record the compressedHighparts length - buffer[hasLeftover ? 1 : 0] = lowPartsOutPosition.get() - (hasLeftover ? 2 : 1); - - for (int i = 0; i < lowPartsOutPosition.get() / 2; i++) { - long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]); - out[outPosition++] = pack; - } - - if (1 == lowPartsOutPosition.get() % 2) { - // The trailing integer is packed with a 0 - long pack = RoaringIntPacking.pack(buffer[lowPartsOutPosition.get() - 1], 0); - out[outPosition++] = pack; - } - } - - inpos.add(inlength); - outpos.set(outPosition); - } - - /** - * inlength is ignored by this codec. We may rely on it instead of storing the compressedLowPart length - */ - @Override - public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) { - if (inlength == 0) { - return; - } - - int longIndex = inpos.get(); - - int nbCompressedHighParts = RoaringIntPacking.high(in[longIndex]); - int[] compressedHighParts = new int[nbCompressedHighParts]; - - // !highPart as we just read the highPart for nbCompressedHighParts - boolean highPart = false; - for (int i = 0; i < nbCompressedHighParts; i++) { - int nextInt; - if (highPart) { - nextInt = RoaringIntPacking.high(in[longIndex + (i + 1) / 2]); - } else { - nextInt = RoaringIntPacking.low(in[longIndex + (i + 1) / 2]); - } - compressedHighParts[i] = nextInt; - - highPart = !highPart; - } - - // TODO What would be a relevant buffer size? - int[] buffer = new int[inlength * 16]; - - IntWrapper highPartsOutPosition = new IntWrapper(); - highPartsCodec.uncompress(compressedHighParts, - new IntWrapper(), - compressedHighParts.length, - buffer, - highPartsOutPosition); - int[] highParts = Arrays.copyOf(buffer, highPartsOutPosition.get()); - - // +1 as we initially read nbCompressedHighParts - int intIndexNbCompressedLowParts = longIndex * 2 + 1 + nbCompressedHighParts; - int nbCompressedLowParts; - if (highPart) { - nbCompressedLowParts = RoaringIntPacking.high(in[intIndexNbCompressedLowParts / 2]); - } else { - nbCompressedLowParts = RoaringIntPacking.low(in[intIndexNbCompressedLowParts / 2]); - } - highPart = !highPart; - - int[] compressedLowParts = new int[nbCompressedLowParts]; - for (int i = 0; i < nbCompressedLowParts; i++) { - int nextInt; - if (highPart) { - nextInt = RoaringIntPacking.high(in[(intIndexNbCompressedLowParts + 1 + i) / 2]); - } else { - nextInt = RoaringIntPacking.low(in[(intIndexNbCompressedLowParts + 1 + i) / 2]); - } - compressedLowParts[i] = nextInt; - - highPart = !highPart; - } - - IntWrapper lowPartsOutPosition = new IntWrapper(); - lowPartsCodec.uncompress(compressedLowParts, - new IntWrapper(), - compressedLowParts.length, - buffer, - lowPartsOutPosition); - int[] lowParts = Arrays.copyOf(buffer, lowPartsOutPosition.get()); - assert highParts.length == lowParts.length; - - int outposition = outpos.get(); - for (int i = 0; i < highParts.length; i++) { - out[outposition++] = RoaringIntPacking.pack(highParts[i], lowParts[i]); - } - - inpos.add(inlength); - outpos.set(outposition); - } + final IntegerCODEC highPartsCodec; + final IntegerCODEC lowPartsCodec; + + public LongAs2IntsCodec(IntegerCODEC highPartsCodec, IntegerCODEC lowPartsCodec) { + this.highPartsCodec = highPartsCodec; + this.lowPartsCodec = lowPartsCodec; + } + + /** + * By default, we expect longs to be slightly above Integer.MAX_VALUE. Hence highParts to be small and positive + * integers. For lowParts, we rely on {@link IntCompressor} default IntegerCODEC + */ + public LongAs2IntsCodec() { + this(new VariableByte(), new Composition(new BinaryPacking(), new VariableByte())); + } + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) { + if (inlength == 0) { + return; + } + + int[] highParts = new int[inlength]; + int[] lowParts = new int[inlength]; + + for (int i = 0; i < inlength; i++) { + int inPosition = inpos.get() + i; + + highParts[i] = RoaringIntPacking.high(in[inPosition]); + lowParts[i] = RoaringIntPacking.low(in[inPosition]); + } + + // TODO What would be a relevant buffer size? + int[] buffer = new int[inlength * 16]; + + int outPosition = outpos.get(); + + boolean hasLeftover; + { + // The first integer is reserved to hold the number of compressed ints + IntWrapper highPartsOutPosition = new IntWrapper(1); + + highPartsCodec.compress(highParts, new IntWrapper(), inlength, buffer, highPartsOutPosition); + + // Record the compressedHighparts length + buffer[0] = highPartsOutPosition.get() - 1; + + for (int i = 0; i < highPartsOutPosition.get() / 2; i++) { + long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]); + out[outPosition++] = pack; + } + + if (1 == highPartsOutPosition.get() % 2) { + // Shift the trailing integer as first in the buffer + hasLeftover = true; + buffer[0] = buffer[highPartsOutPosition.get() - 1]; + } else { + hasLeftover = false; + } + } + + { + // The first integer is reserved to hold the number of compressed ints + IntWrapper lowPartsOutPosition = new IntWrapper(1); + if (hasLeftover) { + // Keep the trailing int from highParts before the reserved int from lowParts compressed length + lowPartsOutPosition.set(2); + } + + lowPartsCodec.compress(lowParts, new IntWrapper(0), inlength, buffer, lowPartsOutPosition); + + // Record the compressedHighparts length + buffer[hasLeftover ? 1 : 0] = lowPartsOutPosition.get() - (hasLeftover ? 2 : 1); + + for (int i = 0; i < lowPartsOutPosition.get() / 2; i++) { + long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]); + out[outPosition++] = pack; + } + + if (1 == lowPartsOutPosition.get() % 2) { + // The trailing integer is packed with a 0 + long pack = RoaringIntPacking.pack(buffer[lowPartsOutPosition.get() - 1], 0); + out[outPosition++] = pack; + } + } + + inpos.add(inlength); + outpos.set(outPosition); + } + + /** + * inlength is ignored by this codec. We may rely on it instead of storing the compressedLowPart length + */ + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) { + if (inlength == 0) { + return; + } + + int longIndex = inpos.get(); + + int nbCompressedHighParts = RoaringIntPacking.high(in[longIndex]); + int[] compressedHighParts = new int[nbCompressedHighParts]; + + // !highPart as we just read the highPart for nbCompressedHighParts + boolean highPart = false; + for (int i = 0; i < nbCompressedHighParts; i++) { + int nextInt; + if (highPart) { + nextInt = RoaringIntPacking.high(in[longIndex + (i + 1) / 2]); + } else { + nextInt = RoaringIntPacking.low(in[longIndex + (i + 1) / 2]); + } + compressedHighParts[i] = nextInt; + + highPart = !highPart; + } + + // TODO What would be a relevant buffer size? + int[] buffer = new int[inlength * 16]; + + IntWrapper highPartsOutPosition = new IntWrapper(); + highPartsCodec.uncompress(compressedHighParts, + new IntWrapper(), + compressedHighParts.length, + buffer, + highPartsOutPosition); + int[] highParts = Arrays.copyOf(buffer, highPartsOutPosition.get()); + + // +1 as we initially read nbCompressedHighParts + int intIndexNbCompressedLowParts = longIndex * 2 + 1 + nbCompressedHighParts; + int nbCompressedLowParts; + if (highPart) { + nbCompressedLowParts = RoaringIntPacking.high(in[intIndexNbCompressedLowParts / 2]); + } else { + nbCompressedLowParts = RoaringIntPacking.low(in[intIndexNbCompressedLowParts / 2]); + } + highPart = !highPart; + + int[] compressedLowParts = new int[nbCompressedLowParts]; + for (int i = 0; i < nbCompressedLowParts; i++) { + int nextInt; + if (highPart) { + nextInt = RoaringIntPacking.high(in[(intIndexNbCompressedLowParts + 1 + i) / 2]); + } else { + nextInt = RoaringIntPacking.low(in[(intIndexNbCompressedLowParts + 1 + i) / 2]); + } + compressedLowParts[i] = nextInt; + + highPart = !highPart; + } + + IntWrapper lowPartsOutPosition = new IntWrapper(); + lowPartsCodec.uncompress(compressedLowParts, + new IntWrapper(), + compressedLowParts.length, + buffer, + lowPartsOutPosition); + int[] lowParts = Arrays.copyOf(buffer, lowPartsOutPosition.get()); + assert highParts.length == lowParts.length; + + int outposition = outpos.get(); + for (int i = 0; i < highParts.length; i++) { + out[outposition++] = RoaringIntPacking.pack(highParts[i], lowParts[i]); + } + + inpos.add(inlength); + outpos.set(outposition); + } } diff --git a/src/main/java/me/lemire/longcompression/LongBinaryPacking.java b/src/main/java/me/lemire/longcompression/LongBinaryPacking.java new file mode 100644 index 0000000..33bb8f1 --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongBinaryPacking.java @@ -0,0 +1,143 @@ +package me.lemire.longcompression; + +import me.lemire.integercompression.BinaryPacking; +import me.lemire.integercompression.IntWrapper; +import me.lemire.integercompression.Util; + +/** + * Scheme based on a commonly used idea: can be extremely fast. + * It encodes integers in blocks of 64 longs. For arrays containing + * an arbitrary number of longs, you should use it in conjunction + * with another CODEC: + * + *
LongCODEC ic = + * new Composition(new LongBinaryPacking(), new LongVariableByte()).+ * + * Note that this does not use differential coding: if you are working on sorted + * lists, you must compute the deltas separately. + * + *
+ * For details, please see {@link BinaryPacking} + *
+ * + * @author Benoit Lacelle + */ +public final class LongBinaryPacking implements LongCODEC, SkippableLongCODEC { + final static int BLOCK_SIZE = 64; + + @Override + public void compress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + inlength = Util.greatestMultiple(inlength, BLOCK_SIZE); + if (inlength == 0) + return; + out[outpos.get()] = inlength; + outpos.increment(); + headlessCompress(in, inpos, inlength, out, outpos); + } + + @Override + public void headlessCompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + inlength = Util.greatestMultiple(inlength, BLOCK_SIZE); + int tmpoutpos = outpos.get(); + int s = inpos.get(); + // Compress by block of 8 * 64 longs as much as possible + for (; s + BLOCK_SIZE * 8 - 1 < inpos.get() + inlength; s += BLOCK_SIZE * 8) { + // maxbits can be anything between 0 and 64 included: expressed within a byte (1 << 6) + final int mbits1 = LongUtil.maxbits(in, s + 0 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits2 = LongUtil.maxbits(in, s + 1 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits3 = LongUtil.maxbits(in, s + 2 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits4 = LongUtil.maxbits(in, s + 3 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits5 = LongUtil.maxbits(in, s + 4 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits6 = LongUtil.maxbits(in, s + 5 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits7 = LongUtil.maxbits(in, s + 6 * BLOCK_SIZE, BLOCK_SIZE); + final int mbits8 = LongUtil.maxbits(in, s + 7 * BLOCK_SIZE, BLOCK_SIZE); + // The first long expressed the maxbits for the 8 buckets + out[tmpoutpos++] = ((long) mbits1 << 56) | ((long) mbits2 << 48) | ((long) mbits3 << 40) | ((long) mbits4 << 32) | (mbits5 << 24) | (mbits6 << 16) | (mbits7 << 8) | (mbits8); + LongBitPacking.fastpackwithoutmask(in, s + 0 * BLOCK_SIZE, out, tmpoutpos, (int) mbits1); + tmpoutpos += mbits1; + LongBitPacking.fastpackwithoutmask(in, s + 1 * BLOCK_SIZE, out, tmpoutpos, (int) mbits2); + tmpoutpos += mbits2; + LongBitPacking.fastpackwithoutmask(in, s + 2 * BLOCK_SIZE, out, tmpoutpos, (int) mbits3); + tmpoutpos += mbits3; + LongBitPacking.fastpackwithoutmask(in, s + 3 * BLOCK_SIZE, out, tmpoutpos, (int) mbits4); + tmpoutpos += mbits4; + LongBitPacking.fastpackwithoutmask(in, s + 4 * BLOCK_SIZE, out, tmpoutpos, (int) mbits5); + tmpoutpos += mbits5; + LongBitPacking.fastpackwithoutmask(in, s + 5 * BLOCK_SIZE, out, tmpoutpos, (int) mbits6); + tmpoutpos += mbits6; + LongBitPacking.fastpackwithoutmask(in, s + 6 * BLOCK_SIZE, out, tmpoutpos, (int) mbits7); + tmpoutpos += mbits7; + LongBitPacking.fastpackwithoutmask(in, s + 7 * BLOCK_SIZE, out, tmpoutpos, (int) mbits8); + tmpoutpos += mbits8; + } + // Then we compress up to 7 blocks of 64 longs + for (; s < inpos.get() + inlength; s += BLOCK_SIZE ) { + final int mbits = LongUtil.maxbits(in, s, BLOCK_SIZE); + out[tmpoutpos++] = mbits; + LongBitPacking.fastpackwithoutmask(in, s, out, tmpoutpos, mbits); + tmpoutpos += mbits; + } + inpos.add(inlength); + outpos.set(tmpoutpos); + } + + @Override + public void uncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos) { + if (inlength == 0) + return; + final int outlength = (int) in[inpos.get()]; + inpos.increment(); + headlessUncompress(in,inpos, inlength,out,outpos,outlength); + } + + @Override + public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, + long[] out, IntWrapper outpos, int num) { + final int outlength = Util.greatestMultiple(num, BLOCK_SIZE); + int tmpinpos = inpos.get(); + int s = outpos.get(); + for (; s + BLOCK_SIZE * 8 - 1 < outpos.get() + outlength; s += BLOCK_SIZE * 8) { + final int mbits1 = (int) ((in[tmpinpos] >>> 56)); + final int mbits2 = (int) ((in[tmpinpos] >>> 48) & 0xFF); + final int mbits3 = (int) ((in[tmpinpos] >>> 40) & 0xFF); + final int mbits4 = (int) ((in[tmpinpos] >>> 32) & 0xFF); + final int mbits5 = (int) ((in[tmpinpos] >>> 24) & 0xFF); + final int mbits6 = (int) ((in[tmpinpos] >>> 16) & 0xFF); + final int mbits7 = (int) ((in[tmpinpos] >>> 8) & 0xFF); + final int mbits8 = (int) ((in[tmpinpos]) & 0xFF); + ++tmpinpos; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 0 * BLOCK_SIZE, mbits1); + tmpinpos += mbits1; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 1 * BLOCK_SIZE, mbits2); + tmpinpos += mbits2; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 2 * BLOCK_SIZE, mbits3); + tmpinpos += mbits3; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 3 * BLOCK_SIZE, mbits4); + tmpinpos += mbits4; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 4 * BLOCK_SIZE, mbits5); + tmpinpos += mbits5; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 5 * BLOCK_SIZE, mbits6); + tmpinpos += mbits6; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 6 * BLOCK_SIZE, mbits7); + tmpinpos += mbits7; + LongBitPacking.fastunpack(in, tmpinpos, out, s + 7 * BLOCK_SIZE, mbits8); + tmpinpos += mbits8; + } + for (; s < outpos.get() + outlength; s += BLOCK_SIZE ) { + final int mbits = (int) in[tmpinpos]; + ++tmpinpos; + LongBitPacking.fastunpack(in, tmpinpos, out, s, mbits); + tmpinpos += mbits; + } + outpos.add(outlength); + inpos.set(tmpinpos); + } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } +} diff --git a/src/main/java/me/lemire/longcompression/LongBitPacking.java b/src/main/java/me/lemire/longcompression/LongBitPacking.java new file mode 100644 index 0000000..2d282ec --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongBitPacking.java @@ -0,0 +1,146 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +package me.lemire.longcompression; + +import java.util.Arrays; + +/** + * Bitpacking routines + * + *For details, please see
+ *+ * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second + * through vectorization Software: Practice & Experience + * http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract + * http://arxiv.org/abs/1209.2137 + *
+ * + * @author Benoit Lacelle + * + */ +public final class LongBitPacking { + + /** + * Pack 64 longs + * + * @param in + * source array + * @param inpos + * position in source array + * @param out + * output array + * @param outpos + * position in output array + * @param bit + * number of bits to use per long + */ + public static void fastpackwithoutmask(final long[] in, final int inpos, + final long[] out, final int outpos, final int bit) { + if (bit == 0) { + fastpackwithoutmask0(in, inpos, out, outpos); + } else if (bit == 64) { + fastpackwithoutmask64(in, inpos, out, outpos); + } else if (bit > 0 && bit < 64) { + slowpackwithoutmask(in, inpos, out, outpos, bit); + } else { + throw new IllegalArgumentException("Unsupported bit width: " + bit); + } + } + + protected static void fastpackwithoutmask0(final long[] in, int inpos, + final long[] out, int outpos) { + // nothing + } + + protected static void fastpackwithoutmask64(final long[] in, int inpos, + final long[] out, int outpos) { + System.arraycopy(in, inpos, out, outpos, 64); + } + + protected static void slowpackwithoutmask(final long[] in, int inpos, + final long[] out, int outpos, final int bit) { + int bucket = 0; + int shift = 0; + + out[outpos + bucket] = 0L; + for (int i = 0 ; i < 64 ; i++) { + if (shift >= 64) { + bucket++; + out[bucket + outpos] = 0L; + shift -= 64; + + if (shift > 0) { + // There is some leftovers from previous input in the next bucket + out[outpos + bucket] |= in[inpos + i - 1] >> (bit - shift); + } + } + out[outpos + bucket] |= in[inpos + i] << shift; + + shift += bit; + } + } + + + /** + * Unpack the 64 longs + * + * @param in + * source array + * @param inpos + * starting point in the source array + * @param out + * output array + * @param outpos + * starting point in the output array + * @param bit + * how many bits to use per integer + */ + public static void fastunpack(final long[] in, final int inpos, + final long[] out, final int outpos, final int bit) { + if (bit == 0) { + fastunpack0(in, inpos, out, outpos); + } else if (bit == 64) { + fastunpack64(in, inpos, out, outpos); + } else if (bit > 0 && bit < 64) { + slowunpack(in, inpos, out, outpos, bit); + } else { + throw new IllegalArgumentException("Unsupported bit width: " + bit); + } + } + + + protected static void fastunpack0(final long[] in, int inpos, + final long[] out, int outpos) { + Arrays.fill(out, outpos, outpos + 64, 0); + } + + protected static void fastunpack64(final long[] in, int inpos, + final long[] out, int outpos) { + System.arraycopy(in, inpos, out, outpos, 64); + } + + protected static void slowunpack(final long[] in, int inpos, + final long[] out, int outpos, final int bit) { + int bucket = 0; + int shift = 0; + for (int i = 0 ; i < 64 ; i++) { + if (shift >= 64) { + bucket++; + shift -= 64; + + if (shift > 0) { + // There is some leftovers from previous input in the next bucket + out[outpos + i - 1] |= (in[inpos + bucket] << (bit - shift) & ((1L << bit) - 1)); + } + } + out[outpos + i] = ((in[inpos + bucket] >>> shift) & ((1L << bit) - 1)); + + shift += bit; + } + } +} diff --git a/src/main/java/me/lemire/longcompression/LongCODEC.java b/src/main/java/me/lemire/longcompression/LongCODEC.java index c0f67b2..1068f9f 100644 --- a/src/main/java/me/lemire/longcompression/LongCODEC.java +++ b/src/main/java/me/lemire/longcompression/LongCODEC.java @@ -27,7 +27,7 @@ public interface LongCODEC { * @param in * input array * @param inpos - * location in the input array + * where to start reading in the array * @param inlength * how many longs to compress * @param out @@ -52,9 +52,9 @@ public void compress(long[] in, IntWrapper inpos, int inlength, * length of the compressed data (ignored by some * schemes) * @param out - * array where to write the compressed output + * array where to write the uncompressed output * @param outpos - * where to write the compressed output in out + * where to start writing the uncompressed output in out */ public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos); diff --git a/src/main/java/me/lemire/longcompression/LongCompressor.java b/src/main/java/me/lemire/longcompression/LongCompressor.java new file mode 100644 index 0000000..a2c79fd --- /dev/null +++ b/src/main/java/me/lemire/longcompression/LongCompressor.java @@ -0,0 +1,75 @@ +package me.lemire.longcompression; + +import java.util.Arrays; + +import me.lemire.integercompression.IntWrapper; +import me.lemire.integercompression.UncompressibleInputException; + +/** + * This is a convenience class that wraps a codec to provide + * a "friendly" API. + * + * @author Benoit Lacelle + */ +public class LongCompressor { + + SkippableLongCODEC codec; + + /** + * Constructor wrapping a codec. + * + * @param c the underlying codec + */ + public LongCompressor(SkippableLongCODEC c) { + codec = c; + } + + /** + * Constructor with default codec. + */ + public LongCompressor() { + codec = new SkippableLongComposition(new LongBinaryPacking(), + new LongVariableByte()); + } + + /** + * Compress an array and returns the compressed result as a new array. + * + * @param input array to be compressed + * @return compressed array + * @throws UncompressibleInputException if the data is too poorly compressible + */ + public long[] compress(long[] input) { + long[] compressed = new long[input.length + input.length / 100 + 1024]; + // Store at index=0 the length of the input, hence enabling .headlessCompress + compressed[0] = input.length; + IntWrapper outpos = new IntWrapper(1); + try { + codec.headlessCompress(input, new IntWrapper(0), + input.length, compressed, outpos); + } catch (IndexOutOfBoundsException ioebe) { + throw new UncompressibleInputException("Your input is too poorly compressible " + + "with the current codec : "+codec); + } + compressed = Arrays.copyOf(compressed,outpos.intValue()); + return compressed; + } + + /** + * Uncompress an array and returns the uncompressed result as a new array. + * + * @param compressed compressed array + * @return uncompressed array + */ + public long[] uncompress(long[] compressed) { + // Read at index=0 the length of the input, hence enabling .headlessUncompress + long[] decompressed = new long[(int) compressed[0]]; + IntWrapper inpos = new IntWrapper(1); + codec.headlessUncompress(compressed, inpos, + compressed.length - inpos.intValue(), + decompressed, new IntWrapper(0), + decompressed.length); + return decompressed; + } + +} diff --git a/src/main/java/me/lemire/longcompression/LongUtil.java b/src/main/java/me/lemire/longcompression/LongUtil.java index c06433f..7bdce83 100644 --- a/src/main/java/me/lemire/longcompression/LongUtil.java +++ b/src/main/java/me/lemire/longcompression/LongUtil.java @@ -15,8 +15,38 @@ */ @Deprecated public class LongUtil { + + /** + * Compute the maximum of the integer logarithms (ceil(log(x+1)) of a range + * of value + * + * @param i + * source array + * @param pos + * starting position + * @param length + * number of integers to consider + * @return integer logarithm + */ + public static int maxbits(long[] i, int pos, int length) { + long mask = 0; + for (int k = pos; k < pos + length; ++k) + mask |= i[k]; + return bits(mask); + } - protected static String longToBinaryWithLeading(long l) { - return String.format("%64s", Long.toBinaryString(l)).replace(' ', '0'); - } + /** + * Compute the integer logarithms (ceil(log(x+1)) of a value + * + * @param i + * source value + * @return integer logarithm + */ + public static int bits(long i) { + return 64 - Long.numberOfLeadingZeros(i); + } + + protected static String longToBinaryWithLeading(long l) { + return String.format("%64s", Long.toBinaryString(l)).replace(' ', '0'); + } } diff --git a/src/main/java/me/lemire/longcompression/LongVariableByte.java b/src/main/java/me/lemire/longcompression/LongVariableByte.java index f3d10ee..ad2b0eb 100644 --- a/src/main/java/me/lemire/longcompression/LongVariableByte.java +++ b/src/main/java/me/lemire/longcompression/LongVariableByte.java @@ -217,7 +217,7 @@ public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, s += 8; // Shift to next long if s==64 p += s>>6; - // cycle from 63 to 0 + // Cycle from 63 to 0 s = s & 63; v += ((c & 127) << shift); if ((c & 128) == 128) { @@ -234,7 +234,7 @@ public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, @Override public void uncompress(byte[] in, IntWrapper inpos, int inlength, - long[] out, IntWrapper outpos) { + long[] out, IntWrapper outpos) { int p = inpos.get(); int finalp = inpos.get() + inlength; int tmpoutpos = outpos.get(); @@ -311,7 +311,7 @@ public void headlessUncompress(long[] in, IntWrapper inpos, int inlength, long[] s += 8; // Shift to next long if s == 64 p += s>>6; - // cycle from 63 to 0 + // Cycle from 63 to 0 s = s & 63; v += ((c & 127) << shift); if ((c & 128) == 128) { diff --git a/src/main/java/me/lemire/longcompression/RoaringIntPacking.java b/src/main/java/me/lemire/longcompression/RoaringIntPacking.java index f109ab3..d6b6baa 100644 --- a/src/main/java/me/lemire/longcompression/RoaringIntPacking.java +++ b/src/main/java/me/lemire/longcompression/RoaringIntPacking.java @@ -3,9 +3,6 @@ */ package me.lemire.longcompression; -import java.math.BigInteger; -import java.util.Comparator; - /** * Used to hold the logic packing 2 integers in a long, and separating a long in two integers. It is * useful in {@link Roaring64NavigableMap} as the implementation split the input long in two @@ -46,63 +43,4 @@ public static int low(long id) { public static long pack(int high, int low) { return (((long) high) << 32) | (low & 0xffffffffL); } - - - /** - * - * @param signedLongs true if long put in a {@link Roaring64NavigableMap} should be considered as - * signed long. - * @return the int representing the highest value which can be set as high value in a - * {@link Roaring64NavigableMap} - */ - public static int highestHigh(boolean signedLongs) { - if (signedLongs) { - return Integer.MAX_VALUE; - } else { - return -1; - } - } - - /** - * @return A comparator for unsigned longs: a negative long is a long greater than Long.MAX_VALUE - */ - public static Comparator