upgrade to lzf compress 0.9
kimchy committed Nov 13, 2011
1 parent 27a7b06 commit 4bbf298
Showing 9 changed files with 917 additions and 387 deletions.
org/elasticsearch/common/compress/lzf/ChunkDecoder.java (new file)
@@ -0,0 +1,228 @@
package org.elasticsearch.common.compress.lzf;

import java.io.IOException;
import java.io.InputStream;

/**
* Decoder that handles decoding of a sequence of encoded LZF chunks,
* combining them into a single contiguous result byte array.
*
* @author Tatu Saloranta (tatu@ning.com)
* @since 0.9
*/
public abstract class ChunkDecoder {
protected final static byte BYTE_NULL = 0;
protected final static int HEADER_BYTES = 5;

public ChunkDecoder() {
}

/*
///////////////////////////////////////////////////////////////////////
// Public API
///////////////////////////////////////////////////////////////////////
*/

/**
* Method for decompressing a block of input data encoded in the LZF
* block structure (compatible with the lzf command-line utility);
* the input may consist of any number of blocks.
* Note that the input MUST consist of a sequence of one or more complete
* chunks; partial chunks cannot be handled.
*/
public final byte[] decode(final byte[] inputBuffer) throws IOException {
byte[] result = new byte[calculateUncompressedSize(inputBuffer, 0, inputBuffer.length)];
decode(inputBuffer, 0, inputBuffer.length, result);
return result;
}

/**
* Method for decompressing a block of input data encoded in the LZF
* block structure (compatible with the lzf command-line utility);
* the input may consist of any number of blocks.
* Note that the input MUST consist of a sequence of one or more complete
* chunks; partial chunks cannot be handled.
*/
public final byte[] decode(final byte[] inputBuffer, int inputPtr, int inputLen) throws IOException {
byte[] result = new byte[calculateUncompressedSize(inputBuffer, inputPtr, inputLen)];
decode(inputBuffer, inputPtr, inputLen, result);
return result;
}

/**
* Method for decompressing a block of input data encoded in the LZF
* block structure (compatible with the lzf command-line utility);
* the input may consist of any number of blocks.
* Note that the input MUST consist of a sequence of one or more complete
* chunks; partial chunks cannot be handled.
*/
public final int decode(final byte[] inputBuffer, final byte[] targetBuffer) throws IOException {
return decode(inputBuffer, 0, inputBuffer.length, targetBuffer);
}

/**
* Method for decompressing a block of input data encoded in the LZF
* block structure (compatible with the lzf command-line utility);
* the input may consist of any number of blocks.
* Note that the input MUST consist of a sequence of one or more complete
* chunks; partial chunks cannot be handled.
*/
public int decode(final byte[] sourceBuffer, int inPtr, int inLength,
final byte[] targetBuffer) throws IOException {
byte[] result = targetBuffer;
int outPtr = 0;
int blockNr = 0;

final int end = inPtr + inLength - 1; // -1 to offset possible end marker

while (inPtr < end) {
// let's do basic sanity checks; no point in skimping with these checks
if (sourceBuffer[inPtr] != LZFChunk.BYTE_Z || sourceBuffer[inPtr + 1] != LZFChunk.BYTE_V) {
throw new IOException("Corrupt input data, block #" + blockNr + " (at offset " + inPtr + "): did not start with 'ZV' signature bytes");
}
inPtr += 2;
int type = sourceBuffer[inPtr++];
int len = uint16(sourceBuffer, inPtr);
inPtr += 2;
if (type == LZFChunk.BLOCK_TYPE_NON_COMPRESSED) { // uncompressed
System.arraycopy(sourceBuffer, inPtr, result, outPtr, len);
outPtr += len;
} else { // compressed
int uncompLen = uint16(sourceBuffer, inPtr);
inPtr += 2;
decodeChunk(sourceBuffer, inPtr, result, outPtr, outPtr + uncompLen);
outPtr += uncompLen;
}
inPtr += len;
++blockNr;
}
return outPtr;
}
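/*
 * Worked example of the chunk header consumed by the loop above (byte values are
 * illustrative; constants are the ones referenced in this class): a compressed
 * chunk whose encoded body is 300 bytes and expands to 1000 bytes starts with
 * the 7 bytes
 *   'Z' 'V' 0x01 0x01 0x2C 0x03 0xE8
 * i.e. the signature, the type byte (LZFChunk.BLOCK_TYPE_COMPRESSED, assumed here
 * to be 1), the 2-byte big-endian encoded length (0x012C = 300), and the 2-byte
 * big-endian uncompressed length (0x03E8 = 1000). A non-compressed chunk has only
 * the first 5 bytes (HEADER_BYTES), followed directly by its payload.
 */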

/**
* Main decode method for stream-based input. Decompressed bytes are placed in the
* outputBuffer; the inputBuffer is used as a scratch area.
*
* @param is An input stream of LZF compressed bytes
* @param inputBuffer A byte array used as a scratch area.
* @param outputBuffer A byte array in which the result is returned
* @return The number of bytes placed in the outputBuffer.
*/
public abstract int decodeChunk(final InputStream is, final byte[] inputBuffer, final byte[] outputBuffer)
throws IOException;

/**
* Main decode method for individual chunks.
*/
public abstract void decodeChunk(byte[] in, int inPos, byte[] out, int outPos, int outEnd)
throws IOException;

/*
///////////////////////////////////////////////////////////////////////
// Public static methods
///////////////////////////////////////////////////////////////////////
*/

/**
* Helper method that calculates the total uncompressed size of a sequence of
* one or more LZF blocks stored in the given byte array.
* It also does basic sanity checking, so this method can be used to
* detect some types of corruption.
*/
public static int calculateUncompressedSize(byte[] data, int ptr, int length) throws IOException {
int uncompressedSize = 0;
int blockNr = 0;
final int end = ptr + length;

while (ptr < end) {
// can use optional end marker
if (ptr == (data.length + 1) && data[ptr] == BYTE_NULL) {
++ptr; // so that we'll be at end
break;
}
// simpler to handle bounds checks by catching exception here...
try {
if (data[ptr] != LZFChunk.BYTE_Z || data[ptr + 1] != LZFChunk.BYTE_V) {
throw new IOException("Corrupt input data, block #" + blockNr + " (at offset " + ptr + "): did not start with 'ZV' signature bytes");
}
int type = (int) data[ptr + 2];
int blockLen = uint16(data, ptr + 3);
if (type == LZFChunk.BLOCK_TYPE_NON_COMPRESSED) { // uncompressed
ptr += 5;
uncompressedSize += blockLen;
} else if (type == LZFChunk.BLOCK_TYPE_COMPRESSED) { // compressed
uncompressedSize += uint16(data, ptr + 5);
ptr += 7;
} else { // unknown... CRC-32 would be 2, but that's not implemented by cli tool
throw new IOException("Corrupt input data, block #" + blockNr + " (at offset " + ptr + "): unrecognized block type " + (type & 0xFF));
}
ptr += blockLen;
} catch (ArrayIndexOutOfBoundsException e) {
throw new IOException("Corrupt input data, block #" + blockNr + " (at offset " + ptr + "): truncated block header");
}
++blockNr;
}
// one more sanity check:
if (ptr != data.length) {
throw new IOException("Corrupt input data: block #" + blockNr + " extends " + (data.length - ptr) + " beyond end of input");
}
return uncompressedSize;
}

/*
///////////////////////////////////////////////////////////////////////
// Internal methods
///////////////////////////////////////////////////////////////////////
*/

protected final static int uint16(byte[] data, int ptr) {
return ((data[ptr] & 0xFF) << 8) + (data[ptr + 1] & 0xFF);
}
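// Worked example for uint16 above (illustrative values): data[ptr] == 0x03 and
// data[ptr + 1] == 0xE8 yield (0x03 << 8) + 0xE8 = 1000, i.e. a big-endian
// unsigned 16-bit read.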

/**
* Helper method to forcibly load the header bytes that must be read before
* a chunk can be handled.
*/
protected final static int readHeader(final InputStream is, final byte[] inputBuffer)
throws IOException {
// Ok: simple case first, where we just get all data we need
int needed = HEADER_BYTES;
int count = is.read(inputBuffer, 0, needed);

if (count == needed) {
return count;
}
if (count <= 0) {
return 0;
}

// if not, a source that trickles data (network etc); must loop
int offset = count;
needed -= count;

do {
count = is.read(inputBuffer, offset, needed);
if (count <= 0) {
break;
}
offset += count;
needed -= count;
} while (needed > 0);
return offset;
}

protected final static void readFully(InputStream is, boolean compressed,
byte[] outputBuffer, int offset, int len) throws IOException {
int left = len;
while (left > 0) {
int count = is.read(outputBuffer, offset, left);
if (count < 0) { // EOF not allowed here
throw new IOException("EOF in " + len + " byte ("
+ (compressed ? "" : "un") + "compressed) block: could only read "
+ (len - left) + " bytes");
}
offset += count;
left -= count;
}
}
}
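To make the decode/calculateUncompressedSize API above concrete, here is a small usage sketch. It is illustrative only: VanillaChunkDecoder is assumed to be the non-Unsafe implementation referred to in the discussion below (the actual class name may differ), and the hand-built chunk simply follows the 5-byte non-compressed header that decode() reads.

package org.elasticsearch.common.compress.lzf;

import java.io.IOException;

/**
 * Usage sketch for ChunkDecoder: builds one non-compressed LZF chunk by hand
 * (the same 5-byte header the decode() loop reads) and round-trips it.
 * VanillaChunkDecoder is an assumed concrete subclass; substitute whichever
 * implementation the codebase provides.
 */
public class ChunkDecoderUsageSketch {

    public static void main(String[] args) throws IOException {
        byte[] payload = "hello, lzf".getBytes("UTF-8");

        // 'Z', 'V', type, 2-byte big-endian chunk length, then the raw payload.
        byte[] chunk = new byte[5 + payload.length];
        chunk[0] = LZFChunk.BYTE_Z;
        chunk[1] = LZFChunk.BYTE_V;
        chunk[2] = (byte) LZFChunk.BLOCK_TYPE_NON_COMPRESSED;
        chunk[3] = (byte) (payload.length >> 8);
        chunk[4] = (byte) payload.length;
        System.arraycopy(payload, 0, chunk, 5, payload.length);

        // Framing can be validated (and the output sized) without decoding anything.
        int expected = ChunkDecoder.calculateUncompressedSize(chunk, 0, chunk.length);

        ChunkDecoder decoder = new VanillaChunkDecoder(); // assumed subclass
        byte[] restored = decoder.decode(chunk);

        System.out.println(expected + " bytes expected, decoded: " + new String(restored, "UTF-8"));
    }
}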
org/elasticsearch/common/compress/lzf/ChunkEncoder.java
@@ -20,7 +20,7 @@
* is only used if it actually reduces chunk size (including overhead
* of additional header bytes)
*
- * @author tatu@ning.com
+ * @author Tatu Saloranta (tatu@ning.com)
*/
public class ChunkEncoder {
// Beyond certain point we won't be able to compress; let's use 16 bytes as cut-off
@@ -38,6 +38,10 @@ public class ChunkEncoder {

private final BufferRecycler _recycler;

+ /**
+ * Hash table contains lookup based on 3-byte sequence; key is hash
+ * of such triplet, value is offset in buffer.
+ */
private int[] _hashTable;

private final int _hashModulo;
@@ -78,7 +82,7 @@ public ChunkEncoder(int totalLength, BufferRecycler recycler) {

/**
* Method to close once encoder is no longer in use. Note: after calling
- * this method, further calls to {@link #_encodeChunk} will fail
+ * this method, further calls to {@link #encodeChunk} will fail
*/
public void close() {
byte[] buf = _encodeBuffer;
@@ -177,26 +181,26 @@ private final int hash(int h) {
private int tryCompress(byte[] in, int inPos, int inEnd, byte[] out, int outPos) {
final int[] hashTable = _hashTable;
++outPos;
- int hash = first(in, 0);
+ int seen = first(in, 0); // past 4 bytes we have seen... (last one is LSB)
int literals = 0;
inEnd -= 4;
final int firstPos = inPos; // so that we won't have back references across block boundary

while (inPos < inEnd) {
byte p2 = in[inPos + 2];
// next
- hash = (hash << 8) + (p2 & 255);
- int off = hash(hash);
+ seen = (seen << 8) + (p2 & 255);
+ int off = hash(seen);
int ref = hashTable[off];
hashTable[off] = inPos;

// First expected common case: no back-ref (for whatever reason)
if (ref >= inPos // can't refer forward (i.e. leftovers)
|| ref < firstPos // or to previous block
- || (off = inPos - ref - 1) >= MAX_OFF
+ || (off = inPos - ref) > MAX_OFF
|| in[ref + 2] != p2 // must match hash
- || in[ref + 1] != (byte) (hash >> 8)
- || in[ref] != (byte) (hash >> 16)) {
+ || in[ref + 1] != (byte) (seen >> 8)
+ || in[ref] != (byte) (seen >> 16)) {
out[outPos++] = in[inPos++];
literals++;
if (literals == LZFChunk.MAX_LITERAL) {
@@ -222,6 +226,7 @@ private int tryCompress(byte[] in, int inPos, int inEnd, byte[] out, int outPos)
len++;
}
len -= 2;
+ --off; // was off by one earlier
if (len < 7) {
out[outPos++] = (byte) ((off >> 8) + (len << 5));
} else {
@@ -231,19 +236,20 @@ private int tryCompress(byte[] in, int inPos, int inEnd, byte[] out, int outPos)
out[outPos++] = (byte) off;
outPos++;
inPos += len;
- hash = first(in, inPos);
- hash = (hash << 8) + (in[inPos + 2] & 255);
- hashTable[hash(hash)] = inPos++;
- hash = (hash << 8) + (in[inPos + 2] & 255); // hash = next(hash, in, inPos);
- hashTable[hash(hash)] = inPos++;
+ seen = first(in, inPos);
+ seen = (seen << 8) + (in[inPos + 2] & 255);
+ hashTable[hash(seen)] = inPos;
+ ++inPos;
+ seen = (seen << 8) + (in[inPos + 2] & 255); // hash = next(hash, in, inPos);
+ hashTable[hash(seen)] = inPos;
+ ++inPos;
}
- inEnd += 4;
// try offlining the tail
- return tryCompressTail(in, inPos, inEnd, out, outPos, literals);
+ return handleTail(in, inPos, inEnd + 4, out, outPos, literals);
}

- private int tryCompressTail(byte[] in, int inPos, int inEnd, byte[] out, int outPos,
- int literals) {
+ private int handleTail(byte[] in, int inPos, int inEnd, byte[] out, int outPos,
+ int literals) {
while (inPos < inEnd) {
out[outPos++] = in[inPos++];
literals++;
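As background for the off/--off change above: tryCompress emits back-references as a descriptor whose stored offset is the match distance minus one and whose stored length is the match length minus two. A hedged sketch of that descriptor layout follows; the two-byte short form mirrors the branch visible in the hunk, while the three-byte long form follows the standard LZF format (that branch is collapsed in the diff, so treat it as an assumption). The class name is hypothetical, for illustration only.

package org.elasticsearch.common.compress.lzf;

// Illustrative sketch of the LZF back-reference descriptor that the
// "--off; // was off by one earlier" fix feeds into.
class LZFMatchDescriptorSketch {

    // Writes one match descriptor into out at outPos and returns the new position.
    static int emitMatch(byte[] out, int outPos, int distance, int matchLen) {
        int off = distance - 1;   // stored offset is distance minus one
        int len = matchLen - 2;   // stored length is match length minus two
        if (len < 7) {            // short form: two bytes, as in the diff above
            out[outPos++] = (byte) ((off >> 8) + (len << 5));
        } else {                  // long form: three bytes, extra byte carries remaining length
            out[outPos++] = (byte) ((off >> 8) + (7 << 5));
            out[outPos++] = (byte) (len - 7);
        }
        out[outPos++] = (byte) off;
        return outPos;
    }

    public static void main(String[] args) {
        byte[] out = new byte[3];
        // A 5-byte match starting 10 bytes back encodes as 0x60 0x09.
        int written = emitMatch(out, 0, 10, 5);
        System.out.printf("%d bytes: %02X %02X%n", written, out[0], out[1]);
    }
}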

5 comments on commit 4bbf298

@imotov (Contributor) commented on 4bbf298, Nov 14, 2011

After this commit, the test org.elasticsearch.test.integration.recovery.RecoveryWhileUnderLoadTests started to reliably crash the JVM on two different machines. https://gist.github.com/e735cc237efe5149c7f7

@kimchy (Member, Author) commented on 4bbf298, Nov 14, 2011

I think it might be related to this one: ning/compress#13. I will default it to the vanilla decoder and not the one that uses Unsafe. Funny, I did not see any failures (Lion, 1.6.0_29).
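A hedged sketch of what that could look like: the class names VanillaChunkDecoder and UnsafeChunkDecoder follow the upstream 0.9 naming mentioned in this thread, but both they and the system-property switch are assumptions here, not the actual Elasticsearch change.

package org.elasticsearch.common.compress.lzf;

// Sketch only: prefer the pure-Java decoder unless Unsafe-based decoding is explicitly enabled.
final class ChunkDecoderSelectionSketch {

    static ChunkDecoder newChunkDecoder() {
        // Hypothetical opt-in flag; the real setting name, if any, may differ.
        boolean useUnsafe = Boolean.parseBoolean(
                System.getProperty("compress.lzf.decoder.unsafe", "false"));
        if (useUnsafe) {
            try {
                return new UnsafeChunkDecoder();   // assumed Unsafe-based implementation
            } catch (Throwable t) {
                // fall back below if sun.misc.Unsafe is not usable on this JVM
            }
        }
        return new VanillaChunkDecoder();          // assumed pure-Java implementation
    }
}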

@imotov (Contributor) commented on 4bbf298, Nov 14, 2011

It was failing with 1.6.0_26 on both Lion and Snow Leopard. After upgrading the Lion machine to 1.6.0_29, it stopped crashing.

@cowtowncoder commented

Yes, chances are issue #13 is the culprit. I am trying to reproduce it, but so far have been unable to on my Snow Leopard machine.
Any help is appreciated here; apologies for the crashes. I thought I had tested this well, but chances are there are some platform-dependent parts here.

@cowtowncoder commented

Just released 0.9.1 with what I hope is the fix -- at the very least it resolves one issue with Unsafe.
