upgrade to lzf compress 0.9
kimchy committed Nov 13, 2011
1 parent 27a7b06 commit 4bbf298
Showing 9 changed files with 917 additions and 387 deletions.
org/elasticsearch/common/compress/lzf/ChunkDecoder.java (new file)
@@ -0,0 +1,228 @@
package org.elasticsearch.common.compress.lzf;

import java.io.IOException;
import java.io.InputStream;

/**
* Decoder that handles decoding of a sequence of encoded LZF chunks,
* combining them into a single contiguous result byte array.
*
* @author Tatu Saloranta (tatu@ning.com)
* @since 0.9
*/
public abstract class ChunkDecoder {
protected final static byte BYTE_NULL = 0;
protected final static int HEADER_BYTES = 5;

public ChunkDecoder() {
}

/*
///////////////////////////////////////////////////////////////////////
// Public API
///////////////////////////////////////////////////////////////////////
*/

/**
* Method for decompressing a block of input data encoded in the LZF
* block structure (compatible with the lzf command-line utility);
* the input may consist of any number of blocks.
* Note that the input MUST consist of a sequence of one or more complete
* chunks; partial chunks cannot be handled.
*/
public final byte[] decode(final byte[] inputBuffer) throws IOException {
byte[] result = new byte[calculateUncompressedSize(inputBuffer, 0, inputBuffer.length)];
decode(inputBuffer, 0, inputBuffer.length, result);
return result;
}

/**
* Method for decompressing a block of input data encoded in the LZF
* block structure (compatible with the lzf command-line utility);
* the input may consist of any number of blocks.
* Note that the input MUST consist of a sequence of one or more complete
* chunks; partial chunks cannot be handled.
*/
public final byte[] decode(final byte[] inputBuffer, int inputPtr, int inputLen) throws IOException {
byte[] result = new byte[calculateUncompressedSize(inputBuffer, inputPtr, inputLen)];
decode(inputBuffer, inputPtr, inputLen, result);
return result;
}

/**
* Method for decompressing a block of input data encoded in the LZF
* block structure (compatible with the lzf command-line utility);
* the input may consist of any number of blocks.
* Note that the input MUST consist of a sequence of one or more complete
* chunks; partial chunks cannot be handled.
*/
public final int decode(final byte[] inputBuffer, final byte[] targetBuffer) throws IOException {
return decode(inputBuffer, 0, inputBuffer.length, targetBuffer);
}

/**
* Method for decompressing a block of input data encoded in the LZF
* block structure (compatible with the lzf command-line utility);
* the input may consist of any number of blocks.
* Note that the input MUST consist of a sequence of one or more complete
* chunks; partial chunks cannot be handled.
*/
public int decode(final byte[] sourceBuffer, int inPtr, int inLength,
final byte[] targetBuffer) throws IOException {
byte[] result = targetBuffer;
int outPtr = 0;
int blockNr = 0;

final int end = inPtr + inLength - 1; // -1 to offset possible end marker

while (inPtr < end) {
// let's do basic sanity checks; no point in skimping with these checks
if (sourceBuffer[inPtr] != LZFChunk.BYTE_Z || sourceBuffer[inPtr + 1] != LZFChunk.BYTE_V) {
throw new IOException("Corrupt input data, block #" + blockNr + " (at offset " + inPtr + "): did not start with 'ZV' signature bytes");
}
inPtr += 2;
int type = sourceBuffer[inPtr++];
int len = uint16(sourceBuffer, inPtr);
inPtr += 2;
if (type == LZFChunk.BLOCK_TYPE_NON_COMPRESSED) { // uncompressed
System.arraycopy(sourceBuffer, inPtr, result, outPtr, len);
outPtr += len;
} else { // compressed
int uncompLen = uint16(sourceBuffer, inPtr);
inPtr += 2;
decodeChunk(sourceBuffer, inPtr, result, outPtr, outPtr + uncompLen);
outPtr += uncompLen;
}
inPtr += len;
++blockNr;
}
return outPtr;
}
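/*
 * Worked example of the chunk header consumed by the loop above (byte values are
 * illustrative; constants are the ones referenced in this class): a compressed
 * chunk whose encoded body is 300 bytes and expands to 1000 bytes starts with
 * the 7 bytes
 *   'Z' 'V' 0x01 0x01 0x2C 0x03 0xE8
 * i.e. the signature, the type byte (LZFChunk.BLOCK_TYPE_COMPRESSED, assumed here
 * to be 1), the 2-byte big-endian encoded length (0x012C = 300), and the 2-byte
 * big-endian uncompressed length (0x03E8 = 1000). A non-compressed chunk has only
 * the first 5 bytes (HEADER_BYTES), followed directly by its payload.
 */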

/**
* Main decode method for stream-based input. Decompressed bytes are placed in the
* outputBuffer; the inputBuffer is used as a scratch area.
*
* @param is An input stream of LZF compressed bytes
* @param inputBuffer A byte array used as a scratch area.
* @param outputBuffer A byte array in which the result is returned
* @return The number of bytes placed in the outputBuffer.
*/
public abstract int decodeChunk(final InputStream is, final byte[] inputBuffer, final byte[] outputBuffer)
throws IOException;

/**
* Main decode method for individual chunks.
*/
public abstract void decodeChunk(byte[] in, int inPos, byte[] out, int outPos, int outEnd)
throws IOException;

/*
///////////////////////////////////////////////////////////////////////
// Public static methods
///////////////////////////////////////////////////////////////////////
*/

/**
* Helper method that calculates the total uncompressed size of a sequence of
* one or more LZF blocks stored in the given byte array.
* It also does basic sanity checking, so this method can be used to
* detect some types of corruption.
*/
public static int calculateUncompressedSize(byte[] data, int ptr, int length) throws IOException {
int uncompressedSize = 0;
int blockNr = 0;
final int end = ptr + length;

while (ptr < end) {
// can use optional end marker
if (ptr == (data.length + 1) && data[ptr] == BYTE_NULL) {
++ptr; // so that we'll be at end
break;
}
// simpler to handle bounds checks by catching exception here...
try {
if (data[ptr] != LZFChunk.BYTE_Z || data[ptr + 1] != LZFChunk.BYTE_V) {
throw new IOException("Corrupt input data, block #" + blockNr + " (at offset " + ptr + "): did not start with 'ZV' signature bytes");
}
int type = (int) data[ptr + 2];
int blockLen = uint16(data, ptr + 3);
if (type == LZFChunk.BLOCK_TYPE_NON_COMPRESSED) { // uncompressed
ptr += 5;
uncompressedSize += blockLen;
} else if (type == LZFChunk.BLOCK_TYPE_COMPRESSED) { // compressed
uncompressedSize += uint16(data, ptr + 5);
ptr += 7;
} else { // unknown... CRC-32 would be 2, but that's not implemented by cli tool
throw new IOException("Corrupt input data, block #" + blockNr + " (at offset " + ptr + "): unrecognized block type " + (type & 0xFF));
}
ptr += blockLen;
} catch (ArrayIndexOutOfBoundsException e) {
throw new IOException("Corrupt input data, block #" + blockNr + " (at offset " + ptr + "): truncated block header");
}
++blockNr;
}
// one more sanity check:
if (ptr != data.length) {
throw new IOException("Corrupt input data: block #" + blockNr + " extends " + (data.length - ptr) + " beyond end of input");
}
return uncompressedSize;
}

/*
///////////////////////////////////////////////////////////////////////
// Internal methods
///////////////////////////////////////////////////////////////////////
*/

protected final static int uint16(byte[] data, int ptr) {
return ((data[ptr] & 0xFF) << 8) + (data[ptr + 1] & 0xFF);
}
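// Worked example for uint16 above (illustrative values): data[ptr] == 0x03 and
// data[ptr + 1] == 0xE8 yield (0x03 << 8) + 0xE8 = 1000, i.e. a big-endian
// unsigned 16-bit read.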

/**
* Helper method to forcibly load the header bytes that must be read before
* a chunk can be handled.
*/
protected final static int readHeader(final InputStream is, final byte[] inputBuffer)
throws IOException {
// Ok: simple case first, where we just get all data we need
int needed = HEADER_BYTES;
int count = is.read(inputBuffer, 0, needed);

if (count == needed) {
return count;
}
if (count <= 0) {
return 0;
}

// if not, a source that trickles data (network etc); must loop
int offset = count;
needed -= count;

do {
count = is.read(inputBuffer, offset, needed);
if (count <= 0) {
break;
}
offset += count;
needed -= count;
} while (needed > 0);
return offset;
}

protected final static void readFully(InputStream is, boolean compressed,
byte[] outputBuffer, int offset, int len) throws IOException {
int left = len;
while (left > 0) {
int count = is.read(outputBuffer, offset, left);
if (count < 0) { // EOF not allowed here
throw new IOException("EOF in " + len + " byte ("
+ (compressed ? "" : "un") + "compressed) block: could only read "
+ (len - left) + " bytes");
}
offset += count;
left -= count;
}
}
}
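To make the decode/calculateUncompressedSize API above concrete, here is a small usage sketch. It is illustrative only: VanillaChunkDecoder is assumed to be the non-Unsafe implementation referred to in the discussion below (the actual class name may differ), and the hand-built chunk simply follows the 5-byte non-compressed header that decode() reads.

package org.elasticsearch.common.compress.lzf;

import java.io.IOException;

/**
 * Usage sketch for ChunkDecoder: builds one non-compressed LZF chunk by hand
 * (the same 5-byte header the decode() loop reads) and round-trips it.
 * VanillaChunkDecoder is an assumed concrete subclass; substitute whichever
 * implementation the codebase provides.
 */
public class ChunkDecoderUsageSketch {

    public static void main(String[] args) throws IOException {
        byte[] payload = "hello, lzf".getBytes("UTF-8");

        // 'Z', 'V', type, 2-byte big-endian chunk length, then the raw payload.
        byte[] chunk = new byte[5 + payload.length];
        chunk[0] = LZFChunk.BYTE_Z;
        chunk[1] = LZFChunk.BYTE_V;
        chunk[2] = (byte) LZFChunk.BLOCK_TYPE_NON_COMPRESSED;
        chunk[3] = (byte) (payload.length >> 8);
        chunk[4] = (byte) payload.length;
        System.arraycopy(payload, 0, chunk, 5, payload.length);

        // Framing can be validated (and the output sized) without decoding anything.
        int expected = ChunkDecoder.calculateUncompressedSize(chunk, 0, chunk.length);

        ChunkDecoder decoder = new VanillaChunkDecoder(); // assumed subclass
        byte[] restored = decoder.decode(chunk);

        System.out.println(expected + " bytes expected, decoded: " + new String(restored, "UTF-8"));
    }
}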
org/elasticsearch/common/compress/lzf/ChunkEncoder.java
@@ -20,7 +20,7 @@
* is only used if it actually reduces chunk size (including overhead
* of additional header bytes)
*
- * @author tatu@ning.com
+ * @author Tatu Saloranta (tatu@ning.com)
*/
public class ChunkEncoder {
// Beyond certain point we won't be able to compress; let's use 16 bytes as cut-off
@@ -38,6 +38,10 @@ public class ChunkEncoder {

private final BufferRecycler _recycler;

+ /**
+ * Hash table contains lookup based on 3-byte sequence; key is hash
+ * of such triplet, value is offset in buffer.
+ */
private int[] _hashTable;

private final int _hashModulo;
@@ -78,7 +82,7 @@ public ChunkEncoder(int totalLength, BufferRecycler recycler) {

/**
* Method to close once encoder is no longer in use. Note: after calling
- * this method, further calls to {@link #_encodeChunk} will fail
+ * this method, further calls to {@link #encodeChunk} will fail
*/
public void close() {
byte[] buf = _encodeBuffer;
@@ -177,26 +181,26 @@ private final int hash(int h) {
private int tryCompress(byte[] in, int inPos, int inEnd, byte[] out, int outPos) {
final int[] hashTable = _hashTable;
++outPos;
- int hash = first(in, 0);
+ int seen = first(in, 0); // past 4 bytes we have seen... (last one is LSB)
int literals = 0;
inEnd -= 4;
final int firstPos = inPos; // so that we won't have back references across block boundary

while (inPos < inEnd) {
byte p2 = in[inPos + 2];
// next
- hash = (hash << 8) + (p2 & 255);
- int off = hash(hash);
+ seen = (seen << 8) + (p2 & 255);
+ int off = hash(seen);
int ref = hashTable[off];
hashTable[off] = inPos;

// First expected common case: no back-ref (for whatever reason)
if (ref >= inPos // can't refer forward (i.e. leftovers)
|| ref < firstPos // or to previous block
- || (off = inPos - ref - 1) >= MAX_OFF
+ || (off = inPos - ref) > MAX_OFF
|| in[ref + 2] != p2 // must match hash
- || in[ref + 1] != (byte) (hash >> 8)
- || in[ref] != (byte) (hash >> 16)) {
+ || in[ref + 1] != (byte) (seen >> 8)
+ || in[ref] != (byte) (seen >> 16)) {
out[outPos++] = in[inPos++];
literals++;
if (literals == LZFChunk.MAX_LITERAL) {
@@ -222,6 +226,7 @@ private int tryCompress(byte[] in, int inPos, int inEnd, byte[] out, int outPos)
len++;
}
len -= 2;
+ --off; // was off by one earlier
if (len < 7) {
out[outPos++] = (byte) ((off >> 8) + (len << 5));
} else {
@@ -231,19 +236,20 @@ private int tryCompress(byte[] in, int inPos, int inEnd, byte[] out, int outPos)
out[outPos++] = (byte) off;
outPos++;
inPos += len;
- hash = first(in, inPos);
- hash = (hash << 8) + (in[inPos + 2] & 255);
- hashTable[hash(hash)] = inPos++;
- hash = (hash << 8) + (in[inPos + 2] & 255); // hash = next(hash, in, inPos);
- hashTable[hash(hash)] = inPos++;
+ seen = first(in, inPos);
+ seen = (seen << 8) + (in[inPos + 2] & 255);
+ hashTable[hash(seen)] = inPos;
+ ++inPos;
+ seen = (seen << 8) + (in[inPos + 2] & 255); // hash = next(hash, in, inPos);
+ hashTable[hash(seen)] = inPos;
+ ++inPos;
}
- inEnd += 4;
// try offlining the tail
- return tryCompressTail(in, inPos, inEnd, out, outPos, literals);
+ return handleTail(in, inPos, inEnd + 4, out, outPos, literals);
}

- private int tryCompressTail(byte[] in, int inPos, int inEnd, byte[] out, int outPos,
- int literals) {
+ private int handleTail(byte[] in, int inPos, int inEnd, byte[] out, int outPos,
+ int literals) {
while (inPos < inEnd) {
out[outPos++] = in[inPos++];
literals++;
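As background for the off/--off change above: tryCompress emits back-references as a descriptor whose stored offset is the match distance minus one and whose stored length is the match length minus two. A hedged sketch of that descriptor layout follows; the two-byte short form mirrors the branch visible in the hunk, while the three-byte long form follows the standard LZF format (that branch is collapsed in the diff, so treat it as an assumption). The class name is hypothetical, for illustration only.

package org.elasticsearch.common.compress.lzf;

// Illustrative sketch of the LZF back-reference descriptor that the
// "--off; // was off by one earlier" fix feeds into.
class LZFMatchDescriptorSketch {

    // Writes one match descriptor into out at outPos and returns the new position.
    static int emitMatch(byte[] out, int outPos, int distance, int matchLen) {
        int off = distance - 1;   // stored offset is distance minus one
        int len = matchLen - 2;   // stored length is match length minus two
        if (len < 7) {            // short form: two bytes, as in the diff above
            out[outPos++] = (byte) ((off >> 8) + (len << 5));
        } else {                  // long form: three bytes, extra byte carries remaining length
            out[outPos++] = (byte) ((off >> 8) + (7 << 5));
            out[outPos++] = (byte) (len - 7);
        }
        out[outPos++] = (byte) off;
        return outPos;
    }

    public static void main(String[] args) {
        byte[] out = new byte[3];
        // A 5-byte match starting 10 bytes back encodes as 0x60 0x09.
        int written = emitMatch(out, 0, 10, 5);
        System.out.printf("%d bytes: %02X %02X%n", written, out[0], out[1]);
    }
}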

5 comments on commit 4bbf298

@imotov (Contributor) commented on 4bbf298, Nov 14, 2011

After this commit, the test org.elasticsearch.test.integration.recovery.RecoveryWhileUnderLoadTests started to reliably crash the JVM on two different machines. https://gist.github.com/e735cc237efe5149c7f7

@kimchy (Member, Author) commented on 4bbf298, Nov 14, 2011

I think it might be related to this one: ning/compress#13. I will default it to the vanilla decoder and not the one that uses Unsafe. Funny, I did not see any failures (Lion, 1.6.0_29).
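A hedged sketch of what that could look like: the class names VanillaChunkDecoder and UnsafeChunkDecoder follow the upstream 0.9 naming mentioned in this thread, but both they and the system-property switch are assumptions here, not the actual Elasticsearch change.

package org.elasticsearch.common.compress.lzf;

// Sketch only: prefer the pure-Java decoder unless Unsafe-based decoding is explicitly enabled.
final class ChunkDecoderSelectionSketch {

    static ChunkDecoder newChunkDecoder() {
        // Hypothetical opt-in flag; the real setting name, if any, may differ.
        boolean useUnsafe = Boolean.parseBoolean(
                System.getProperty("compress.lzf.decoder.unsafe", "false"));
        if (useUnsafe) {
            try {
                return new UnsafeChunkDecoder();   // assumed Unsafe-based implementation
            } catch (Throwable t) {
                // fall back below if sun.misc.Unsafe is not usable on this JVM
            }
        }
        return new VanillaChunkDecoder();          // assumed pure-Java implementation
    }
}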

@imotov (Contributor) commented on 4bbf298, Nov 14, 2011

It was failing with 1.6.0_26 on both Lion and Snow Leopard. After upgrading the Lion machine to 1.6.0_29, it stopped crashing.

@cowtowncoder commented

Yes, chances are issue #13 is the culprit. I am trying to reproduce it, but so far have been unable to on my Snow Leopard machine.
Any help is appreciated here; apologies for the crashes. I thought I had tested this well, but chances are there are some platform-dependent parts here.

@cowtowncoder commented

Just released 0.9.1 with what I hope is the fix -- at the very least it resolves one issue with Unsafe.
