HuffmanTable: rewrite Huffman decoder related code

This is pretty much a rewrite of all Huffman decoding related code. There have been several copies of this functionality with minor differences (and different bugs). They are all replaced by one new implementation in HuffmanTable.h. Code documentation of the inner workings has been significantly increased. Great emphasis has been put on optimizing the hot paths of the decoding, down to the CPU instruction level. They account for nearly the complete decoding time of a standard LJPEG based raw. There is an optimized, unified lookup mechanism, replacing the old 'numbit' and 'bigTable' functionality. To give an example: the number of executed instructions and executed branches went down by about 30% - the runtime by 10%-20% for some sample CR2. This cleanup is not quite complete. The LJPEGDecoder and friends classes using this code have only been changed minimally to support the new code. The rest will follow in a separate commit.
darktable-org · Jan 17, 2017 · b213eaf · b213eaf
1 parent 64edbea
commit b213eaf
Show file tree

Hide file tree

Showing 10 changed files with 379 additions and 626 deletions.
diff --git a/RawSpeed/HasselbladDecompressor.cpp b/RawSpeed/HasselbladDecompressor.cpp
@@ -68,7 +68,7 @@ void HasselbladDecompressor::parseSOS() {
     uint32 td = b >> 4;
     if (td > 3)
       ThrowRDE("LJpegDecompressor::parseSOS: Invalid Huffman table selection");
-    if (!huff[td].initialized)
+    if (!huff[td])
       ThrowRDE("LJpegDecompressor::parseSOS: Invalid Huffman table selection, not defined.");
 
     if (count > 3)
@@ -107,9 +107,7 @@ void HasselbladDecompressor::parseSOS() {
 // Returns len bits as a signed value.
 // Highest bit is a sign bit
 inline int HasselbladDecompressor::getBits(int len) {
-  int diff = ph1_bits->getBits(len);
-  if ((diff & (1 << (len - 1))) == 0)
-    diff -= (1 << len) - 1;
+  int diff = HuffmanTable::signExtended(ph1_bits->getBits(len), len);
   if (diff == 65535)
     return -32768;
   return diff;
@@ -135,46 +133,7 @@ void HasselbladDecompressor::decodeScanHasselblad() {
 }
 
 int HasselbladDecompressor::HuffGetLength() {
-  int rv = 0;
-  int l, temp;
-  int code, val;
-
-  HuffmanTable *dctbl1 = &huff[0];
-  /*
-  * If the huffman code is less than 8 bits, we can use the fast
-  * table lookup to get its value.  It's more than 8 bits about
-  * 3-4% of the time.
-  */
-  ph1_bits->fill();
-
-  code = ph1_bits->peekBitsNoFill(8);
-  val = dctbl1->numbits[code];
-  l = val & 15;
-  if (l) {
-    ph1_bits->skipBitsNoFill(l);
-    return val >> 4;
-  }
-  ph1_bits->skipBits(8);
-  l = 8;
-
-  while (code > dctbl1->maxcode[l]) {
-    temp = ph1_bits->getBitsNoFill(1);
-    code = (code << 1) | temp;
-    l++;
-  }
-
-  /*
-  * With garbage input we may reach the sentinel value l = 17.
-  */
-
-  if (l > 16) {
-    ThrowRDE("Hasselblad, Corrupt JPEG data: bad Huffman code:%u\n", l);
-  } else {
-    rv = dctbl1->huffval[dctbl1->valptr[l] +
-                         ((int)(code - dctbl1->mincode[l]))];
-  }
-  return rv;
+  return huff[0]->decodeLength(*ph1_bits);
 }
 
-
 } // namespace RawSpeed
diff --git a/RawSpeed/HuffmanTable.h b/RawSpeed/HuffmanTable.h
@@ -0,0 +1,271 @@
+/*
+    RawSpeed - RAW file decoder.
+
+    Copyright (C) 2017 Axel Waggershauser
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#pragma once
+
+#include "Buffer.h"
+
+/*
+* The following code is inspired by the IJG JPEG library.
+*
+* Copyright (C) 1991, 1992, Thomas G. Lane.
+* Part of the Independent JPEG Group's software.
+* See the file Copyright for more details.
+*
+* Copyright (c) 1993 Brian C. Smith, The Regents of the University
+* of California
+* All rights reserved.
+*
+* Copyright (c) 1994 Kongji Huang and Brian C. Smith.
+* Cornell University
+* All rights reserved.
+*
+* Permission to use, copy, modify, and distribute this software and its
+* documentation for any purpose, without fee, and without written agreement is
+* hereby granted, provided that the above copyright notice and the following
+* two paragraphs appear in all copies of this software.
+*
+* IN NO EVENT SHALL CORNELL UNIVERSITY BE LIABLE TO ANY PARTY FOR
+* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
+* OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF CORNELL
+* UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+* CORNELL UNIVERSITY SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+* AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
+* ON AN "AS IS" BASIS, AND CORNELL UNIVERSITY HAS NO OBLIGATION TO
+* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+*/
+
+namespace RawSpeed {
+
+// Decoder for the Huffman coded difference values of (lossless) JPEG streams.
+//
+// Usage: fill the table from a DHT segment with setNCodesPerLength() and
+// setCodeValues(), call setup() once to build the decoding tables, then call
+// decodeNext() / decodeLength() for every symbol in the bit stream.
+class HuffmanTable
+{
+  // private fields calculated from nCodesPerLength and codeValues by setup()
+  // they are index '1' based, so we can directly lookup the value
+  // for code length l without decrementing
+  vector<ushort16> maxCodeOL;    // index is length of code
+  // codeOffsetOL[l] is 'first code of length l' minus 'its index in
+  // codeValues'. It stays at 0xffff for lengths that have no codes at all.
+  // A real offset is always smaller because the very first code is 0.
+  vector<ushort16> codeOffsetOL; // index is length of code
+
+  // The code can be compiled with two different decode lookup table layouts.
+  // The idea is that different CPU architectures may perform better with
+  // one or the other, depending on the relative performance of their arithmetic
+  // core vs their memory access. For an Intel Core i7, the big table is better.
+#if 1
+  // lookup table containing 3 fields: payload:16|flag:8|len:8
+  // The payload may be the fully decoded diff or the length of the diff.
+  // The len field contains the number of bits, this lookup consumed.
+  // A lookup value of 0 means the code was too big to fit into the table.
+  // The optimal LookupDepth is also likely to depend on the CPU architecture.
+  static constexpr unsigned PayloadShift = 16;
+  static constexpr unsigned FlagMask = 0x100;
+  static constexpr unsigned LenMask = 0xff;
+  static constexpr unsigned LookupDepth = 13;
+  vector<int32> decodeLookup;
+#else
+  // lookup table containing 2 fields: payload:4|len:4
+  // the payload is the length of the diff, len is the length of the code
+  static constexpr unsigned LookupDepth = 15;
+  static constexpr unsigned PayloadShift = 4;
+  static constexpr unsigned FlagMask = 0;
+  static constexpr unsigned LenMask = 0x0f;
+  vector<uchar8> decodeLookup;
+#endif
+
+  // When set, decode() skips 16 extra bits after a symbol with diff length 16.
+  bool fixDNGBug16 = false;
+
+  // Sum of the largest index of nCodesPerLength and of codeValues.
+  size_t maxCodePlusDiffLength() const {
+    return nCodesPerLength.size()-1 + codeValues.size()-1;
+  }
+
+public:
+
+  // These two fields directly represent the contents of a JPEG DHT field
+  // 1. The number of codes there are per bit length, this is index 1 based.
+  // (there are always 0 codes of length 0)
+  vector<int> nCodesPerLength; // index is length of code
+  // 2. This is the actual huffman encoded data, i.e. the 'alphabet'. Each value
+  // is the number of bits following the code that encode the difference to the
+  // last pixel. Valid values are in the range 0..16.
+  // signExtended() is used to decode the difference bits to a signed int.
+  vector<uchar8> codeValues;   // index is just sequential number
+
+  // Two tables are equal if they were defined by the same DHT contents.
+  // The derived decoding tables are not compared.
+  bool operator==(const HuffmanTable& other) const {
+    return nCodesPerLength == other.nCodesPerLength
+        && codeValues      == other.codeValues;
+  }
+
+  // Sets the number of codes per code length from the 16 count bytes of a DHT
+  // segment. Returns the total number of codes, i.e. the number of code
+  // values that belong to this table.
+  uint32 setNCodesPerLength(const Buffer& data) {
+    assert(data.getSize() == 16);
+    nCodesPerLength.resize(17);
+    copy(data.begin(), data.end(), &nCodesPerLength[1]);
+    return accumulate(data.begin(), data.end(), 0);
+  }
+
+  // Sets the code values (the 'alphabet') of the table.
+  void setCodeValues(const Buffer& data) {
+    // spec says max 16 but Hasselblad ignores that -> allow 17
+    assert(data.getSize() <= 17);
+    codeValues.assign(data.begin(), data.end());
+  }
+
+  // Builds all decoding tables from nCodesPerLength and codeValues.
+  // Has to be called before the first decode.
+  //   fullDecode:  if true, the lookup table stores the completely decoded
+  //                difference wherever code and difference bits fit into
+  //                LookupDepth bits (use together with decodeNext()).
+  //                If false, only diff lengths are stored (decodeLength()).
+  //   fixDNGBug16: see the field of the same name.
+  // Throws (ThrowRDE) if the table definition is empty or inconsistent.
+  void setup(bool fullDecode, bool fixDNGBug16) {
+    this->fixDNGBug16 = fixDNGBug16;
+
+    // store the code lengths in bits, valid values are 0..16
+    vector<uchar8> code_len; // index is just sequential number
+    // store the codes themselves (bit patterns found inside the stream)
+    vector<ushort16> codes;  // index is just sequential number
+
+    // trim empty entries from the codes per length table on the right
+    // (the emptiness check keeps an all-zero table from running off the front)
+    while (!nCodesPerLength.empty() && !nCodesPerLength.back())
+      nCodesPerLength.pop_back();
+    if (nCodesPerLength.size() < 2)
+      ThrowRDE("Corrupt JPEG data: Huffman table does not contain any codes");
+    if (nCodesPerLength.size() > 17)
+      ThrowRDE("Corrupt JPEG data: Huffman codes longer than 16 bits");
+    const int maxCodeLength = nCodesPerLength.size()-1;
+
+    // Figure C.1: make table of Huffman code length for each symbol
+    // Figure C.2: generate the codes themselves
+    uint32 code = 0;
+    for (int l = 1; l <= maxCodeLength; ++l) {
+      assert(nCodesPerLength[l] < (1<<l));
+      for (int i = 0; i < nCodesPerLength[l]; ++i) {
+        // A code of length l has to fit into l bits. Checked at runtime (not
+        // only by assert) because a bigger code would later index beyond the
+        // end of decodeLookup.
+        if (code >= (1u << l))
+          ThrowRDE("Corrupt JPEG data: too many Huffman codes of length %d", l);
+        code_len.push_back(l);
+        codes.push_back(code++);
+      }
+      code <<= 1;
+    }
+
+    // every code needs a value, and every value is a number of bits 0..16
+    if (codeValues.size() < codes.size())
+      ThrowRDE("Corrupt JPEG data: Huffman table defines %u codes but only %u values",
+               (uint32)codes.size(), (uint32)codeValues.size());
+    for (size_t i = 0; i < codes.size(); i++) {
+      if (codeValues[i] > 16)
+        ThrowRDE("Corrupt JPEG data: Huffman code value %u is bigger than 16",
+                 (uint32)codeValues[i]);
+    }
+
+    // Figure F.15: generate decoding tables
+    // assign() instead of resize(): no stale entries if setup() runs twice
+    codeOffsetOL.assign(maxCodeLength+1, 0xffff);
+    maxCodeOL.assign(maxCodeLength+1, 0);
+    int code_index = 0;
+    for (int l = 1; l <= maxCodeLength; l++) {
+      if (nCodesPerLength[l]) {
+        codeOffsetOL[l] = codes[code_index] - code_index;
+        code_index += nCodesPerLength[l];
+        maxCodeOL[l] = codes[code_index - 1];
+      }
+    }
+
+    // Generate lookup table for fast decoding lookup.
+    // See definition of decodeLookup above
+    decodeLookup.assign(1 << LookupDepth, 0);
+    for (size_t i = 0; i < codes.size(); i++) {
+      const uint32 code_l = code_len[i];
+      // codes are sorted by length: all remaining ones are too long as well
+      if (code_l > LookupDepth)
+        break;
+
+      // [ll, ul] is the range of all LookupDepth bit wide values that start
+      // with this code
+      const uint32 fillBits = LookupDepth - code_l;
+      const uint32 ll = (uint32)codes[i] << fillBits;
+      const uint32 ul = ll | ((1u << fillBits) - 1);
+      const uint32 diff_l = codeValues[i];
+      for (uint32 c = ll; c <= ul; c++) {
+        if (!FlagMask || !fullDecode || diff_l + code_l > LookupDepth) {
+          // lookup bit depth is too small to fit both the encoded length
+          // and the final difference value.
+          // -> store only the length and do a normal sign extension later
+          decodeLookup[c] = diff_l << PayloadShift | code_l;
+        } else {
+          // diff_l + code_l <= lookupDepth
+          // The table bit depth is large enough to store both.
+          decodeLookup[c] = (code_l + diff_l) | FlagMask;
+
+          if (diff_l) {
+            uint32 diff = (c >> (LookupDepth - code_l - diff_l)) & ((1 << diff_l) - 1);
+            decodeLookup[c] |= (uint32)signExtended(diff, diff_l) << PayloadShift;
+          }
+        }
+      }
+    }
+  }
+
+  // Converts the 'len' difference bits in 'diff' to the signed difference:
+  // if the highest of the len bits is set, the value is returned unchanged,
+  // otherwise (2^len - 1) is subtracted from it.
+  // A length of 0 means 'no bits, no difference' and returns 0 (this also
+  // avoids the undefined shift by len - 1).
+  inline static int signExtended(uint32 diff, uint32 len) {
+    if (len == 0)
+      return 0;
+    if ((diff & (1u << (len - 1))) == 0)
+      diff -= (1u << len) - 1;
+    return diff;
+  }
+
+  // Decodes the next code and returns only the number of difference bits
+  // that follow it in the stream. The difference bits are not consumed.
+  template<typename BIT_STREAM> inline int decodeLength(BIT_STREAM& bs) const {
+    return decode<BIT_STREAM, false>(bs);
+  }
+
+  // Decodes the next code including its difference bits and returns the
+  // signed difference value.
+  template<typename BIT_STREAM> inline int decodeNext(BIT_STREAM& bs) const {
+    return decode<BIT_STREAM, true>(bs);
+  }
+
+  // The bool template parameter is to enable two versions:
+  // one returning only the length of the diff bits (see Hasselblad),
+  // one to return the fully decoded diff.
+  // All ifs depending on this bool will be optimized out by the compiler
+  // Throws (ThrowRDE) if the bits in the stream do not form a valid code.
+  template<typename BIT_STREAM, bool FULL_DECODE> inline int decode(BIT_STREAM& bs) const {
+    // 32 is the absolute maximum combined length of code + diff
+    // for processors supporting bmi2 instructions, using maxCodePlusDiffLength()
+    // might be beneficial
+    bs.fill(32);
+    uint32 code = bs.peekBitsNoFill(LookupDepth);
+
+    int val = decodeLookup[code];
+    int len = val & LenMask;
+    // if the code is not in the lookup table, len will be 0
+    bs.skipBitsNoFill(len);
+    if (FULL_DECODE && (val & FlagMask)) {
+      // if the flag bit is set, the payload is the already sign extended difference
+      return val >> PayloadShift;
+    }
+
+    int diff_l;
+    if (len) {
+      // if the flag bit is not set but len != 0, the payload is the number
+      // of difference bits
+      diff_l = val >> PayloadShift;
+    } else {
+      // slow path: the code is longer than LookupDepth (or invalid),
+      // extend it bit by bit until it matches a code of its length
+      uint32 code_l = LookupDepth;
+      bs.skipBitsNoFill(code_l);
+      // lengths without any code (offset 0xffff) can never match
+      while (code_l < maxCodeOL.size() &&
+             (codeOffsetOL[code_l] == 0xffff || code > maxCodeOL[code_l])) {
+        uint32 temp = bs.getBitsNoFill(1);
+        code = (code << 1) | temp;
+        code_l++;
+      }
+
+      // running past the longest code means there is no match at all;
+      // checked before touching the tables to never index beyond their end
+      if (code_l >= maxCodeOL.size())
+        ThrowRDE("Corrupt JPEG data: bad Huffman code: %u (len: %u)", code, code_l);
+
+      const uint32 value_index = code - codeOffsetOL[code_l];
+      if (value_index >= codeValues.size())
+        ThrowRDE("Corrupt JPEG data: bad Huffman code: %u (len: %u)", code, code_l);
+
+      diff_l = codeValues[value_index];
+    }
+
+    if (!FULL_DECODE)
+      return diff_l;
+
+    // a diff length of 16 has no difference bits, it directly encodes -32768
+    // (handled identically for codes found by lookup and by the slow path)
+    if (diff_l == 16) {
+      if (fixDNGBug16)
+        bs.skipBits(16);
+      return -32768;
+    }
+
+    return diff_l ? signExtended(bs.getBitsNoFill(diff_l), diff_l) : 0;
+  }
+};
+
+} // namespace RawSpeed