Fixed various latent issues w.r.t. deserialization, added some comments, clarified options/tradeoffs (and made controls for playing with them easier to discover).
grmocg · Nov 29, 2012 · 3950a0d · 3950a0d
1 parent 5fe53bb
commit 3950a0d
Show file tree

Hide file tree

Showing 3 changed files with 170 additions and 60 deletions.
diff --git a/example_code/bit_bucket.py b/example_code/bit_bucket.py
@@ -17,6 +17,13 @@ def Clear(self):
     self.idx_byte = 0
     self.idx_boff = 0
 
+  def AdvanceToByteBoundary(self):
+    bits_to_advance = (8 - self.idx_boff) % 8
+    if bits_to_advance:
+      self.idx_boff += bits_to_advance
+      self.idx_boff %= 8
+      self.idx_byte += 1
+
   def StoreBit(self, bit):
     self.StoreBits( ([bit << 7], 1) )
 
@@ -77,14 +84,14 @@ def NumBits(self):
       num_bits -= 8
       num_bits += self.out_boff
     if num_bits < 0:
-      print "WTF"
+      print "What the..."
     return num_bits
 
   def BytesOfStorage(self):
     return (self.NumBits() + 7) / 8
 
   def BitsRemaining(self):
-    return self.NumBits() - (8*self.idx_byte + self.idx_boff)
+    return self.NumBits() - (8*self.idx_byte + self.idx_boff) - 1
 
   def AllConsumed(self):
     return self.NumBits() <= (8*self.idx_byte + self.idx_boff)
@@ -154,9 +161,14 @@ def GetBits(self, num_bits):
           if self.idx_boff >= 8:
             self.idx_byte += 1
             self.idx_boff -= 8
+            if self.idx_boff >= 8:
+              raise StandardError()
       if cur_boff:
         retval.append(cur_byte)
     if (old_idx_boff + num_bits) % 8 != self.idx_boff:
+      print "old_idx_boff(%d) + num_bits(%d) != self.idx_boff(%d) " % (
+          old_idx_boff, num_bits, self.idx_boff)
+      print "retval: ", (retval, num_bits)
       raise StandardError()
     return (retval, num_bits)
 

diff --git a/example_code/headers_codec.py b/example_code/headers_codec.py
@@ -20,54 +20,135 @@
 
 options = {}
 
-# TODO(try var-int encoding for indices)
-# TODO(use a separate huffman encoding for cookies, and possible path)
-# TODO(interpret cookies as binary instead of base-64, does it reduce entropy?)
-# TODO(index renumbering so things which are often used together
-#      have near indices. Possibly renumber whever something is referenced)
-
-
-def UnpackInt(data, params, huff):
-  bitlen = params
-  raw_data = data.GetBits(bitlen)[0]
+# Performance is a non-goal for this code.
+
+# TODO:try var-int encoding for indices, or use huffman-coding on the indices
+# TODO:use a separate huffman encoding for cookies, and possibly for path
+# TODO:interpret cookies as binary instead of base-64, does it reduce entropy?
+# TODO:make index renumbering useful so things which are often used together
+#      have near indices, or remove it as not worth the cost/complexity
+# TODO:use mechanisms other than LRU to perform entry expiry
+# TODO:use canonical huffman codes, like the c++ version
+# TODO:use huffman coding on the operation type. Clones and toggles are by far
+#      the most common operations.
+# TODO:use huffman coding on the operation count. Small counts are far more
+#      common than large counts. Alternatively, simply use a smaller fixed-size.
+# TODO:modify the huffman-coding to always emit a code starting with 1 so that
+#      we can differentiate easily between strings that are huffman encoded or
+#      strings which are not huffman encoded by examining the first bit.
+#      Alternately, define different opcodes for the various variations.
+
+# Note: Huffman coding is used here instead of range-coding or
+# arithmetic-coding because of its relative CPU efficiency and because it is
+# fairly well known (though the canonical huffman code is a bit less well
+# known, it is still better known than most other codings)
+
+
+###### BEGIN IMPORTANT PARAMS ######
+#  THESE PARAMETERS ARE IMPORTANT
+
+# If strings_use_eof is true, then the bitlen is not necessary, and possibly
+#  detrimental, as it caps the maximum length of any particular string.
+string_length_field_bitlen = 0
+
+# If strings_use_eof is false, however, then string_length_field_bitlen
+#  MUST be >0
+strings_use_eof = 1
+
+# If strings_padded_to_byte_boundary is true, then it is potentially faster
+# (in an optimized implementation) to decode/encode, at the expense of some
+# compression efficiency.
+strings_padded_to_byte_boundary = 1
+
+# if strings_use_huffman is false, then strings will not be encoded with
+# huffman encoding
+strings_use_huffman = 1
+
+###### END IMPORTANT PARAMS ######
+
+
+def UnpackInt(input, bitlen, huff):
+  """
+  Reads an int from an input BitBucket and returns it.
+
+  'bitlen' is between 1 and 32 (inclusive), and represents the number of bits
+  to be read and interpreted as the int.
+
+  'huff' is unused.
+  """
+  raw_input = input.GetBits(bitlen)[0]
   rshift = 0
   if bitlen <=8:
-    arg = '%c%c%c%c' % (0,0, 0,raw_data[0])
+    arg = '%c%c%c%c' % (0,0, 0,raw_input[0])
     rshift = 8 - bitlen
   elif bitlen <=16:
-    arg = '%c%c%c%c' % (0,0, raw_data[0], raw_data[1])
+    arg = '%c%c%c%c' % (0,0, raw_input[0], raw_input[1])
     rshift = 16 - bitlen
   elif bitlen <=24:
-    arg = '%c%c%c%c' % (0,raw_data[0], raw_data[1], raw_data[2])
+    arg = '%c%c%c%c' % (0,raw_input[0], raw_input[1], raw_input[2])
     rshift = 24 - bitlen
   else:
-    arg = '%c%c%c%c' % (raw_data[0], raw_data[1], raw_data[2], raw_data[3])
+    arg = '%c%c%c%c' % (raw_input[0], raw_input[1], raw_input[2], raw_input[3])
     rshift = 32 - bitlen
   retval = (struct.unpack('>L', arg)[0] >> rshift)
   return retval
 
-def UnpackStr(data, params, huff):
-  (bitlen_size, use_eof, len_as_bits) = params
+def UnpackStr(input, params, huff):
+  """
+  Reads a string from an input BitBucket and returns it.
+
+  'input' is a BitBucket containing the data to be interpreted as a string.
+
+  'params' is (bitlen_size, use_eof, pad_to_byte_boundary, use_huffman)
+
+  'bitlen_size' indicates the size of the length field. A size of 0 is valid IFF
+  'use_eof' is true.
+
+  'use_eof' indicates that an EOF character will be used (for ascii strings,
+  this will be a null; for huffman-encoded strings, this will be the EOF symbol
+  specific to that huffman encoding).
+
+  If 'pad_to_byte_boundary' is true, then the 'bitlen_size' parameter
+  represents bits of size, else 'bitlen_size' represents bytes.
+
+
+  if 'use_huffman' is false, then the string is not huffman-encoded.
+
+  If 'huff' is None, then the string is not huffman-encoded. If 'huff' is not
+  None, then it must be a Huffman compatible object which is used to do huffman
+  decoding.
+  """
+  (bitlen_size, use_eof, pad_to_byte_boundary, use_huffman) = params
+  if not use_huffman:
+    huff = None
   if not use_eof and not bitlen_size:
     # without either a bitlen size or an EOF, we can't know when the string ends
     # having both is certainly fine, however.
     raise StandardError()
-  bitlen = -1
   if bitlen_size:
-    bitlen = UnpackInt(data, bitlen_size, huff)
-    if not len_as_bits:
-      bitlen *= 8
-  if huff:
-    retval = huff.DecodeFromBB(data, use_eof, bitlen)
-  else:
-    retval = data.GetBits(bitlen)[0]
+    bitlen = UnpackInt(input, bitlen_size, huff)
+    if huff:
+      retval = huff.DecodeFromBB(input, use_eof, bitlen)
+    else:
+      retval = input.GetBits(bitlen)[0]
+  else:  # bitlen_size == 0
+    if huff:
+      retval = huff.DecodeFromBB(input, use_eof, 0)
+    else:
+      retval = []
+      while True:
+        c = input.GetBits8()
+        retval.append(c)
+        if c == 0:
+          break
+  if pad_to_byte_boundary:
+    input.AdvanceToByteBoundary()
   retval = ListToStr(retval)
   return retval
 
 # this assumes the bits are near the LSB, but must be packed to be close to MSB
-def PackInt(data, params, val, huff):
-  bitlen = params
-  if bitlen <= 0 or bitlen > 32 or val  != val & ~(0x1 << bitlen):
+def PackInt(data, bitlen, val, huff):
+  if bitlen <= 0 or bitlen > 32 or val != (val & ~(0x1 << bitlen)):
     print 'bitlen: ', bitlen, ' val: ', val
     raise StandardError()
   if bitlen <= 8:
@@ -78,38 +159,38 @@ def PackInt(data, params, val, huff):
     tmp_val = struct.pack('>L', val << (24 - bitlen))[1:]
   else:
     tmp_val = struct.pack('>L', val << (32 - bitlen))
-
   data.StoreBits( (StrToList(tmp_val), bitlen) )
 
 def PackStr(data, params, val, huff):
-  (bitlen_size, use_eof, len_as_bits) = params
-  # if len_as_bits, then don't need eof.
+  (bitlen_size, use_eof, pad_to_byte_boundary, use_huffman) = params
   # if eof, then don't technically need bitlen at all...
+  if not use_huffman:
+    huff = None
 
   if not use_eof and not bitlen_size:
     # without either a bitlen size or an EOF, we can't know when the string ends
     # having both is certainly fine, however.
     raise StandardError()
+  val_as_list = StrToList(val)
+  len_in_bits = len(val) * 8
   if huff:
-    formatted_val = huff.Encode(StrToList(val), use_eof)
-    if not len_as_bits:
-      formatted_val = (formatted_val[0], len(formatted_val[0])*8)
-  else:
-    formatted_val = (StrToList(val), len(val)*8)
-  if bitlen_size and len_as_bits:
-    PackInt(data, bitlen_size, formatted_val[1], huff)
-  elif bitlen_size:
-    PackInt(data, bitlen_size, formatted_val[1]/8, huff)
-  data.StoreBits(formatted_val)
+    (val_as_list, len_in_bits) = huff.Encode(val_as_list, use_eof)
+    if pad_to_byte_boundary:
+      len_in_bits = len(val_as_list) *8
+  if bitlen_size:
+    PackInt(data, bitlen_size, len_in_bits, huff)
+  data.StoreBits( (val_as_list, len_in_bits) )
 
 
+str_pack_params = (string_length_field_bitlen, strings_use_eof,
+                   strings_padded_to_byte_boundary, strings_use_huffman)
 packing_instructions = {
-  'opcode'      : (8, PackInt, UnpackInt),
-  'index'       : (16, PackInt, UnpackInt),
-  'index_start' : (16, PackInt, UnpackInt),
-  'key_idx'     : (16, PackInt, UnpackInt),
-  'val'         : ((16, True, False), PackStr, UnpackStr),
-  'key'         : ((16, True, False), PackStr, UnpackStr),
+  'opcode'      : (  8,             PackInt, UnpackInt),
+  'index'       : ( 16,             PackInt, UnpackInt),
+  'index_start' : ( 16,             PackInt, UnpackInt),
+  'key_idx'     : ( 16,             PackInt, UnpackInt),
+  'val'         : (str_pack_params, PackStr, UnpackStr),
+  'key'         : (str_pack_params, PackStr, UnpackStr),
 }
 
 def PackOps(data, packing_instructions, ops, huff):
@@ -197,7 +278,6 @@ def PreProcessToggles(self, instructions):
   def OutputOps(self, packing_instructions, huff, data, ops, opcode):
     if not ops:
       return;
-
     ops_idx = 0
     ops_len = len(ops)
     while ops_len > ops_idx:
@@ -210,6 +290,7 @@ def OutputOps(self, packing_instructions, huff, data, ops, opcode):
         self.WriteOpData(data, ops[orig_idx + i], huff)
         ops_idx += 1
 
+
   def WriteOpData(self, data, op, huff):
     for field_name in packing_order:
       if not field_name in op:
@@ -283,15 +364,15 @@ def DeserializeInstructions(self, frame, packing_instructions, huff):
     flags = 0
     #print 'DeserializeInstructions'
     while flags == 0:
-      frame_len = bb.GetBits16()
+      frame_len = bb.GetBits16() * 8
       #print 'frame_len: ', frame_len
       flags = bb.GetBits8()
       #print 'flags: ', flags
       stream_id = bb.GetBits32()
       #print 'stream_id: ', stream_id
       frame_type = bb.GetBits8()
       #print 'frame_type: ', frame_type
-      while frame_len:
+      while frame_len > 16:  # 16 bits minimum for the opcode + count...
         bits_remaining_at_start = bb.BitsRemaining()
         opcode_val = bb.GetBits8()
         #print 'opcode_val: ', opcode_val
@@ -309,13 +390,16 @@ def DeserializeInstructions(self, frame, packing_instructions, huff):
             val = unpack_fn(bb, params, huff)
             #print val
             op[field_name] = val
+            #print "BitsRemaining: %d (%d)" % (bb.BitsRemaining(), bb.BitsRemaining() % 8)
+          #print "Deser %d" % (bb.NumBits() - bb.BitsRemaining())
           #print op
           ops.append(op)
         bits_consumed = (bits_remaining_at_start - bb.BitsRemaining())
-        if not bits_consumed % 8 == 0:
-          print "somehow didn't consume whole bytes..."
-          raise StandardError()
-        frame_len -= bits_consumed / 8
+        #if not bits_consumed % 8 == 0:
+        #  print "somehow didn't consume whole bytes..."
+        #  print "Bits consumed: %d (%d)" % (bits_consumed, bits_consumed % 8)
+        #  raise StandardError()
+        frame_len -= bits_consumed
     #print 'ops: ', ops
     return ops
 

diff --git a/example_code/huffman.py b/example_code/huffman.py
@@ -4,13 +4,16 @@
 import heapq
 from collections import deque
 from bit_bucket import BitBucket
+from common_utils import FormatAsBits
+import string
 
 class Huffman(object):
   def __init__(self, freq_table):
     self.code_tree = None
     self.code_table = []
     self.BuildCodeTree(freq_table)
     self.BuildCodeTable(self.code_tree)
+    print self.FormatCodeTable()
 
   def BuildCodeTree(self, freq_table):
     def MN(x):
@@ -96,9 +99,11 @@ def Encode(self, text, include_eof):
   def DecodeFromBB(self, bb, includes_eof, bits_to_decode):
     output = []
     total_bits = 0
-    if not includes_eof and bits_to_decode < 0:
+    if not includes_eof and bits_to_decode <= 0:
       # That can't work.
       raise StandardError()
+    if bits_to_decode <= 0:
+      bits_to_decode = -1
     while bits_to_decode < 0 or total_bits < bits_to_decode:
       root = self.code_tree
       while root[1] is None:
@@ -150,10 +155,19 @@ def Decode(self, text, includes_eof, bits_to_decode):
     return output
 
   def FormatCodeTable(self):
-    x = sorted([(chr(i), self.code_table[i])
-                for i in xrange(len(self.code_table))],
-               key=lambda x: (x[1][1], x[1][0]))
-    return repr(x)
+    printable = string.digits + string.letters + string.punctuation + ' ' + "\t"
+    x = sorted([(i,FormatAsBits( self.code_table[i]))
+                for i in xrange(len(self.code_table))])
+    retval = []
+    for entry in x:
+      code, description = entry
+      readable_code = ""
+      if code < 256 and chr(code) in printable and chr(code) != '\t':
+        readable_code = "'%c'" % chr(code)
+      while len(readable_code) < 5:
+          readable_code = " " + readable_code
+      retval.append('%s (%3d): %s' % (readable_code, code, description))
+    return '\n'.join(retval)
 
   def __repr__(self):
     output = ['[']