Skip to content

Commit

Permalink
Fixed various latent issues w.r.t deserialization, added some comment…
Browse files Browse the repository at this point in the history
…s, clarified options/tradeoffs (and made controls for playing with them more easy to discover).
  • Loading branch information
fenix-f38 committed Nov 29, 2012
1 parent 5fe53bb commit 3950a0d
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 60 deletions.
16 changes: 14 additions & 2 deletions example_code/bit_bucket.py
Expand Up @@ -17,6 +17,13 @@ def Clear(self):
self.idx_byte = 0
self.idx_boff = 0

def AdvanceToByteBoundary(self):
    """Move the read cursor forward to the start of the next byte.

    The cursor is the pair (idx_byte, idx_boff). If the bit offset is
    already a multiple of 8 nothing changes; otherwise the remaining bits of
    the current byte are skipped, leaving idx_boff at 0 and idx_byte one
    higher.
    """
    already_aligned = self.idx_boff % 8 == 0
    if already_aligned:
        return
    # Skipping the rest of the current byte always lands on bit offset 0.
    self.idx_byte += 1
    self.idx_boff = 0

def StoreBit(self, bit):
    """Store a single bit by delegating to StoreBits.

    StoreBits is handed a (byte_list, bit_count) pair, so the bit is shifted
    into the most-significant position of a one-byte list and the count is 1.
    """
    msb_aligned_byte = bit << 7
    single_bit = ([msb_aligned_byte], 1)
    self.StoreBits(single_bit)

Expand Down Expand Up @@ -77,14 +84,14 @@ def NumBits(self):
num_bits -= 8
num_bits += self.out_boff
if num_bits < 0:
print "WTF"
print "What the..."
return num_bits

def BytesOfStorage(self):
    """Return the number of whole bytes needed to hold NumBits() bits.

    The bit count is rounded up to the next multiple of 8. Floor division
    is used explicitly so the result stays an integer even when true
    division is in effect (Python 3, or `from __future__ import division`).
    """
    return (self.NumBits() + 7) // 8

def BitsRemaining(self):
return self.NumBits() - (8*self.idx_byte + self.idx_boff)
return self.NumBits() - (8*self.idx_byte + self.idx_boff) - 1

def AllConsumed(self):
    """Return True once the read cursor has reached or passed the stored bits.

    The cursor position in bits is 8*idx_byte + idx_boff; it is compared
    against the total reported by NumBits().
    """
    total_bits = self.NumBits()
    cursor_in_bits = 8 * self.idx_byte + self.idx_boff
    return cursor_in_bits >= total_bits
Expand Down Expand Up @@ -154,9 +161,14 @@ def GetBits(self, num_bits):
if self.idx_boff >= 8:
self.idx_byte += 1
self.idx_boff -= 8
if self.idx_boff >= 8:
raise StandardError()
if cur_boff:
retval.append(cur_byte)
if (old_idx_boff + num_bits) % 8 != self.idx_boff:
print "old_idx_boff(%d) + num_bits(%d) != self.idx_boff(%d) " % (
old_idx_boff, num_bits, self.idx_boff)
print "retval: ", (retval, num_bits)
raise StandardError()
return (retval, num_bits)

Expand Down
190 changes: 137 additions & 53 deletions example_code/headers_codec.py
Expand Up @@ -20,54 +20,135 @@

options = {}

# TODO(try var-int encoding for indices)
# TODO(use a separate huffman encoding for cookies, and possible path)
# TODO(interpret cookies as binary instead of base-64, does it reduce entropy?)
# TODO(index renumbering so things which are often used together
# have near indices. Possibly renumber whenever something is referenced)


def UnpackInt(data, params, huff):
bitlen = params
raw_data = data.GetBits(bitlen)[0]
# Performance is a non-goal for this code.

# TODO:try var-int encoding for indices, or use huffman-coding on the indices
# TODO:use a separate huffman encoding for cookies, and possibly for path
# TODO:interpret cookies as binary instead of base-64, does it reduce entropy?
# TODO:make index renumbering useful so things which are often used together
# have near indices, or remove it as not worth the cost/complexity
# TODO:use mechanisms other than LRU to perform entry expiry
# TODO:use canonical huffman codes, like the c++ version
# TODO:use huffman coding on the operation type. Clones and toggles are by far
# the most common operations.
# TODO:use huffman coding on the operation count. Small counts are far more
# common than large counts. Alternatively, simply use a smaller fixed-size.
# TODO:modify the huffman-coding to always emit a code starting with 1 so that
# we can differentiate easily between strings that are huffman encoded or
# strings which are not huffman encoded by examining the first bit.
# Alternately, define different opcodes for the various variations.

# Note: Huffman coding is used here instead of range-coding or
# arithmetic-coding because of its relative CPU efficiency and because it is
# fairly well known (though the canonical huffman code is a bit less well
# known, it is still better known than most other codings)


###### BEGIN IMPORTANT PARAMS ######
# THESE PARAMETERS ARE IMPORTANT

# If strings_use_eof is true, then the bitlen is not necessary, and possibly
# detrimental, as it caps the maximum length of any particular string.
string_length_field_bitlen = 0

# If strings_use_eof is false, however, then string_length_field_bitlen
# MUST be >0
strings_use_eof = 1

# If strings_padded_to_byte_boundary is true, then it is potentially faster
# (in an optimized implementation) to decode/encode, at the expense of some
# compression efficiency.
strings_padded_to_byte_boundary = 1

# if strings_use_huffman is false, then strings will not be encoded with
# huffman encoding
strings_use_huffman = 1

###### END IMPORTANT PARAMS ######


def UnpackInt(input, bitlen, huff):
    """
    Reads an int from an input BitBucket and returns it.

    'input' must provide GetBits(n); the first element of its return value is
    indexed here as a sequence of byte values, with the requested bits
    left-aligned (starting at the most-significant bit of the first byte).
    The parameter name shadows a builtin but is kept so existing callers
    are unaffected.
    'bitlen' is between 1 and 32 (inclusive), and represents the number of bits
    to be read and interpreted as the int.
    'huff' is unused; it is accepted so the signature matches the other
    unpack functions.

    Returns the value as a non-negative integer. Raises IndexError if fewer
    bytes were returned by GetBits than 'bitlen' requires.
    """
    byte_values = input.GetBits(bitlen)[0]
    # Number of bytes that carry the requested bits (rounded up).
    num_bytes = (bitlen + 7) // 8
    value = 0
    # Fold the bytes big-endian; explicit indexing keeps a short read loud.
    for i in range(num_bytes):
        value = (value << 8) | byte_values[i]
    # The bits are left-aligned, so drop the unused low bits of the last byte.
    return value >> (8 * num_bytes - bitlen)

def UnpackStr(data, params, huff):
(bitlen_size, use_eof, len_as_bits) = params
def UnpackStr(input, params, huff):
"""
Reads a string from an input BitBucket and returns it.
'input' is a BitBucket containing the data to be interpreted as a string.
'params' is (bitlen_size, use_eof, pad_to_byte_boundary, use_huffman)
'bitlen_size' indicates the size of the length field. A size of 0 is valid IFF
'use_eof' is true.
'use_eof' indicates that an EOF character will be used (for ascii strings,
this will be a null. For huffman-encoded strings, this will be the EOF symbol
specific to that huffman encoding).
If 'pad_to_byte_boundary' is true, then the 'bitlen_size' parameter
represents bits of size, else 'bitlen_size' represents bytes.
if 'use_huffman' is false, then the string is not huffman-encoded.
If 'huff' is None, then the string is not huffman-encoded. If 'huff' is not
None, then it must be a Huffman compatible object which is used to do huffman
decoding.
"""
(bitlen_size, use_eof, pad_to_byte_boundary, use_huffman) = params
if not use_huffman:
huff = None
if not use_eof and not bitlen_size:
# Without either a bitlen size or an EOF, we can't know when the string ends.
# Having both is certainly fine, however.
raise StandardError()
bitlen = -1
if bitlen_size:
bitlen = UnpackInt(data, bitlen_size, huff)
if not len_as_bits:
bitlen *= 8
if huff:
retval = huff.DecodeFromBB(data, use_eof, bitlen)
else:
retval = data.GetBits(bitlen)[0]
bitlen = UnpackInt(input, bitlen_size, huff)
if huff:
retval = huff.DecodeFromBB(input, use_eof, bitlen)
else:
retval = input.GetBits(bitlen)[0]
else: # bitlen_size == 0
if huff:
retval = huff.DecodeFromBB(input, use_eof, 0)
else:
retval = []
while True:
c = input.GetBits8()
retval.append(c)
if c == 0:
break
if pad_to_byte_boundary:
input.AdvanceToByteBoundary()
retval = ListToStr(retval)
return retval

# This assumes the value's bits are right-aligned (near the LSB); they are
# shifted so that the packed output is left-aligned (near the MSB).
def PackInt(data, params, val, huff):
bitlen = params
if bitlen <= 0 or bitlen > 32 or val != val & ~(0x1 << bitlen):
def PackInt(data, bitlen, val, huff):
if bitlen <= 0 or bitlen > 32 or val != (val & ~(0x1 << bitlen)):
print 'bitlen: ', bitlen, ' val: ', val
raise StandardError()
if bitlen <= 8:
Expand All @@ -78,38 +159,38 @@ def PackInt(data, params, val, huff):
tmp_val = struct.pack('>L', val << (24 - bitlen))[1:]
else:
tmp_val = struct.pack('>L', val << (32 - bitlen))

data.StoreBits( (StrToList(tmp_val), bitlen) )

def PackStr(data, params, val, huff):
(bitlen_size, use_eof, len_as_bits) = params
# if len_as_bits, then don't need eof.
(bitlen_size, use_eof, pad_to_byte_boundary, use_huffman) = params
# if eof, then don't technically need bitlen at all...
if not use_huffman:
huff = None

if not use_eof and not bitlen_size:
# Without either a bitlen size or an EOF, we can't know when the string ends.
# Having both is certainly fine, however.
raise StandardError()
val_as_list = StrToList(val)
len_in_bits = len(val) * 8
if huff:
formatted_val = huff.Encode(StrToList(val), use_eof)
if not len_as_bits:
formatted_val = (formatted_val[0], len(formatted_val[0])*8)
else:
formatted_val = (StrToList(val), len(val)*8)
if bitlen_size and len_as_bits:
PackInt(data, bitlen_size, formatted_val[1], huff)
elif bitlen_size:
PackInt(data, bitlen_size, formatted_val[1]/8, huff)
data.StoreBits(formatted_val)
(val_as_list, len_in_bits) = huff.Encode(val_as_list, use_eof)
if pad_to_byte_boundary:
len_in_bits = len(val_as_list) *8
if bitlen_size:
PackInt(data, bitlen_size, len_in_bits, huff)
data.StoreBits( (val_as_list, len_in_bits) )


str_pack_params = (string_length_field_bitlen, strings_use_eof,
strings_padded_to_byte_boundary, strings_use_huffman)
packing_instructions = {
'opcode' : (8, PackInt, UnpackInt),
'index' : (16, PackInt, UnpackInt),
'index_start' : (16, PackInt, UnpackInt),
'key_idx' : (16, PackInt, UnpackInt),
'val' : ((16, True, False), PackStr, UnpackStr),
'key' : ((16, True, False), PackStr, UnpackStr),
'opcode' : ( 8, PackInt, UnpackInt),
'index' : ( 16, PackInt, UnpackInt),
'index_start' : ( 16, PackInt, UnpackInt),
'key_idx' : ( 16, PackInt, UnpackInt),
'val' : (str_pack_params, PackStr, UnpackStr),
'key' : (str_pack_params, PackStr, UnpackStr),
}

def PackOps(data, packing_instructions, ops, huff):
Expand Down Expand Up @@ -197,7 +278,6 @@ def PreProcessToggles(self, instructions):
def OutputOps(self, packing_instructions, huff, data, ops, opcode):
if not ops:
return;

ops_idx = 0
ops_len = len(ops)
while ops_len > ops_idx:
Expand All @@ -210,6 +290,7 @@ def OutputOps(self, packing_instructions, huff, data, ops, opcode):
self.WriteOpData(data, ops[orig_idx + i], huff)
ops_idx += 1


def WriteOpData(self, data, op, huff):
for field_name in packing_order:
if not field_name in op:
Expand Down Expand Up @@ -283,15 +364,15 @@ def DeserializeInstructions(self, frame, packing_instructions, huff):
flags = 0
#print 'DeserializeInstructions'
while flags == 0:
frame_len = bb.GetBits16()
frame_len = bb.GetBits16() * 8
#print 'frame_len: ', frame_len
flags = bb.GetBits8()
#print 'flags: ', flags
stream_id = bb.GetBits32()
#print 'stream_id: ', stream_id
frame_type = bb.GetBits8()
#print 'frame_type: ', frame_type
while frame_len:
while frame_len > 16: # 16 bits minimum for the opcode + count...
bits_remaining_at_start = bb.BitsRemaining()
opcode_val = bb.GetBits8()
#print 'opcode_val: ', opcode_val
Expand All @@ -309,13 +390,16 @@ def DeserializeInstructions(self, frame, packing_instructions, huff):
val = unpack_fn(bb, params, huff)
#print val
op[field_name] = val
#print "BitsRemaining: %d (%d)" % (bb.BitsRemaining(), bb.BitsRemaining() % 8)
#print "Deser %d" % (bb.NumBits() - bb.BitsRemaining())
#print op
ops.append(op)
bits_consumed = (bits_remaining_at_start - bb.BitsRemaining())
if not bits_consumed % 8 == 0:
print "somehow didn't consume whole bytes..."
raise StandardError()
frame_len -= bits_consumed / 8
#if not bits_consumed % 8 == 0:
# print "somehow didn't consume whole bytes..."
# print "Bits consumed: %d (%d)" % (bits_consumed, bits_consumed % 8)
# raise StandardError()
frame_len -= bits_consumed
#print 'ops: ', ops
return ops

Expand Down
24 changes: 19 additions & 5 deletions example_code/huffman.py
Expand Up @@ -4,13 +4,16 @@
import heapq
from collections import deque
from bit_bucket import BitBucket
from common_utils import FormatAsBits
import string

class Huffman(object):
def __init__(self, freq_table):
self.code_tree = None
self.code_table = []
self.BuildCodeTree(freq_table)
self.BuildCodeTable(self.code_tree)
print self.FormatCodeTable()

def BuildCodeTree(self, freq_table):
def MN(x):
Expand Down Expand Up @@ -96,9 +99,11 @@ def Encode(self, text, include_eof):
def DecodeFromBB(self, bb, includes_eof, bits_to_decode):
output = []
total_bits = 0
if not includes_eof and bits_to_decode < 0:
if not includes_eof and bits_to_decode <= 0:
# That can't work.
raise StandardError()
if bits_to_decode <= 0:
bits_to_decode = -1
while bits_to_decode < 0 or total_bits < bits_to_decode:
root = self.code_tree
while root[1] is None:
Expand Down Expand Up @@ -150,10 +155,19 @@ def Decode(self, text, includes_eof, bits_to_decode):
return output

def FormatCodeTable(self):
x = sorted([(chr(i), self.code_table[i])
for i in xrange(len(self.code_table))],
key=lambda x: (x[1][1], x[1][0]))
return repr(x)
printable = string.digits + string.letters + string.punctuation + ' ' + "\t"
x = sorted([(i,FormatAsBits( self.code_table[i]))
for i in xrange(len(self.code_table))])
retval = []
for entry in x:
code, description = entry
readable_code = ""
if code < 256 and chr(code) in printable and chr(code) != '\t':
readable_code = "'%c'" % chr(code)
while len(readable_code) < 5:
readable_code = " " + readable_code
retval.append('%s (%3d): %s' % (readable_code, code, description))
return '\n'.join(retval)

def __repr__(self):
output = ['[']
Expand Down

0 comments on commit 3950a0d

Please sign in to comment.