# fat-crocodile/comression

more comments and more user-friendly interface

fat-crocodile committed May 22, 2014
1 parent cd362bc commit 4576fc56c83b39243778eed6a78674ac8614f924
Showing with 91 additions and 66 deletions.
1. +43 −31 huffman/bounded_huffman.py
2. +34 −29 huffman/coder.py
3. +12 −4 huffman/huffman.py
4. +2 −2 huffman/test_huffman.py
 @@ -4,6 +4,49 @@ "A Fast Algorithm for Optimal Length-Limited Huffman Codes" Journal of the Association for Computing Machinery, Vol. 37, No. 3, July 1990""" +def make_code_symbols(weights, limit): + """Input: + - list of pairs (symbol, weight); symbols with weight 0 allowed + - code lenght limit + Output: + - list of pairs (symbol, code lenght) in the same order + """ + res = make_code([w for _,w in weights], limit) + return [(s, l) for (s,_),l in zip(weights, res)] + +def make_code(weights, limit): + """Input: + - symbols weights in alphabetical order (symbols with weight 0 allowed) + - code lenght limit + Output: + - symbols code lenghts in alphabetical order + """ + + # sort by weight, exclude zero-weighted symbols and save original symbol position + positioned_weights = sorted((w, n) for n,w in enumerate(weights) if w > 0) + + if len(positioned_weights) > 2**limit: + raise Exception('there are no such code') + + coins = [] + + for level in range(limit, 0, -1): + # generate current level coins + new_coins = [(w, {i:level}) for w,i in positioned_weights] + # coins, merged from previous level coins + prev_coins = [_merge_coins(coins[2*i], coins[2*i+1]) for i in range(len(coins) / 2)] + # merge lists + coins = list(_imerge(prev_coins, new_coins, lambda x,y: x[0] < y[0])) + + # got results + res = [0] * len(weights) + + for i in range(len(positioned_weights) * 2 - 2): + for k,v in coins[i][1].items(): + if res[k] < v: res[k] = v + + return res + def _merge_coins(c1, c2): """Merge two coins in one meta-coin. Each coin in pair (weight, {base coin id --> height in tree})""" w = c1[0] + c2[0] @@ -45,34 +88,3 @@ def _imerge(iter1, iter2, less_then = None): yield i2 i2 = None - -def make_code(weights, limit): - """Input: - - symbols weights in alphabetical order (symbols with weight 0 allowed) - - code lenght limit - Output: - - symbols code lenghts in alphabetical order - """ - positioned_weights = sorted((w, n) for n,(_,w) in enumerate(weights) if w > 0) - - if len(positioned_weights) > 2**limit: - raise Exception('there are no such code') - - coins = [] - - for level in range(limit, 0, -1): - # current level coins - new_coins = [(w, {i:level}) for w,i in positioned_weights] - # coins, merged from previous level coins - prev_coins = [_merge_coins(coins[2*i], coins[2*i+1]) for i in range(len(coins) / 2)] - # merge lists - coins = list(_imerge(prev_coins, new_coins, lambda x,y: x[0] < y[0])) - - res = [(s,0) for s,_ in weights] - - for i in range(len(positioned_weights) * 2 - 2): - for k,v in coins[i][1].items(): - if res[k][1] < v: res[k] = (res[k][0], v) - - return res -
 @@ -4,39 +4,13 @@ _HuffmanRecord = namedtuple("_HuffmanRecord", "length start_code end_code symbols") -def _tables_from_lenghts(lens): - """Make canonical huffman code tables from list of code leghts. - Input: list of pairs (symbol, code length), ordered by alphabet - Output: list of tuples (length, start_code, end_code, [symbols in alphabet order]) - Tuples are sorted by length""" - - # make dict {len --> [list of symbols in alphabet order]} - by_len = defaultdict(list) - for a, l in lens: - if l > 0: - by_len[l].append(a) - - # lengths that exists in code - actual_lens = by_len.keys() - actual_lens.sort() - - tables = [] - code = 0 - prev_len = 0 - - # fill table - for l in actual_lens: - code *= 2**(l-prev_len) - tables.append(_HuffmanRecord(l, code, code + len(by_len[l]), by_len[l])) - code += len(by_len[l]) - prev_len = l - - return tables - class Decoder(object): """Decode symbol from bitsream""" def __init__(self, lens): + """Input: + - list or pairs (symbol, code lenght) + - OR list or code lenghts; in this case symbols are just integers""" if isinstance(lens[0], int): lens = [(n, l) for n,l in enumerate(lens)] @@ -71,6 +45,9 @@ class Encoder(object): """Put symbol into bitstream""" def __init__(self, lens): + """Input: + - list or pairs (symbol, code lenght) + - OR list or code lenghts; in this case symbols are just integers""" if isinstance(lens[0], int): lens = [(n, l) for n,l in enumerate(lens)] @@ -86,4 +63,32 @@ def put(self, bs, c): v, n = self.code[c] bs.put_be(v, n) +def _tables_from_lenghts(lens): + """Make canonical huffman code tables from list of code leghts. + Input: list of pairs (symbol, code length), ordered by alphabet + Output: list of tuples (length, start_code, end_code, [symbols in alphabet order]) + Tuples are sorted by length""" + + # make dict {len --> [list of symbols in alphabet order]} + by_len = defaultdict(list) + for a, l in lens: + if l > 0: + by_len[l].append(a) + + # lengths that exists in code + actual_lens = by_len.keys() + actual_lens.sort() + + tables = [] + code = 0 + prev_len = 0 + + # fill table + for l in actual_lens: + code *= 2**(l-prev_len) + tables.append(_HuffmanRecord(l, code, code + len(by_len[l]), by_len[l])) + code += len(by_len[l]) + prev_len = l + + return tables
 @@ -1,14 +1,22 @@ """Implement classical Huffman alghorithm""" -def make_code(weights): +def make_code_symbols(weights): """Input: - list of pairs (symbol, weight), simbols with weight 0 are allowed Output: - list of pairs (symbol, code lenght) in the same order""" + res = make_code([w for _,w in weights]) + return [(s, l) for (s,_),l in zip(weights, res)] + +def make_code(weights): + """Input: + - list of symbols weights; simbols with weight 0 are allowed + Output: + - list of symbols code lenghts in the same order""" # each items is: # (weight, [(symbol1, len1), (symbol2, len2), ... ]) - codes = [(w, [(i, 0)]) for i,(_,w) in enumerate(weights) if w > 0] + codes = [(w, [(i, 0)]) for i,w in enumerate(weights) if w > 0] codes.sort(key=lambda x: x[0], reverse=True) while len(codes) > 1: @@ -26,9 +34,9 @@ def make_code(weights): codes.insert(i, s) # now all pairs (symbol, code_len) are contained in codes[0][1] - res = [(s,0) for s,_ in weights] + res = [0] * len(weights) for i,l in codes[0][1]: - res[i] = (res[i][0], l) + res[i] = l return res
 @@ -1,7 +1,7 @@ import sys from coder import Encoder, Decoder -from huffman import make_code -from bounded_huffman import make_code as make_ll_code +from huffman import make_code_symbols as make_code +from bounded_huffman import make_code_symbols as make_ll_code # ll for lenght-limited class DummyInputStream(object): def __init__(self, data):