# Huffman codes


In [1]:
# Toy symbol -> probability table used to demonstrate the algorithm
# (the seven probabilities add up to 1).
stats = {
    'a': 0.01,
    'b': 0.04,
    'c': 0.05,
    'd': 0.11,
    'e': 0.19,
    'f': 0.20,
    'g': 0.40,
}

class Node(object):
    """Binary-tree node carrying a label and a frequency.

    Nodes are ordered by frequency so they can be stored in a heap. When two
    nodes have the same frequency, the one with the longer label compares as
    smaller.
    """

    def __init__(self, val = None, freq = 0.0):
        """Create a childless node.

        Args:
            val: label of the node (a string in this notebook), or None.
            freq: weight of the node.
        """
        self.left = None   # left child, or None when absent
        self.right = None  # right child, or None when absent
        self.val = val
        self.freq = freq

    @staticmethod
    def _label_length(val):
        """Return len(val), treating a missing (None) label as length 0."""
        return 0 if val is None else len(val)

    def __lt__(self, other):
        """Order by frequency; on a tie the longer label sorts first."""
        if self.freq == other.freq:
            # A None label used to crash here with len(None); treat it as empty.
            return self._label_length(self.val) > self._label_length(other.val)
        return self.freq < other.freq

    def __str__(self):
        """Render as '(label,freq)'; works even when the label is None."""
        # format() stringifies the label, so an unlabeled Node no longer
        # raises TypeError as '(' + None would.
        return '({},{})'.format(self.val, self.freq)

    def __repr__(self):
        return self.__str__()

In [2]:
def huffman(stats):
    """Build a Huffman tree from a symbol -> frequency mapping.

    Args:
        stats: dict mapping each symbol (a string) to its frequency.

    Returns:
        The root Node of the tree, or None when `stats` is empty. Leaves carry
        a single symbol; every internal node carries the concatenation of its
        children's labels and the sum of their frequencies.
    """
    import heapq

    # Nothing to encode: previously this fell through to heappop() on an
    # empty heap and raised IndexError.
    if not stats:
        return None

    pq = []
    for symbol, freq in stats.items():
        heapq.heappush(pq, Node(symbol, freq))

    # Repeatedly merge the two smallest nodes until one tree remains.
    while len(pq) > 1:
        x = heapq.heappop(pq)
        y = heapq.heappop(pq)
        z = Node(x.val + y.val, x.freq + y.freq)
        z.left, z.right = x, y
        heapq.heappush(pq, z)

    return heapq.heappop(pq)

In [3]:
# Module-level code table (symbol -> bit string) that encode() fills in.
codec = dict()

def encode(root, label='', table=None):
    """Walk a Huffman tree and record the bit string of every leaf.

    Going to a left child appends '0' to the label, going to a right child
    appends '1'.

    Args:
        root: tree node with `left`, `right` and `val` attributes, or None
            (in which case nothing is recorded).
        label: bit string accumulated on the path from the tree root.
        table: dict that receives `symbol -> bit string`. Defaults to the
            module-level `codec` dict, which is looked up at call time.
    """
    if root is None:
        return
    if table is None:
        table = codec
    if root.left is None and root.right is None:
        # A tree made of one single leaf would otherwise get the empty code,
        # which can never be matched when decoding; give it one bit instead.
        table[root.val] = label or '0'
    else:
        encode(root.left, label + '0', table)
        encode(root.right, label + '1', table)

# Build the tree for the current `stats` table, then fill the global `codec`.
huffman_tree = huffman(stats)
encode(root=huffman_tree)
print(codec)

{'g': '0', 'a': '10000', 'b': '10001', 'c': '1001', 'd': '101', 'e': '110', 'f': '111'}


In [4]:
def decode(codec, line):
    """Turn a string of code bits back into the symbols it encodes.

    Args:
        codec: dict mapping each symbol to its bit string.
        line: the bit string to decode, consumed one character at a time.

    Returns:
        The concatenation of the decoded symbols. Bits left over at the end
        that do not complete a code are dropped.
    """
    # Invert the table so a bit string can be looked up directly.
    symbol_for = {bits: symbol for symbol, bits in codec.items()}

    decoded = []
    pending = ''
    for bit in line:
        pending += bit
        if pending in symbol_for:
            decoded.append(symbol_for[pending])
            pending = ''
    return ''.join(decoded)

In [5]:
# Sample bit string to run back through the current code table.
line = '100001000110001100001001100001111101000110000101'
print(decode(line=line, codec=codec))

abbacafebad


In [6]:
# Read the book as raw bytes; the context manager closes the file afterwards.
with open('book.txt', mode='rb') as f:
    read_data = f.read()

In [7]:
# Estimate symbol frequencies from the first 200 items of the data read above.
read_data = read_data[:200]
stats = {}
for c in read_data:
    ch = chr(c)  # each item is converted to a one-character string
    stats[ch] = stats.get(ch, 0) + 1

# Turn the raw counts into relative frequencies.
for k in stats:
    stats[k] = stats[k] / len(read_data)

print(stats)

{'T': 0.03, 'H': 0.02, 'E': 0.03, ' ': 0.185, 'A': 0.025, 'D': 0.01, 'V': 0.02, 'N': 0.015, 'U': 0.01, 'R': 0.03, 'S': 0.025, 'O': 0.025, 'F': 0.01, 'L': 0.02, 'C': 0.015, 'K': 0.005, 'M': 0.01, 'b': 0.01, 'y': 0.025, 'I': 0.045, 'Y': 0.005, '\r': 0.045, '\n': 0.045, '.': 0.025, 'c': 0.01, 'a': 0.035, 'n': 0.015, 'd': 0.025, 'l': 0.015, 'i': 0.02, 'B': 0.01, 'o': 0.02, 'h': 0.025, 'e': 0.07, 'm': 0.01, '-': 0.005, 'g': 0.005, 'u': 0.005, 's': 0.015, 'f': 0.005, 't': 0.015, 'r': 0.005, 'v': 0.005}


In [8]:
# Start from an empty table, because encode() writes into the global `codec`,
# then build the tree and the codes for the current `stats`.
codec = dict()
huffman_tree = huffman(stats)
encode(root=huffman_tree)
print(codec)

{' ': '00', 'A': '01000', 'd': '01001', 'O': '01010', '.': '01011', 'S': '01100', 'y': '01101', 't': '011100', 'N': '011101', 'n': '011110', 's': '011111', 'C': '100000', 'l': '100001', 'E': '10001', 'T': '10010', 'R': '10011', 'e': '1010', 'a': '10110', 'v': '10111000', '-': '10111001', 'g': '10111010', 'u': '10111011', 'K': '10111100', 'Y': '10111101', 'f': '10111110', 'r': '10111111', 'c': '1100000', 'D': '1100001', 'M': '1100010', 'm': '1100011', 'U': '1100100', 'b': '1100101', 'F': '1100110', 'B': '1100111', 'V': '110100', 'i': '110101', 'L': '110110', 'o': '110111', 'H': '111000', 'h': '111001', 'I': '11101', '\r': '11110', '\n': '11111'}


In [13]:
# Encode the sample: look up the code of every item of `read_data` and
# concatenate them. A single join replaces growing the string with `+=`
# inside a loop, which copies the partial result over and over.
line = ''.join(codec[chr(c)] for c in read_data)
print(line)

10010111000100010001000110000111010010001011101100101100100100111000101100000101011001100001100111000100011001111011001010100000101111000011100001010110110110001010001011000011001010110100011001110110011000100010011100101110001100100100110010000001010011101010000111010011000010101010111101110110100011111011111111101111111110111111111011111111101111100000011101010110001000000110011000001011001111001001101101000010011010101111000110011111011111100110101100011110101101101111011111000011101111010101100100101110011010001001110100100110111001111001101010110010011010010010011011010101011010111010101110111010111101111100111011110111101010110001000001000001011001111110100011011110111110001110101001101001111001110011010101110001101111101111100001110111010001011001001011100110100011001111101110111111100000110111110001111001011010001101001011010000110000110100110100110001001101011111011100101010111111011011111011111000000110100010110010010111001101000110011011010110111000


In [14]:
# Round trip: decode the bit string produced above with the same code table.
print(decode(line=line, codec=codec))

THE ADVENTURES OF SHERLOCK HOLMES by SIR ARTHUR CONAN DOYLE




   I. A Scandal in Bohemia
  II. The Red-headed League
 III. A Case of Identity
  IV. The Boscombe Valley Mystery
   V. The Fiv
