In [None]:
"""huffman_encoding.ipynb"""

from pathlib import Path

# Cell 01 - Read the uncompressed text file

# Path.open() is an instance method, so the Path object must be built first.
# Calling Path.open("name", "rb") passes the plain string as `self`, which
# works only by accident on some Python versions and raises AttributeError
# on others.  read_bytes() opens, reads and closes the file in one step.
data = Path("AliceInWonderland.txt").read_bytes()

# The decoded text is only used for the preview printed here; the raw bytes
# in `data` are what get counted and encoded.  decode("ascii") raises
# UnicodeDecodeError if the file contains any non-ASCII byte.
text = data.decode("ascii")
lines = text.splitlines()
for line in lines[:17]:
    print(line)


In [None]:
# Cell 02 - Build frequency table

from collections import defaultdict
from pprint import pprint

# Tally how often each byte value occurs in the file.  Iterating over a
# bytes object yields ints, so the table maps int byte value -> count.
# defaultdict(int) makes a missing key start at 0.
freq_table = defaultdict(int)
for byte in data:
    freq_table[byte] = freq_table[byte] + 1

pprint(freq_table)

In [None]:
# Cell 03 - Create a priority queue heap from the frequency table

import heapq


class Node:
    """One node of a Huffman tree.

    A leaf carries a byte value in ``byte`` and has no children; an internal
    node has ``byte`` set to None and holds its two subtrees in ``left`` and
    ``right``.  ``freq`` is the weight used to order nodes in the heap.
    """

    def __init__(self, freq, byte=None, left=None, right=None):
        self.freq = freq    # occurrence count (leaf) or sum of children (internal)
        self.byte = byte    # byte value for a leaf, None for an internal node
        self.left = left    # left child, or None
        self.right = right  # right child, or None

    def __lt__(self, other):
        """Order nodes by frequency; heapq relies on ``<`` to arrange entries.

        Returns NotImplemented when ``other`` has no ``freq`` attribute so
        that Python can fall back to the reflected comparison or raise a
        clear TypeError, instead of leaking an AttributeError from here.
        """
        if not hasattr(other, "freq"):
            return NotImplemented
        return self.freq < other.freq


# One leaf node per distinct byte value, weighted by how often it occurs.
heap = [Node(count, value) for value, count in freq_table.items()]

# heapq.heapify() rearranges a regular Python list in place into a "min-heap":
# the smallest element sits at index 0, and every node of the resulting
# binary heap is less than or equal to its children.
heapq.heapify(heap)

# Peek at the first two and the last two slots of the heap list.  Only
# index 0 is guaranteed to be the minimum; the remaining slots satisfy the
# heap property but are not in sorted order.
for position in (0, 1, -2, -1):
    entry = heap[position]
    print(entry.byte, entry.freq)

In [None]:
# Cell 04 - Build the Huffman Tree from the priority queue heap

# Repeatedly take the two lowest-frequency nodes off the heap and put back a
# parent whose frequency is their sum.  Each pass shrinks the heap by one,
# so the loop ends with a single node: the root of the Huffman tree.
while len(heap) >= 2:
    lighter = heapq.heappop(heap)
    heavier = heapq.heappop(heap)
    parent = Node(lighter.freq + heavier.freq, left=lighter, right=heavier)
    heapq.heappush(heap, parent)

tree = heap[0]

# The root's frequency is the sum of every leaf count, so these two numbers
# should agree.
print(f"Total frequency count of all characters: {tree.freq:,}")
print(f"Total length in characters of text file: {len(data):,}")

In [None]:
# Cell 05 - Using recursion, build a table mapping each byte to its Huffman binary string


def build_code_table(node, prefix="", code_table=None):
    """Walk a Huffman tree and map every leaf's byte to its bit string.

    Args:
        node: Root of the (sub)tree to walk.  Must expose ``byte``, ``left``
            and ``right`` attributes; None is treated as an empty subtree.
        prefix: Bits accumulated on the path from the root to ``node``
            ("0" for each left step, "1" for each right step).
        code_table: Dict to fill in.  A fresh dict is created when omitted,
            so separate top-level calls never share results.

    Returns:
        The dict mapping byte value -> code string (empty for an empty tree).
    """
    # A mutable default ({}) would be created once and reused by every call,
    # leaking codes from one tree into the next; allocate per call instead.
    if code_table is None:
        code_table = {}
    if node is None:
        return code_table
    if node.byte is not None:
        # A tree consisting of a single leaf would otherwise give its only
        # symbol the empty code, so the encoded data would carry no bits at
        # all.  Use a one-bit code in that case.
        code_table[node.byte] = prefix or "0"
    build_code_table(node.left, prefix + "0", code_table)
    build_code_table(node.right, prefix + "1", code_table)
    return code_table


code_table: dict[int, str] = build_code_table(tree)

# dict.get() is used so a byte that never occurs in the file prints None
# instead of raising KeyError.
print(f"Huffman binary string for 'e' (ASCII 101): {code_table.get(101)}")
print(f"Huffman binary string for 'Z' (ASCII 90): {code_table.get(90)}")

# List every (byte, code) pair, ordered by the code string (element 1 of
# each pair) in descending order.
pprint(sorted(code_table.items(), key=lambda pair: pair[1], reverse=True))


In [None]:
# Cell 06 - From first to last file byte, concatenate one Huffman binary string

# Look up the code of every input byte, in file order, and glue the codes
# together into one long string of "0"/"1" characters.
encoded_data = "".join(map(code_table.__getitem__, data))

# Preview the first 400 bits as ten rows of 40.
for offset in range(0, 400, 40):
    print(encoded_data[offset : offset + 40])
    

In [None]:
# Cell 07 - Save the Huffman tree and encoded file bytes to the compressed file

def pad_encoded_data(encoded_data):
    """Pad a bit string to a whole number of bytes and prefix the pad count.

    Zero bits are appended so the length becomes a multiple of 8, and the
    number of appended bits is written in front as an 8-bit binary field.
    When the input length is already a multiple of 8 (including the empty
    string), a full 8 zero bits are appended and the field records 8.

    Args:
        encoded_data: String of "0"/"1" characters.

    Returns:
        The 8-character count field, then the input, then the zero padding.
    """
    extra_padding = 8 - len(encoded_data) % 8
    padding_field = format(extra_padding, "08b")
    return padding_field + encoded_data + "0" * extra_padding


def get_byte_array(padded_encoded_data):
    """Pack a string of "0"/"1" characters into bytes, 8 bits per byte.

    The string is cut into consecutive 8-character groups and each group is
    parsed as a base-2 integer.  A trailing group shorter than 8 characters
    is parsed as-is.

    Args:
        padded_encoded_data: Bit string to pack.

    Returns:
        A bytearray holding one value per group.
    """
    total_bits = len(padded_encoded_data)
    return bytearray(
        int(padded_encoded_data[start : start + 8], 2)
        for start in range(0, total_bits, 8)
    )


def serialize_tree(node):
    """Flatten a Huffman tree into bytes using a pre-order walk.

    A leaf is written as the marker byte 0x01 followed by its byte value.
    An internal node is written as the marker byte 0x00 followed by the
    serialization of its left subtree and then its right subtree.

    Args:
        node: Root of the tree; must expose ``byte``, ``left`` and ``right``.

    Returns:
        The serialized tree as a bytes object.
    """
    out = bytearray()
    pending = [node]
    while pending:
        current = pending.pop()
        if current.byte is not None:
            out += b"\x01" + bytes([current.byte])
        else:
            out += b"\x00"
            # Push right first so the left subtree is popped, and therefore
            # written, before the right one.
            pending.append(current.right)
            pending.append(current.left)
    return bytes(out)


tree_data = serialize_tree(tree)

# Do the bit packing before the output file is opened, so a failure here
# cannot leave a partially written file behind.
padded_data = pad_encoded_data(encoded_data)
byte_array = get_byte_array(padded_data)

# File layout: a 2-byte big-endian length of the serialized tree, the tree
# itself, then the packed bits (whose first byte is the padding count).
tree_length_header = len(tree_data).to_bytes(2, byteorder="big")

with open("AliceInWonderland.bin", "wb") as outfile:
    outfile.write(tree_length_header)
    outfile.write(tree_data)
    outfile.write(bytes(byte_array))

# The file on disk holds the header and the tree as well as the packed bits,
# so all three parts count toward the compressed size.
compressed_size = len(tree_length_header) + len(tree_data) + len(byte_array)

print(f"Created file '{outfile.name}'")
print(f"Length of uncompressed file : {len(data):>7,} bytes")
print(f"Length of compressed file   : {compressed_size:>7,} bytes")
print(f"Compression Ratio: {1 - compressed_size / len(data):.2%}")