In [None]:
"""huffman_decoding.ipynb"""

import os
from pathlib import Path

# Cell 01 - Read the compressed binary file

file_name = "AliceInWonderland.bin"

# File layout, as consumed below: a 2-byte big-endian length of the
# serialized tree, the serialized tree itself, then the encoded payload
# running to the end of the file.
# Path.open is an instance method, so build the Path first; calling it unbound
# with a plain string only works by accident on some Python versions.
with Path(file_name).open("rb") as infile:
    tree_len = int.from_bytes(infile.read(2), byteorder="big")
    tree_data = infile.read(tree_len)
    encoded_bytes = infile.read()

print(f"Compressed binary file size :{os.path.getsize(file_name):>7,}")
# The +2 accounts for the length prefix read ahead of the tree bytes.
print(f"Encoded tree data length    :{tree_len + 2:>7,}")
print(f"Encoded file data length    :{len(encoded_bytes):>7,}")


In [None]:
# Cell 02 - Reconstruct the Huffman tree


class Node:
    """One node of the Huffman tree.

    Attributes are stored exactly as passed in:
        freq:  numeric weight, 0 by default (presumably the symbol frequency
               used when the tree was built; the decoding code in this
               notebook never sets it).
        byte:  byte value held by the node, or None when it holds none.
        left:  left child node, or None.
        right: right child node, or None.
    """

    def __init__(self, freq=0, byte=None, left=None, right=None):
        self.freq, self.byte = freq, byte
        self.left, self.right = left, right


def deserialize_tree(data):
    """Rebuild a Huffman tree from its serialized byte form.

    The data is read front to back. Every node starts with a flag byte:
    a flag of 1 marks a leaf and is followed by one byte holding the leaf's
    value; any other flag marks an internal node and is followed by its left
    subtree and then its right subtree.

    Args:
        data: Indexable sequence of integers (e.g. ``bytes``) holding the tree.

    Returns:
        The root ``Node`` of the reconstructed tree.

    Raises:
        ValueError: If the data ends before the tree is complete, or if bytes
            remain after the root has been fully parsed.
    """
    total = len(data)
    cursor = 0  # position of the next unread item, shared by all parse() calls

    def parse():
        nonlocal cursor
        if cursor >= total:
            raise ValueError("Unexpected end of tree data")

        marker = data[cursor]
        cursor += 1

        if marker != 1:
            # Internal node: the left subtree is serialized before the right.
            left_child = parse()
            right_child = parse()
            return Node(left=left_child, right=right_child)

        # Leaf node: the next item is the byte value it represents.
        if cursor >= total:
            raise ValueError("Missing byte after leaf indicator")
        value = data[cursor]
        cursor += 1
        return Node(byte=value)

    root = parse()
    if cursor != total:
        raise ValueError("Tree data not fully consumed. Tree may be malformed.")
    return root


tree = deserialize_tree(tree_data)

# Start at the root and follow three successive left links (bit path 000),
# printing the byte stored at every node visited, root included.
node = tree
print(node.byte)
for _step in range(3):
    node = node.left
    print(node.byte)


In [None]:
# Cell 03 - Expand the encoded bytes into a bit string and strip the padding


def remove_padding(padded_data):
    """Strip the padding header and the trailing pad bits from a bit string.

    The first 8 characters are parsed as a base-2 integer giving the number
    of pad bits that were appended after the payload.

    Args:
        padded_data: String of bit characters laid out as
            8-bit pad count + payload + that many trailing pad bits.

    Returns:
        The payload bits, with the header and the trailing padding removed.

    Raises:
        ValueError: If fewer than 8 header bits are present, if the header
            cannot be parsed as a base-2 integer, or if the header declares
            more padding than there are bits after it.
    """
    header_bits = 8
    # Without this check a short input would be misread as a padding count
    # and silently produce an empty payload.
    if len(padded_data) < header_bits:
        raise ValueError(
            f"Padded data must start with an {header_bits}-bit padding header, "
            f"got only {len(padded_data)} bits"
        )

    extra_padding = int(padded_data[:header_bits], 2)
    body = padded_data[header_bits:]
    if extra_padding > len(body):
        raise ValueError(
            f"Padding header declares {extra_padding} pad bits "
            f"but only {len(body)} bits follow the header"
        )

    # An explicit end index also covers extra_padding == 0 (body[:-0] would be empty).
    return body[: len(body) - extra_padding]


# Render every encoded byte as 8 binary digits and concatenate them, then
# drop the padding header and the trailing pad bits.
bitstring = "".join(map("{:08b}".format, encoded_bytes))
encoded_data = remove_padding(bitstring)

# Preview the first 400 bits as rows of 40.
for i in range(0, 400, 40):
    row = slice(i, i + 40)
    print(encoded_data[row])


In [None]:
# Cell 04 - Parse the encoded data using the Huffman tree and emit each byte


def decode_data(encoded_data, tree):
    decoded_bytes = bytearray()
    current = tree
    for bit in encoded_data:
        current = current.left if bit == "0" else current.right
        if current.byte is not None:
            decoded_bytes.append(current.byte)
            current = tree
    return decoded_bytes


decoded_bytes = decode_data(encoded_data, tree)

# The decoded payload is interpreted as ASCII text; any byte outside that
# range raises UnicodeDecodeError here.
text = str(decoded_bytes, "ascii")
lines = text.splitlines()

# Show only the first 17 lines as a preview.
for line in lines[:17]:
    print(line)


In [None]:
# Cell 05 - Save the uncompressed text file and compare to the original


def files_are_identical(file1: Path, file2: Path) -> bool:
    """Report whether two files have exactly the same contents.

    The sizes are compared first as a cheap rejection test; only when they
    match are the files read side by side in 4096-byte chunks.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        True when both files hold identical bytes, False otherwise.
    """
    if file1.stat().st_size != file2.stat().st_size:
        return False

    chunk_size = 4096
    with file1.open("rb") as left, file2.open("rb") as right:
        left_chunk = left.read(chunk_size)
        right_chunk = right.read(chunk_size)
        while left_chunk == right_chunk:
            if not left_chunk:
                # Both streams are exhausted without finding a difference.
                return True
            left_chunk = left.read(chunk_size)
            right_chunk = right.read(chunk_size)

    return False


f1 = Path("AliceInWonderland.txt")
f2 = Path("AliceInWonderland.uncompressed.txt")

# Write the decoded bytes out, then check them against the original text file.
with f2.open("wb") as out:
    out.write(decoded_bytes)

print(files_are_identical(f1, f2))  # True or False