In [1]:
import struct
import komm
import numpy as np
import matplotlib.pyplot as plt

In [2]:
pxa = 3 / 10
pxb = 6 / 10
pxc = 1 / 10

In [3]:
arr = [pxa, pxb, pxc]

Hx = 0
for i in range(len(arr)):
    px = arr[i]
    Hx += -px * np.log2(px)

Hx

np.float64(1.295461844238322)

In [4]:
x2 = [i * j for i in arr for j in arr]

x2

[0.09, 0.18, 0.03, 0.18, 0.36, 0.06, 0.03, 0.06, 0.010000000000000002]

In [5]:
Hx2 = 0
for i in range(len(x2)):
    Hx2 += -x2[i] * np.log2(x2[i])

Hx2

np.float64(2.5909236884766433)

In [6]:
(
    0.36 * 2
    + 0.18 * 2
    + 0.18 * 3
    + 0.09 * 4
    + 0.06 * 4
    + 0.06 * 5
    + 0.03 * 6
    + 0.03 * 6
    + 0.01 * 6
)

2.94

### Alice no País das Maravilhas

In [7]:
with open("assets/pg11.txt") as f:
    text = f.read()

# Contar caracteres e ordenar por frequência
chars = list(set(text))
counts = {c: text.count(c) for c in chars}
chars_sorted = sorted(chars, key=lambda c: counts[c], reverse=True)  # Ordenação crítica
pmf = [counts[c] / len(text) for c in chars_sorted]

In [8]:
import struct

with open("encoded_text.bin", "rb") as file:
    # Ler metadados básicos
    padding = int.from_bytes(file.read(1), byteorder="big")

    # Ler tamanho da tabela de códigos (4 bytes)
    num_codigos = struct.unpack("I", file.read(4))[0]

    # Reconstruir tabela de códigos
    codes = {}
    for _ in range(num_codigos):
        # Ler caractere
        tamanho_char = struct.unpack("B", file.read(1))[0]
        char = file.read(tamanho_char).decode("utf-8")

        # Ler código
        tamanho_codigo = struct.unpack("B", file.read(1))[0]
        bytes_codigo = file.read((tamanho_codigo + 7) // 8)

        # Converter para string binária
        codigo_int = int.from_bytes(bytes_codigo, byteorder="big")
        codigo = format(codigo_int, f"0{tamanho_codigo}b")
        codes[codigo] = char

    # Ler dados comprimidos
    dados_comprimidos = file.read()

# Converter bytes para string binária
bits = "".join(format(byte, "08b") for byte in dados_comprimidos)
if padding != 8:
    bits = bits[:-padding]

# Decodificação gulosa
codigo_atual = ""
texto_decodificado = []
for bit in bits:
    codigo_atual += bit
    if codigo_atual in codes:
        texto_decodificado.append(codes[codigo_atual])
        codigo_atual = ""

# Escrever resultado
with open("huff_decoded.txt", "w", encoding="utf-8") as file:
    file.write("".join(texto_decodificado))

In [9]:
print("Contagem de caracteres:")
counts

Contagem de caracteres:


{'L': 111,
 'o': 9372,
 'F': 123,
 'g': 2768,
 'm': 2210,
 'q': 139,
 '(': 73,
 'T': 482,
 'i': 7856,
 'U': 70,
 '1': 58,
 '?': 204,
 'h': 7677,
 'Z': 1,
 'r': 6491,
 '5': 9,
 '’': 712,
 '#': 1,
 ']': 4,
 '“': 1129,
 '9': 7,
 '_': 440,
 '!': 452,
 'X': 10,
 '—': 265,
 ')': 73,
 'K': 81,
 'C': 181,
 'Y': 100,
 'k': 1217,
 'J': 13,
 '”': 1125,
 'N': 136,
 '-': 158,
 'I': 784,
 '2': 11,
 'u': 3921,
 'S': 229,
 'd': 5272,
 'G': 182,
 '‘': 47,
 'B': 118,
 'Q': 84,
 'V': 26,
 'n': 7935,
 '\n': 3757,
 'O': 146,
 'v': 943,
 '•': 4,
 '.': 1223,
 'A': 670,
 '/': 6,
 "'": 4,
 'y': 2503,
 'E': 193,
 'f': 2255,
 'b': 1635,
 'R': 165,
 'w': 2745,
 'l': 5102,
 'D': 206,
 's': 7041,
 'j': 223,
 ',': 2569,
 '*': 72,
 'a': 9167,
 '0': 21,
 'ù': 1,
 'H': 245,
 ';': 193,
 '%': 1,
 'e': 15287,
 'p': 1795,
 '6': 7,
 't': 11740,
 '[': 4,
 'z': 78,
 '™': 57,
 'P': 180,
 '$': 2,
 'c': 2846,
 'x': 170,
 'M': 196,
 '8': 10,
 '3': 12,
 '\ufeff': 1,
 ':': 246,
 ' ': 27601,
 '4': 9,
 'W': 226,
 '7': 5}

In [10]:
print("PMF")
dict(zip(chars_sorted, pmf))

PMF


{' ': 0.16838194474100013,
 'e': 0.09325947571666494,
 't': 0.07162073951158804,
 'o': 0.057174580127990045,
 'a': 0.05592396244486606,
 'n': 0.04840805519799413,
 'i': 0.0479261098469366,
 'h': 0.04683410708947712,
 's': 0.04295414198476077,
 'r': 0.039598826249550086,
 'd': 0.0321622264655104,
 'l': 0.031125128874627102,
 'u': 0.023920350905020163,
 '\n': 0.022919856758520978,
 'c': 0.01736223378619928,
 'g': 0.016886389009205766,
 'w': 0.01674607580573332,
 ',': 0.015672374770465902,
 'y': 0.015269736882240618,
 'f': 0.0137567945143638,
 'm': 0.013482268681482928,
 'p': 0.010950530444914867,
 'b': 0.009974438594671758,
 '.': 0.007461002080295756,
 'k': 0.007424398635911639,
 '“': 0.00688754811827793,
 '”': 0.006863145822021853,
 'v': 0.005752841342370317,
 'I': 0.004782850066191229,
 '’': 0.00434360873358183,
 'A': 0.004087384622893014,
 'T': 0.0029404766988573626,
 '!': 0.00275745947693678,
 '_': 0.0026842525881685467,
 '—': 0.0016166521269651475,
 ':': 0.0015007412197487783,
 'H':

In [20]:
huff = komm.HuffmanCode(pmf)
dms = komm.DiscreteMemorylessSource(pmf)
print("Entropy:", dms.entropy())
print("Mean length:", huff.rate(pmf))

Entropy: 4.6039201555468905
Mean length: 4.6432750321805285


In [11]:
with open("assets/pg11.txt") as f:
    text = f.read()
    original_size = len(text)

with open("encoded_text.bin", "rb") as f:
    data = f.read()
    encoded_size = len(data)

with open("huff_decoded.txt") as f:
    text = f.read()
    decoded_size = len(text)

print(f"Original size: {original_size} bytes")
print(f"Encoded size: {encoded_size} bytes")
print(f"Decoded size: {decoded_size} bytes")
print(f"Compression ratio: {(1- encoded_size/original_size)*100:.2f}%")

Original size: 163919 bytes
Encoded size: 95591 bytes
Decoded size: 163919 bytes
Compression ratio: 41.68%
