# 1. Baseline Model

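Podstawowy model LSTM (baseline): sieć przewiduje rozkład prawdopodobieństwa kolejnego bajtu, a koder zakresowy (range coder) wykorzystuje te prawdopodobieństwa do kompresji danych.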

## 1) Importy i konfiguracja

In [2]:
import torch
import torch.nn as nn
import numpy as np
import constriction
import os
import struct
import time
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt

# Configuration
# All paths are relative to the notebook's working directory.
TRAIN_PATH = "../data/all_silesia.bin"  # Assumes the notebook is run from src/notebooks
TEST_PATH = "../data/all_canterbury.bin"
COMPRESSED_PATH = "../out/compressed_baseline.bin"
DECOMPRESSED_PATH = "../out/decompressed_baseline.txt"
MODEL_PATH = "../out/model_compressor_baseline.pth"

# Model / training hyper-parameters.
HIDDEN_SIZE = 128  # hidden state size of the LSTM
EPOCHS = 1  # default number of training epochs
SEQ_LEN = 128  # length (in bytes) of each training sequence
BATCH_SIZE = 64  # number of sequences per training batch

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cpu


## 2) DataLoader

In [3]:
class ByteDataset(Dataset):
    """Dataset of fixed-length byte windows for next-byte prediction.

    The whole file is read into memory as a 1-D ``torch.long`` tensor.
    Item ``idx`` covers the ``seq_len + 1`` bytes starting at offset
    ``idx * seq_len`` and is returned as an ``(inputs, targets)`` pair in
    which ``targets`` is ``inputs`` shifted by one byte.

    Args:
        file_path: Path of the binary file to load.
        seq_len: Number of bytes in every input sequence; must be positive.

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """

    def __init__(self, file_path: str, seq_len: int):
        if seq_len <= 0:
            raise ValueError(f"seq_len must be positive, got {seq_len}")

        with open(file_path, 'rb') as f:
            raw = np.frombuffer(f.read(), dtype=np.uint8)

        # astype() produces a writable int64 copy, so torch.from_numpy() does
        # not wrap the read-only buffer returned by np.frombuffer().
        self.data = torch.from_numpy(raw.astype(np.int64))
        self.seq_len = seq_len
        self.n_samples = len(self.data) - seq_len - 1

    def __len__(self) -> int:
        """Return the number of windows exposed by the dataset."""
        # Clamp at zero: for files shorter than one window n_samples is
        # negative, and len() must never return a negative value.
        return max(self.n_samples, 0) // self.seq_len

    def __getitem__(self, idx: int):
        """Return the ``(inputs, targets)`` tensors for window ``idx``."""
        # Use the instance's own window length, not a module-level constant,
        # so the ``seq_len`` passed to the constructor is honoured.
        start = idx * self.seq_len
        # Slicing clamps at the end of the data, so no explicit bounds
        # check is needed.
        chunk = self.data[start:start + self.seq_len + 1]
        return chunk[:-1], chunk[1:]

## 3) Model

In [4]:
class Compressor(nn.Module):
    """LSTM model that predicts the next byte from the preceding symbols.

    The input vocabulary has 257 entries: the 256 byte values plus one
    special START symbol (index 256). The output layer produces 256 logits,
    one per possible next byte.

    Args:
        hidden_size: Size of the LSTM hidden state. ``None`` (default) uses
            the module-level ``HIDDEN_SIZE``.
        embed_dim: Width of the symbol embedding. The default of 32 matches
            the original fixed architecture.
    """

    def __init__(self, hidden_size=None, embed_dim=32):
        super().__init__()
        if hidden_size is None:
            # Resolved at call time so the no-argument constructor keeps
            # building the same architecture as before.
            hidden_size = HIDDEN_SIZE
        # Vocabulary: 256 byte values + 1 special START symbol
        self.embed = nn.Embedding(257, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 256)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of symbol indices.

        Args:
            x: Integer tensor of symbol indices; the LSTM is batch-first.
            hidden: Optional LSTM state from a previous call.

        Returns:
            Logits with a trailing dimension of 256 and the new LSTM state.
        """
        x = self.embed(x)
        out, hidden = self.lstm(x, hidden)
        logits = self.fc(out)
        return logits, hidden

    def _get_probs(self, x, hidden):
        """Return next-byte probabilities for a single step.

        Runs the model without gradient tracking and applies softmax to the
        logits of the first batch element at the first time step.

        Returns:
            A ``float32`` NumPy array of 256 probabilities and the new
            LSTM state.
        """
        with torch.no_grad():
            logits, hidden = self(x, hidden)
            probs = torch.softmax(logits[0, 0], dim=0).cpu().numpy().astype(np.float32)
        return probs, hidden

## 4) Trening

In [5]:
def train_model(model, train_path, epochs=EPOCHS, lr=0.01):
    """Train ``model`` on the bytes of ``train_path`` for next-byte prediction.

    Args:
        model: Module returning ``(logits, hidden)`` when called on a batch.
        train_path: Path of the binary training file.
        epochs: Number of passes over the training data.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A tuple ``(history, total_time)``: ``history`` holds one dict per
        epoch with the mean ``'loss'`` (nats) and the equivalent ``'bpc'``
        (bits per character); ``total_time`` is the wall-clock training
        time in seconds.

    Raises:
        ValueError: If the training file yields no complete batch.
    """
    import math

    dataset = ByteDataset(train_path, SEQ_LEN)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    if len(dataloader) == 0:
        # With drop_last=True a small file produces zero batches; fail early
        # instead of dividing by zero when averaging the loss.
        raise ValueError(
            f"{train_path!r} is too small to form a single batch "
            f"(SEQ_LEN={SEQ_LEN}, BATCH_SIZE={BATCH_SIZE})"
        )

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    nats_per_bit = math.log(2)  # cross-entropy is in nats; divide to get bits

    start_time = time.perf_counter()
    history = []

    for epoch in range(epochs):
        total_loss = 0.0
        steps = 0
        pbar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")

        for x, y in pbar:
            x, y = x.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits, _ = model(x)
            # Flatten (batch, time) so every position is one classification.
            loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            steps += 1
            pbar.set_postfix({'loss': batch_loss})

        avg_loss = total_loss / steps
        bpc = avg_loss / nats_per_bit
        history.append({'loss': avg_loss, 'bpc': bpc})
        print(f"Epoch {epoch + 1}/{epochs} | Loss: {avg_loss:.4f} | BPC: {bpc:.4f}")

    total_time = time.perf_counter() - start_time
    print(f"Training finished in {total_time:.2f} seconds.")
    return history, total_time

## 5) Funkcje kompresji i dekompresji

In [6]:
def compress_file(model, input_path, output_path):
    """Compress ``input_path`` into ``output_path`` using ``model``.

    Each byte is range-coded with the probability distribution the model
    predicts from the preceding bytes; the first prediction is conditioned
    on the special START symbol (256). The output file consists of a 4-byte
    little-endian original length followed by the encoder's compressed words.

    Args:
        model: Model exposing ``_get_probs(symbol, hidden)``.
        input_path: File to compress.
        output_path: Destination of the compressed stream.

    Returns:
        Dict with ``'time'`` (seconds, covering encoding and writing the
        output), ``'original_size'``, ``'compressed_size'`` (bytes),
        ``'ratio'``, ``'bpc'`` and ``'speed_bps'``.

    Raises:
        ValueError: If the input is too large for the 32-bit length header.
    """
    model.eval()
    encoder = constriction.stream.queue.RangeEncoder()

    with open(input_path, "rb") as f:
        data_to_compress = np.frombuffer(f.read(), dtype=np.uint8)

    length = len(data_to_compress)
    if length > 0xFFFFFFFF:
        # Check before encoding: the '<I' header cannot hold the length, and
        # discovering that only after encoding wastes the whole run.
        raise ValueError(
            f"Input of {length} bytes does not fit the 32-bit length header"
        )

    curr_symbol = torch.tensor([[256]], dtype=torch.long, device=DEVICE)
    hidden = None

    print(f"Compressing {length} bytes...")
    start_time = time.perf_counter()

    # Baseline: Symbol-by-symbol processing
    for raw_symbol in tqdm(data_to_compress, desc="Encoding"):
        symbol = int(raw_symbol)
        probs, hidden = model._get_probs(curr_symbol, hidden)
        dist = constriction.stream.model.Categorical(probs, perfect=False)
        encoder.encode(symbol, dist)
        curr_symbol = torch.tensor([[symbol]], dtype=torch.long, device=DEVICE)

    compressed_bits = encoder.get_compressed()

    with open(output_path, "wb") as f:
        f.write(struct.pack('<I', length))
        f.write(compressed_bits.tobytes())

    duration = time.perf_counter() - start_time
    original_size = length
    compressed_size = os.path.getsize(output_path)
    # compressed_size always includes the 4-byte header, so it is never zero.
    ratio = original_size / compressed_size
    # Empty input has no characters; report 0.0 instead of dividing by zero.
    bpc = (compressed_size * 8) / original_size if original_size else 0.0
    speed_bps = original_size / duration if duration > 0 else 0.0

    print(f"Compression finished in {duration:.2f}s")
    print(f"Original size: {original_size} B")
    print(f"Compressed size: {compressed_size} B")
    print(f"Compression Ratio: {ratio:.2f}x")
    print(f"Bits Per Character (BPC): {bpc:.2f}")

    return {
        'time': duration,
        'original_size': original_size,
        'compressed_size': compressed_size,
        'ratio': ratio,
        'bpc': bpc,
        'speed_bps': speed_bps
    }

def decompress_file(model, input_path, output_path):
    """Decompress a file written by ``compress_file`` using the same ``model``.

    Reads the 4-byte little-endian original length, then decodes that many
    bytes, feeding every decoded byte back into the model to obtain the
    distribution for the next one (starting from the START symbol, 256).

    Args:
        model: Model exposing ``_get_probs(symbol, hidden)``.
        input_path: Compressed file to read.
        output_path: Destination of the reconstructed bytes.

    Returns:
        Dict with ``'time'`` (seconds, covering reading, decoding and
        writing) and ``'speed_bps'`` (decoded bytes per second).

    Raises:
        ValueError: If the file is too short to contain the length header.
    """
    model.eval()

    start_time = time.perf_counter()
    with open(input_path, "rb") as f:
        header = f.read(4)
        if len(header) != 4:
            raise ValueError(
                f"{input_path!r} is truncated: missing 4-byte length header"
            )
        orig_len = struct.unpack('<I', header)[0]
        bits = np.frombuffer(f.read(), dtype=np.uint32)

    decoder = constriction.stream.queue.RangeDecoder(bits)
    decoded_data = bytearray()
    curr_symbol = torch.tensor([[256]], dtype=torch.long, device=DEVICE)
    hidden = None

    print(f"Decompressing {orig_len} bytes...")

    for _ in tqdm(range(orig_len), desc="Decoding"):
        probs, hidden = model._get_probs(curr_symbol, hidden)
        dist = constriction.stream.model.Categorical(probs, perfect=False)
        symbol = decoder.decode(dist)
        decoded_data.append(symbol)
        curr_symbol = torch.tensor([[symbol]], dtype=torch.long, device=DEVICE)

    with open(output_path, "wb") as f:
        f.write(decoded_data)

    duration = time.perf_counter() - start_time
    print(f"Decompression finished in {duration:.2f}s")

    return {
        'time': duration,
        # Guard against a zero-length measurement (e.g. an empty payload).
        'speed_bps': orig_len / duration if duration > 0 else 0.0
    }

## 6) Train i test

In [7]:
# Model initialisation: build the network and load previously saved weights.
# NOTE(review): torch.load unpickles the checkpoint, so only load files from a
# trusted source (consider weights_only=True where the installed torch supports it).
model = Compressor().to(DEVICE)
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
model.eval()

# Overrides the TEST_PATH from the configuration cell with a smaller file.
TEST_PATH = "../data/canterbury_small.bin"

# 1. Training (disabled: the weights are loaded from MODEL_PATH above)
# print("=== TRAINING ===")
# train_metrics, train_time = train_model(model, TRAIN_PATH)

# Save the model
# torch.save(model.state_dict(), MODEL_PATH)

# 2. Compression
print("=== COMPRESSION ===")
# Note: for a quick test a smaller file can be used when the full corpus is large.
# (The original comment referred to bible.txt; the file actually used is TEST_PATH set above.)
comp_metrics = compress_file(model, TEST_PATH, COMPRESSED_PATH)

# 3. Decompression
print("=== DECOMPRESSION ===")
decomp_metrics = decompress_file(model, COMPRESSED_PATH, DECOMPRESSED_PATH)

# 4. Correctness check: the round trip must reproduce the input byte-for-byte.
with open(TEST_PATH, 'rb') as f1, open(DECOMPRESSED_PATH, 'rb') as f2:
    original = f1.read()
    decompressed = f2.read()
    if original == decompressed:
        print("SUCCESS: Decompressed data matches original!")
    else:
        print("FAILURE: Data mismatch!")

=== COMPRESSION ===
Compressing 10846 bytes...


Encoding: 100%|██████████| 10846/10846 [00:00<00:00, 25085.80it/s]


Compression finished in 0.44s
Original size: 10846 B
Compressed size: 6112 B
Compression Ratio: 1.77x
Bits Per Character (BPC): 4.51
=== DECOMPRESSION ===
Decompressing 10846 bytes...


Decoding: 100%|██████████| 10846/10846 [00:00<00:00, 24144.46it/s]

Decompression finished in 0.45s
SUCCESS: Decompressed data matches original!





## 7) Podsumowanie

In [8]:
# Report the headline metrics gathered by the compression and decompression runs.
print("Baseline Results:")
# Training time is only available when the training step has been run:
# print(f"Training Time: {train_time:.2f}s")
print("Compression Speed: {:.2f} B/s".format(comp_metrics['speed_bps']))
print("Decompression Speed: {:.2f} B/s".format(decomp_metrics['speed_bps']))
print("Compression Ratio: {:.2f}x".format(comp_metrics['ratio']))
print("BPC: {:.2f}".format(comp_metrics['bpc']))

Baseline Results:
Compression Speed: 24500.38 B/s
Decompression Speed: 24030.46 B/s
Compression Ratio: 1.77x
BPC: 4.51
