# 2. Vectorized Model

Wersja wektoryzowana modelu baseline.

In [1]:
import torch
import torch.nn as nn
import numpy as np
import constriction
import os
import struct
import time
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Konfiguracja
TRAIN_PATH = "../data/all_silesia.bin"
TEST_PATH = "../data/all_canterbury.bin"
COMPRESSED_PATH = "../out/compressed_vectorized.bin"
DECOMPRESSED_PATH = "../out/decompressed_vectorized.bin"
MODEL_PATH = "../out/model_compressor_vectorized.pth"

HIDDEN_SIZE = 128
EPOCHS = 1 
SEQ_LEN = 128
BATCH_SIZE = 64
CHUNK_SIZE = 10_000 # Wielkość bloku dla wektoryzacji kompresji

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cpu


## 1) Importy i konfiguracja

In [2]:
class ByteDataset(Dataset):
    def __init__(self, file_path, seq_len):
        print(f"Loading data from {file_path}...")
        with open(file_path, 'rb') as f:
            self.data = np.frombuffer(f.read(), dtype=np.uint8)

        self.data = torch.from_numpy(self.data).long()
        self.seq_len = seq_len
        self.n_samples = len(self.data) - seq_len - 1

    def __len__(self):
        return self.n_samples // SEQ_LEN

    def __getitem__(self, idx):
        start = idx * SEQ_LEN
        end = start + SEQ_LEN + 1
        
        if end > len(self.data):
            chunk = self.data[start:]
            return chunk[:-1], chunk[1:]
        
        chunk = self.data[start:end]
        return chunk[:-1], chunk[1:]

## 2) DataLoader

In [None]:
class Compressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(257, 32)
        self.lstm = nn.LSTM(32, HIDDEN_SIZE, batch_first=True)
        self.fc = nn.Linear(HIDDEN_SIZE, 256)

    def forward(self, x, hidden=None):
        x = self.embed(x)
        out, hidden = self.lstm(x, hidden)
        logits = self.fc(out)
        return logits, hidden
    
    def _get_probs(self, x, hidden):
        with torch.no_grad():
            logits, hidden = self(x, hidden)
            probs = torch.softmax(logits[0, 0], dim=0).cpu().numpy().astype(np.float32)
        return probs, hidden

## 3) Model

In [None]:
def train_model(model, train_path, epochs=EPOCHS):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    criterion = nn.CrossEntropyLoss()

    dataset = ByteDataset(train_path, SEQ_LEN)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    
    start_time = time.time()
    history = []

    for i in range(epochs):
        total_loss = 0
        steps = 0
        pbar = tqdm(dataloader, desc=f"Epoch {i + 1}/{epochs}", unit="batch")

        for x, y in pbar:
            x, y = x.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits, _ = model(x)
            loss = criterion(logits.view(-1, 256), y.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            steps += 1
            pbar.set_postfix({'loss': loss.item()})

        avg_loss = total_loss / steps
        bpc = avg_loss / 0.693147
        history.append({'loss': avg_loss, 'bpc': bpc})
        print(f"Epoch {i + 1}/{epochs} | Loss: {avg_loss:.4f} | BPC: {bpc:.4f}")

    total_time = time.time() - start_time
    print(f"Training finished in {total_time:.2f} seconds.")
    return history, total_time

## 4) Trening

In [None]:
def compress_vectorized(model, input_path, output_path):
    model.eval()
    encoder = constriction.stream.queue.RangeEncoder()
    
    with open(input_path, "rb") as f:
        data_to_compress = np.frombuffer(f.read(), dtype=np.uint8)
    
    length = len(data_to_compress)
    print(f"Compressing {length} bytes using Vectorized approach...")
    start_time = time.time()
    
    last_symbol = 256 # START token
    hidden = None
    
    with torch.no_grad():
        # Przetwarzamy w chunkach
        for i in tqdm(range(0, length, CHUNK_SIZE), desc="Encoding Chunks"):
            chunk_target = data_to_compress[i : i + CHUNK_SIZE]
            chunk_len = len(chunk_target)
            
            # Przygotowanie inputu: [last_symbol, d0, d1, ..., d_M-2]
            input_seq = np.empty(chunk_len, dtype=np.int64)
            input_seq[0] = last_symbol
            if chunk_len > 1:
                input_seq[1:] = chunk_target[:-1]
                
            input_tensor = torch.from_numpy(input_seq).unsqueeze(0).to(DEVICE) # [1, Seq]
            
            # 1. Oblicz prawdopodobieństwa dla całego chunka naraz (GPU/Model)
            logits, hidden = model(input_tensor, hidden)            
            
            # 2. Kodowanie (CPU loop - szybkie w C++ constriction)
            for j in range(chunk_len):
                symbol = chunk_target[j]
                probs = torch.softmax(logits[0, j], dim=0).cpu().numpy().astype(np.float32)
                dist = constriction.stream.model.Categorical(probs[j], perfect=False)
                encoder.encode(int(symbol), dist)
                
            last_symbol = chunk_target[-1]

    compressed_bits = encoder.get_compressed()
    
    with open(output_path, "wb") as f:
        f.write(struct.pack('<I', length))
        f.write(compressed_bits.tobytes())
        
    duration = time.time() - start_time
    original_size = length
    compressed_size = os.path.getsize(output_path)
    ratio = original_size / compressed_size
    bpc = (compressed_size * 8) / original_size
    
    print(f"Compression finished in {duration:.2f}s")
    print(f"Original size: {original_size} B")
    print(f"Compressed size: {compressed_size} B")
    print(f"Compression Ratio: {ratio:.2f}x")
    print(f"Bits Per Character (BPC): {bpc:.2f}")
    
    return {
        'time': duration,
        'original_size': original_size,
        'compressed_size': compressed_size,
        'ratio': ratio,
        'bpc': bpc,
        'speed_bps': original_size / duration
    }

@torch.inference_mode()
def decompress_optimized(model, input_path, output_path):
    model.eval()
    
    start_time = time.time()
    with open(input_path, "rb") as f:
        orig_len = struct.unpack('<I', f.read(4))[0]
        bits = np.frombuffer(f.read(), dtype=np.uint32)

    decoder = constriction.stream.queue.RangeDecoder(bits)
    decoded_data = []
    curr_symbol = torch.tensor([[256]], dtype=torch.long, device=DEVICE)
    hidden = None

    print(f"Decompressing {orig_len} bytes...")
    
    # Pętla po jednym symbolu - wąskie gardło
    for _ in tqdm(range(orig_len), desc="Decoding"):
        probs, hidden = model._get_probs(curr_symbol, hidden)
        dist = constriction.stream.model.Categorical(probs, perfect=False)
        symbol = decoder.decode(dist)
        decoded_data.append(symbol)
        curr_symbol = torch.tensor([[symbol]], dtype=torch.long, device=DEVICE)

    with open(output_path, "wb") as f:
        f.write(bytes(decoded_data))
        
    duration = time.time() - start_time
    print(f"Decompression finished in {duration:.2f}s")
    
    return {
        'time': duration,
        'speed_bps': orig_len / duration
    }

## 6) Train i test

In [None]:
model = Compressor().to(DEVICE)
model.load_state_dict(torch.load(MODEL_PATH))
model.eval()

# 1. Trening na Silesia Corpus
# print("=== TRAINING (Silesia) ===")
# train_metrics, train_time = train_model(model, TRAIN_PATH)

# torch.save(model.state_dict(), MODEL_PATH)

# 2. Kompresja na Canterbury Corpus (Test)
print("\n=== COMPRESSION (Canterbury) ===")
comp_metrics = compress_vectorized(model, TEST_PATH, COMPRESSED_PATH)

# 3. Dekompresja
print("\n=== DECOMPRESSION (Canterbury) ===")
decomp_metrics = decompress_optimized(model, COMPRESSED_PATH, DECOMPRESSED_PATH)

# 4. Check
with open(TEST_PATH, 'rb') as f1, open(DECOMPRESSED_PATH, 'rb') as f2:
    if f1.read() == f2.read():
        print("\nSUCCESS: Validated!")
    else:
        print("\nFAILURE: Mismatch!")

=== TRAINING (Silesia) ===


  self.data = torch.from_numpy(self.data).long()


Loading data from ../data/all_silesia.bin...


Epoch 1/1:   1%|          | 234/25654 [00:09<17:53, 23.68batch/s, loss=2.42]


KeyboardInterrupt: 

## 7) Podsumowanie

In [None]:
print(f"Baseline Results:")
# print(f"Training Time: {train_time:.2f}s")
print(f"Compression Speed: {comp_metrics['speed_bps']:.2f} B/s")
print(f"Decompression Speed: {decomp_metrics['speed_bps']:.2f} B/s")
print(f"Compression Ratio: {comp_metrics['ratio']:.2f}x")
print(f"BPC: {comp_metrics['bpc']:.2f}")

# Wnioski

batch-processing powoduje wykrzaczenie się precyzji floatow w pytorchu. 
Dla entropy-codera, praowdpodbienstwa muszą być IDENTYCZNE, zeby moc odkodować poprawnie bez utraty informacji.

Dalsze kroki:
- kwantyzacja prawdopodobienstw