# 0. ZIP Benchmark

Pomiar statystyk kompresji ZIP na zbiorach danych `all_canterbury.bin` i `canterbury_small.bin`.

## 1) Importy i konfiguracja

In [2]:
import os
import time
import zipfile
from pathlib import Path

import pandas as pd

# Display settings for the summary table: never truncate columns and show
# floats with three digits of precision.
pd.options.display.max_columns = None
pd.options.display.precision = 3

In [3]:
# Benchmark inputs: short label -> path of the file to compress.
# The label is what names the generated output files.
TEST_FILES = dict(
    canterbury_small="../data/canterbury_small.bin",
    all_canterbury="../data/all_canterbury.bin",
)

# Compression level handed to the ZIP writer.
ZIP_LEVEL = 9

# Directory that receives the generated archives and extracted copies;
# created up front so the benchmark can write into it.
OUT_DIR = Path("../out")
OUT_DIR.mkdir(exist_ok=True, parents=True)

## 2) Funkcja do benchmarkowania ZIPa

In [4]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def _files_identical(first: Path, second: Path, chunk_size: int = 1 << 20) -> bool:
    """Compare two files byte-for-byte without loading either one fully into memory."""
    if first.stat().st_size != second.stat().st_size:
        return False
    with open(first, "rb") as fa, open(second, "rb") as fb:
        while True:
            chunk_a = fa.read(chunk_size)
            chunk_b = fb.read(chunk_size)
            if chunk_a != chunk_b:
                return False
            if not chunk_a:  # both streams hit EOF at the same time
                return True


def benchmark_zip_file(input_path: str, zip_path: Path, extracted_path: Path, compresslevel: int = 9):
    """Compress one file into a ZIP archive, extract it again and report metrics.

    The file is stored with DEFLATE at ``compresslevel``, read back from the
    archive, written to ``extracted_path`` and compared with the original.

    Args:
        input_path: File to compress.
        zip_path: Where the ZIP archive is written (overwritten if present).
        extracted_path: Where the decompressed copy is written.
        compresslevel: DEFLATE level passed to ``zipfile.ZipFile``.

    Returns:
        A dict with the keys ``file``, ``original_bytes``, ``compressed_bytes``,
        ``compression_ratio_x``, ``bpc``, ``compression_Bps``,
        ``compression_bitps``, ``decompression_Bps``, ``decompression_bitps``,
        ``compression_time_s``, ``decompression_time_s``, ``verified`` and
        ``zip_level``. Ratios whose denominator is zero (e.g. ``bpc`` for an
        empty input file) are reported as NaN instead of raising.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.

    Notes:
        * ``compressed_bytes`` is the size of the whole archive file, so it
          includes the ZIP container overhead, not just the DEFLATE stream.
        * The compression time covers reading the input and writing the
          archive; the decompression time covers reading the archive and
          writing the extracted copy to disk.
    """
    input_path = Path(input_path)
    # Accept plain strings as well as Path objects for the output locations.
    zip_path = Path(zip_path)
    extracted_path = Path(extracted_path)
    if not input_path.exists():
        raise FileNotFoundError(f"Missing input file: {input_path}")

    # Create the output directories so a fresh checkout does not fail here.
    zip_path.parent.mkdir(parents=True, exist_ok=True)
    extracted_path.parent.mkdir(parents=True, exist_ok=True)

    original_size = input_path.stat().st_size

    t0 = time.perf_counter()
    with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=compresslevel) as zf:
        zf.write(input_path, arcname=input_path.name)
    compression_time = time.perf_counter() - t0

    compressed_size = zip_path.stat().st_size

    t0 = time.perf_counter()
    with zipfile.ZipFile(zip_path, mode="r") as zf:
        data = zf.read(input_path.name)
    with open(extracted_path, "wb") as f:
        f.write(data)
    decompression_time = time.perf_counter() - t0
    del data  # release the decompressed payload before the verification pass

    is_match = _files_identical(input_path, extracted_path)

    return {
        "file": input_path.name,
        "original_bytes": original_size,
        "compressed_bytes": compressed_size,
        "compression_ratio_x": _safe_ratio(original_size, compressed_size),
        "bpc": _safe_ratio(compressed_size * 8, original_size),
        "compression_Bps": _safe_ratio(original_size, compression_time),
        "compression_bitps": _safe_ratio(original_size * 8, compression_time),
        "decompression_Bps": _safe_ratio(original_size, decompression_time),
        "decompression_bitps": _safe_ratio(original_size * 8, decompression_time),
        "compression_time_s": compression_time,
        "decompression_time_s": decompression_time,
        "verified": is_match,
        "zip_level": compresslevel,
    }

## 3) Uruchomienie testów

In [5]:
# Column order of the summary table shown at the end of the cell.
summary_columns = [
    "file",
    "original_bytes",
    "compressed_bytes",
    "compression_ratio_x",
    "bpc",
    "compression_Bps",
    "compression_bitps",
    "decompression_Bps",
    "decompression_bitps",
    "compression_time_s",
    "decompression_time_s",
    "verified",
    "zip_level",
]

# One metrics dict per benchmarked file, in TEST_FILES order.
results = []

for label, input_path in TEST_FILES.items():
    # Output files are named after the dataset label.
    zip_path = OUT_DIR / f"zip_{label}.zip"
    extracted_path = OUT_DIR / f"zip_{label}_decompressed.bin"

    metrics = benchmark_zip_file(
        input_path=input_path,
        zip_path=zip_path,
        extracted_path=extracted_path,
        compresslevel=ZIP_LEVEL,
    )
    results.append(metrics)

    # Per-file report, emitted as one newline-separated block.
    report_lines = (
        f"\n=== {label} ===",
        f"Original:     {metrics['original_bytes']:,} B",
        f"Compressed:   {metrics['compressed_bytes']:,} B",
        f"Ratio:        {metrics['compression_ratio_x']:.2f}x",
        f"BPC:          {metrics['bpc']:.2f}",
        f"Comp speed:   {metrics['compression_Bps']:.2f} B/s ({metrics['compression_bitps']:.2f} bit/s)",
        f"Decomp speed: {metrics['decompression_Bps']:.2f} B/s ({metrics['decompression_bitps']:.2f} bit/s)",
        f"Integrity:    {'✅' if metrics['verified'] else '❌'}",
    )
    print(*report_lines, sep="\n")

# Build the summary frame once from the collected records, then fix the column order.
df = pd.DataFrame(results)[summary_columns]

df


=== canterbury_small ===
Original:     10,846 B
Compressed:   5,189 B
Ratio:        2.09x
BPC:          3.83
Comp speed:   14451698.67 B/s (115613589.35 bit/s)
Decomp speed: 7182186.89 B/s (57457495.13 bit/s)
Integrity:    ✅

=== all_canterbury ===
Original:     18,521,760 B
Compressed:   5,536,295 B
Ratio:        3.35x
BPC:          2.39
Comp speed:   6415630.20 B/s (51325041.64 bit/s)
Decomp speed: 934639955.92 B/s (7477119647.37 bit/s)
Integrity:    ✅


Unnamed: 0,file,original_bytes,compressed_bytes,compression_ratio_x,bpc,compression_Bps,compression_bitps,decompression_Bps,decompression_bitps,compression_time_s,decompression_time_s,verified,zip_level
0,canterbury_small.bin,10846,5189,2.09,3.827,14450000.0,115600000.0,7182000.0,57460000.0,0.0007505,0.002,True,9
1,all_canterbury.bin,18521760,5536295,3.346,2.391,6416000.0,51330000.0,934600000.0,7477000000.0,2.887,0.02,True,9
