# Compression

The purpose of this notebook is to show the typical compression ratios for text data. This can be used to estimate when serialization benefits from compression.


In [None]:
import os as _os
import bz2 as _bz2
import gzip as _gzip
import lzma as _lzma
import random as _random
import typing as _typing
import requests as _requests

import pandas as _pd
import seaborn as _sb

# Source text: The Iliad, a public domain copy hosted by Project Gutenberg.
TEXT_LINK = 'https://www.gutenberg.org/ebooks/6130.txt.utf-8'

# File used to cache the computed results between runs.
DATA_FILE = 'compression.csv'

# Chunk sizes from 0 up to (but not including) CHUNK_SIZES are measured,
# with CHUNK_COUNT random chunks sampled for every size.
CHUNK_SIZES = 1024
CHUNK_COUNT = 1000

# Compression methods to compare, keyed by the name reported in the results.
COMPRESS_FUNCTIONS = dict (
    bz2 = _bz2.compress,
    gzip = _gzip.compress,
    lzma = _lzma.compress,
)

class Result (_typing.NamedTuple):
    """One data point: the average compressed size for a method and chunk size."""
    # Name of the compression method (a key of COMPRESS_FUNCTIONS).
    method: str
    # Size of the uncompressed chunk.
    original: int
    # Average size of the compressed chunks; computed as a mean, hence a float.
    compressed: float


The computation compresses random chunks of text with each compression method and returns the average compressed size for each original chunk size.


In [None]:
def compute_results (text):
    """Measure average compressed sizes of random chunks of text.

    For every chunk size from 0 up to CHUNK_SIZES - 1 bytes, picks CHUNK_COUNT
    random chunks of the UTF-8 encoded text, compresses each chunk with every
    function in COMPRESS_FUNCTIONS, and records the average compressed size.

    Args:
        text: The source text (str) to draw chunks from. Its UTF-8 encoding
            must be at least CHUNK_SIZES bytes long.

    Returns:
        A list of Result tuples, one per (chunk size, compression method) pair.

    Raises:
        ValueError: If the encoded text is shorter than CHUNK_SIZES bytes.
    """
    # Encode once and slice the bytes, so that the reported original size is
    # the true byte size even when the text contains multi-byte characters.
    data = text.encode ('utf-8')
    if len (data) < CHUNK_SIZES:
        raise ValueError ('text must encode to at least %d bytes, got %d' % (CHUNK_SIZES, len (data)))

    results = []
    for size in range (CHUNK_SIZES):
        offsets = [ _random.randrange (0, len (data) - size) for _ in range (CHUNK_COUNT) ]
        chunks = [ data [offset:offset + size] for offset in offsets ]
        for comp, compress_function in COMPRESS_FUNCTIONS.items ():
            sizes = [ len (compress_function (chunk)) for chunk in chunks ]
            average = sum (sizes) / len (sizes)
            results.append (Result (method = comp, original = size, compressed = average))
    return results


The diagram shows that compression typically starts paying off in terms of size after a few hundred bytes of text.


In [None]:
# Try to load the results if already computed.
# Compute and save new results if the cached file is missing or unreadable.
try:
    results = _pd.read_csv (DATA_FILE)

except (OSError, _pd.errors.EmptyDataError, _pd.errors.ParserError):
    response = _requests.get (TEXT_LINK, timeout = 60)
    # Fail loudly on an HTTP error instead of measuring an error page.
    response.raise_for_status ()
    text = response.text
    # compute_results returns a plain list of Result tuples, which has no
    # to_csv method, so convert to a data frame before saving.
    results = _pd.DataFrame (compute_results (text))
    # Skip the index so that the cached file reloads with the same columns.
    results.to_csv (DATA_FILE, index = False)

plottable = _pd.DataFrame (results)

_sb.set ()
plot = _sb.relplot (data = plottable, x = 'original', y = 'compressed', hue = 'method', kind = 'line')
plot.set_xlabels ('Original size [B]')
plot.set_ylabels ('Compressed size [B]')
plot._legend.set_title ('Method')
