In [1]:
import arith, fqt, ppm
import contextlib, sys

## Test the modified version of Nayuki's Arithmetic Coder

In [2]:
def get_frequencies(filepath):
    """Count the occurrences of every byte value in the file at `filepath`.

    A fqt.SimpleFrequencyTable is created from 257 zero counts and incremented
    once for each byte read from the file. The table is returned; slot 256 is
    left untouched by this function.
    """
    table = fqt.SimpleFrequencyTable([0] * 257)
    with open(filepath, "rb") as src:
        # iter() with a sentinel stops as soon as read() returns b"" (end of file).
        for chunk in iter(lambda: src.read(1), b""):
            table.increment(chunk[0])
    return table


def write_frequencies(bitout, model):
    """Write the counts of symbols 0..255 from `model` to `bitout`.

    Each count is obtained with model.get() and emitted through write_int()
    as a 32-bit field. Symbol 256 is not written.
    """
    symbol = 0
    while symbol < 256:
        count = model.get(symbol)
        write_int(bitout, 32, count)
        symbol += 1
def write_int(bitout, numbits, value):
    """Write the lowest `numbits` bits of `value` to `bitout`, highest bit first.

    One call to bitout.write() is made per bit, each with the value 0 or 1.
    Nothing is written when `numbits` is 0.
    """
    shift = numbits - 1
    while shift >= 0:
        bit = (value >> shift) & 1
        bitout.write(bit)
        shift -= 1

def compress(model, inp, bitout):
    """Encode every byte of `inp` with a fixed frequency model, then symbol 256.

    An arith.ArithmeticCoder(32) is started on `bitout`. For each byte read
    from `inp`, the model's total plus the symbol's low and high values are
    handed to storeRegion(). After the input is exhausted the same is done for
    symbol 256 (used as the end marker) and the encoder is finished.
    """
    coder = arith.ArithmeticCoder(32)
    coder.start_encode(bitout)

    def emit(sym):
        # Look up the symbol's region in the (unchanging) model and store it.
        total = model.get_total()
        low = model.get_low(sym)
        high = model.get_high(sym)
        coder.storeRegion(low, high, total)

    while True:
        chunk = inp.read(1)
        if not chunk:
            break
        emit(chunk[0])

    emit(256)
    coder.finish_encode()

# Forward slashes avoid the invalid "\g" escape sequence in the path literals
# and are accepted as separators on Windows as well as on POSIX systems.
inputfile, outputfile = 'testdata/gattaca.txt', 'testdata/gattaca_compressed.txt'

# Read input file once to compute symbol frequencies
model = get_frequencies(inputfile)
model.increment(256)  # EOF symbol gets a frequency of 1

# Read input file again, compress with arithmetic coding, and write output file.
# contextlib.closing() calls close() on the bit output stream when the block exits.
with open(inputfile, "rb") as inp, \
        contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:
    write_frequencies(bitout, model)  # Header: the counts of symbols 0..255
    compress(model, inp, bitout)


In [3]:
def read_frequencies(bitin):
    """Read 256 counts from `bitin` and build a 257-entry frequency table.

    Each count is assembled from 32 bits returned by bitin.read_no_eof(), the
    first bit read being the most significant. A count of 1 is appended for
    symbol 256 (the EOF symbol) before the list is wrapped in a
    fqt.SimpleFrequencyTable.
    """
    def read_int(n):
        # Shift previously read bits left and place the new bit at the bottom.
        value = 0
        remaining = n
        while remaining > 0:
            value = (value << 1) | bitin.read_no_eof()
            remaining -= 1
        return value

    counts = []
    for _ in range(256):
        counts.append(read_int(32))
    counts.append(1)  # EOF symbol
    return fqt.SimpleFrequencyTable(counts)


def decompress(model, bitin, out):
    """Decode symbols from `bitin` with a fixed model until symbol 256 appears.

    An arith.ArithmeticCoder(32) is started on `bitin`. Each symbol returned by
    loadRegion_binary(model) is written to `out` as a single byte; decoding
    stops, without writing anything, when the symbol is 256 (EOF).
    """
    coder = arith.ArithmeticCoder(32)
    coder.start_decode(bitin)
    decoded = coder.loadRegion_binary(model)
    while decoded != 256:
        out.write(bytes([decoded]))
        decoded = coder.loadRegion_binary(model)
        
# Forward slashes avoid the invalid "\g" escape sequence in the path literals
# and are accepted as separators on Windows as well as on POSIX systems.
inputfile, outputfile = 'testdata/gattaca_compressed.txt', 'testdata/gattaca_decompressed.txt'

# Perform file decompression.
# The input is opened first so that a missing input file raises before the
# output file is created or truncated.
with open(inputfile, "rb") as inp, open(outputfile, "wb") as out:
    bitin = arith.BitInputStream(inp)
    model = read_frequencies(bitin)  # Header: the counts of symbols 0..255
    decompress(model, bitin, out)

import filecmp
# shallow=False forces a comparison of the file contents; the default shallow
# mode may accept two files on matching os.stat() signatures alone.
filecmp.cmp('testdata/gattaca.txt', 'testdata/gattaca_decompressed.txt', shallow=False)

True

## Let's try a more adaptive model (updated as we go)

In [4]:
def compress(inp, bitout):
    """Encode `inp` with a frequency model that is updated after every byte.

    The model is a fqt.SimpleFrequencyTable initialised from
    fqt.FlatFrequencyTable(257). Every byte read from `inp` is stored through
    the arithmetic coder using the model's current total/low/high values and
    is then counted with model.increment(). Symbol 256 is stored last as the
    end marker, after which the encoder is finished.
    """
    table = fqt.SimpleFrequencyTable(fqt.FlatFrequencyTable(257))
    coder = arith.ArithmeticCoder(32)
    coder.start_encode(bitout)

    def emit(sym):
        # Region lookup uses the table as it stands *before* this symbol is counted.
        total = table.get_total()
        low = table.get_low(sym)
        high = table.get_high(sym)
        coder.storeRegion(low, high, total)

    while True:
        chunk = inp.read(1)
        if not chunk:
            break
        byte = chunk[0]
        emit(byte)
        table.increment(byte)

    emit(256)
    coder.finish_encode()
# Forward slashes avoid the invalid "\g" escape sequence in the path literals
# and are accepted as separators on Windows as well as on POSIX systems.
inputfile, outputfile = 'testdata/gattaca.txt', 'testdata/gattaca_ad_compressed.txt'

# Perform file compression.
# contextlib.closing() calls close() on the bit output stream when the block exits.
with open(inputfile, "rb") as inp, \
        contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:
    compress(inp, bitout)



In [5]:
def decompress(bitin, out):
    """Decode `bitin` with a frequency model that is updated after every byte.

    The model starts as a fqt.SimpleFrequencyTable initialised from
    fqt.FlatFrequencyTable(257). Each decoded symbol below 256 is written to
    `out` as one byte and then counted with model.increment(); decoding stops
    when symbol 256 (EOF) is returned.
    """
    table = fqt.SimpleFrequencyTable(fqt.FlatFrequencyTable(257))
    coder = arith.ArithmeticCoder(32)
    coder.start_decode(bitin)

    decoded = coder.loadRegion_binary(table)
    while decoded != 256:
        out.write(bytes([decoded]))
        # Count the symbol only after it has been decoded with the previous table state.
        table.increment(decoded)
        decoded = coder.loadRegion_binary(table)
        
# Forward slashes avoid the invalid "\g" escape sequence in the path literals
# and are accepted as separators on Windows as well as on POSIX systems.
inputfile, outputfile = 'testdata/gattaca_ad_compressed.txt', 'testdata/gattaca_ad_decompressed.txt'

# Perform file decompression
with open(inputfile, "rb") as inp, open(outputfile, "wb") as out:
    bitin = arith.BitInputStream(inp)
    decompress(bitin, out)


# shallow=False forces a comparison of the file contents; the default shallow
# mode may accept two files on matching os.stat() signatures alone.
filecmp.cmp('testdata/gattaca.txt', 'testdata/gattaca_ad_decompressed.txt', shallow=False)

True

## Let's try the PPM model

In [17]:
import ppm
def compress(inp, bitout):
    """Encode `inp` with a PPM model of order MODEL_ORDER, then the EOF symbol.

    Each byte is encoded by encode_symbol() against the current history and
    then registered with model.increment_contexts(). When the model order is
    at least 1 the byte is pushed onto the front of `history`, which is capped
    at model.model_order entries by discarding the last (oldest) one. Symbol
    256 is encoded at the end and the encoder is finished.
    """
    coder = arith.ArithmeticCoder(32)
    coder.start_encode(bitout)
    model = ppm.PpmModel(MODEL_ORDER, 257, 256)
    history = []

    while True:
        chunk = inp.read(1)
        if not chunk:
            break
        byte = chunk[0]
        encode_symbol(model, history, byte, coder)
        model.increment_contexts(history, byte)

        if model.model_order < 1:
            continue  # No history is kept for orders below 1
        if len(history) == model.model_order:
            del history[-1]  # Drop the oldest symbol to make room
        history.insert(0, byte)

    encode_symbol(model, history, 256, coder)  # EOF
    coder.finish_encode()
def encode_symbol(model, history, symbol, enc):
    """Encode one symbol, escaping from higher-order contexts to lower ones.

    For each order from len(history) down to 0, the context reached by
    following history[:order] from model.root_context is looked up. Orders
    whose context does not exist are skipped. In the first existing context
    where `symbol` (other than 256) has a non-zero frequency, the symbol's
    region is stored and the function returns. Every other existing context
    gets the region of symbol 256 (the escape) stored instead. If no context
    coded the symbol, it is stored using model.order_minus1_freqs.

    Args:
        model: object exposing root_context and order_minus1_freqs.
        history: sequence of previously coded symbols used as context keys.
        symbol: symbol to encode; 256 is only ever coded at order -1.
        enc: encoder exposing storeRegion(low, high, total).
    """
    def store(freqs, sym):
        total = freqs.get_total()
        low = freqs.get_low(sym)
        high = freqs.get_high(sym)
        enc.storeRegion(low, high, total)

    for order in reversed(range(len(history) + 1)):
        ctx = model.root_context
        for sym in history[:order]:
            if ctx is None:
                break
            assert ctx.subcontexts is not None
            ctx = ctx.subcontexts[sym]
        if ctx is None:
            # No context at this order. This also covers a missing root context
            # (presumably the case when the model order is -1 -- confirm against
            # ppm.PpmModel), which previously raised AttributeError.
            continue
        if symbol != 256 and ctx.frequencies.get(symbol) > 0:
            store(ctx.frequencies, symbol)
            return
        # Else write context escape symbol and continue decrementing the order
        store(ctx.frequencies, 256)

    # Logic for order = -1
    store(model.order_minus1_freqs, symbol)
    
# Must be at least -1 and match the MODEL_ORDER used by the decompression cell.
# Warning: Exponential memory usage at O(257^n).
MODEL_ORDER = 2
# Forward slashes avoid the invalid "\g" escape sequence in the path literals
# and are accepted as separators on Windows as well as on POSIX systems.
inputfile, outputfile = 'testdata/gattaca.txt', 'testdata/gattaca_ppm_compressed.txt'
# Perform file compression.
# contextlib.closing() calls close() on the bit output stream when the block exits.
with open(inputfile, "rb") as inp, \
        contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:
    compress(inp, bitout)

In [19]:
def decompress(bitin, out):
    """Decode `bitin` with a PPM model of order MODEL_ORDER until EOF (256).

    Each symbol returned by decode_symbol() is written to `out` as one byte and
    registered with model.increment_contexts(). When the model order is at
    least 1 the symbol is pushed onto the front of `history`, which is capped
    at model.model_order entries by discarding the last (oldest) one.
    """
    coder = arith.ArithmeticCoder(32)
    coder.start_decode(bitin)
    model = ppm.PpmModel(MODEL_ORDER, 257, 256)
    history = []

    decoded = decode_symbol(coder, model, history)
    while decoded != 256:
        out.write(bytes([decoded]))
        model.increment_contexts(history, decoded)

        if model.model_order >= 1:
            if len(history) == model.model_order:
                del history[-1]  # Drop the oldest symbol to make room
            history.insert(0, decoded)

        decoded = decode_symbol(coder, model, history)
def decode_symbol(dec, model, history):
    """Decode one symbol, following escapes from higher-order contexts downwards.

    For each order from len(history) down to 0, the context reached by
    following history[:order] from model.root_context is looked up. Orders
    whose context does not exist are skipped. A symbol is decoded from each
    existing context's frequencies; a value below 256 is returned at once,
    while 256 (the escape) moves on to the next lower order. If every context
    escaped or was missing, the symbol is decoded from
    model.order_minus1_freqs and returned as is.

    Args:
        dec: decoder exposing loadRegion_binary(freqs).
        model: object exposing root_context and order_minus1_freqs.
        history: sequence of previously decoded symbols used as context keys.
    """
    for order in reversed(range(len(history) + 1)):
        ctx = model.root_context
        for sym in history[:order]:
            if ctx is None:
                break
            assert ctx.subcontexts is not None
            ctx = ctx.subcontexts[sym]
        if ctx is None:
            # No context at this order. This also covers a missing root context
            # (presumably the case when the model order is -1 -- confirm against
            # ppm.PpmModel), which previously raised AttributeError.
            continue
        symbol = dec.loadRegion_binary(ctx.frequencies)
        if symbol < 256:
            return symbol
        # Else we read the context escape symbol, so continue decrementing the order

    # Logic for order = -1
    return dec.loadRegion_binary(model.order_minus1_freqs)
# Must be at least -1 and match the MODEL_ORDER used by the compression cell.
# Warning: Exponential memory usage at O(257^n).
MODEL_ORDER = 2
# Forward slashes avoid the invalid "\g" escape sequence in the path literals
# and are accepted as separators on Windows as well as on POSIX systems.
inputfile, outputfile = 'testdata/gattaca_ppm_compressed.txt', 'testdata/gattaca_ppm_decompressed.txt'

# Perform file decompression
with open(inputfile, "rb") as inp, open(outputfile, "wb") as out:
    bitin = arith.BitInputStream(inp)
    decompress(bitin, out)


# shallow=False forces a comparison of the file contents; the default shallow
# mode may accept two files on matching os.stat() signatures alone.
filecmp.cmp('testdata/gattaca.txt', 'testdata/gattaca_ppm_decompressed.txt', shallow=False)

True