In [1]:
from collections import deque
import struct
import itertools
from bitarray import bitarray
import os
import pandas as pd
from time import time

# Utils

In [2]:
class SliceableDeque(deque):
    def __getitem__(self, index):
        if isinstance(index, slice):
            return type(self)(itertools.islice(self, index.start, index.stop, index.step))
        return deque.__getitem__(self, index)

In [3]:
def compare_content(path1, path2):
    with open(path1, 'rb') as f1:
        with open(path2, 'rb') as f2:
            content1 = f1.read()
            content2 = f2.read()
            
            return content1 == content2

In [4]:
def get_file_size(path):
    return os.path.getsize(path)

In [5]:
root_path = './data/lesson01/'
extension = '.txt'
labels = ['czech', 'german', 'english', 'french', 'hungarian']
dataset_paths = [root_path + label + extension for label in labels]

# Encoder

In [6]:
class LZ77Encoder(object):
    def __init__(self, search_buffer_size, lookahead_buffer_size, write_buffer_size=1024):
        self.search_buffer_size = search_buffer_size
        self.lookahead_buffer_size = lookahead_buffer_size
        
        window_size = search_buffer_size + lookahead_buffer_size
        self.window = deque([b'']*window_size, window_size)
        
        self.offset_bit_size = int.bit_length(search_buffer_size-1)
        self.size_bit_size = int.bit_length(lookahead_buffer_size-1)

        if write_buffer_size % 8 != 0:
            raise Exception('write_buffer_size should be multiple of 8')
        
        self.write_buffer = bitarray()
        self.write_buffer_size = write_buffer_size
        
        self.triplets_count = 0
        
    def encode(self, file_path, output_file_path):
        self.file = open(file_path, 'rb')
        self.output_file = open(output_file_path, 'wb')
        self._move_window(self.lookahead_buffer_size)
        
        triplets = []
        self.triplets_count = 0
        while self.window[self.search_buffer_size] != b'':
            current_char = self.window[self.search_buffer_size]
            best_triplet = (0, 0, current_char) # (offset, size, character)
            
            for offset in range(1, self.search_buffer_size):
                i = self.search_buffer_size - offset # actual offset position
                if self.window[i] == b'':
                    break
                
                for size in range(self.lookahead_buffer_size):
                    if self.window[i+size] != self.window[self.search_buffer_size+size]:
                        break
                        
                if size > best_triplet[1]:
                    best_triplet = (offset, size, self.window[self.search_buffer_size+size])

            triplets.append(best_triplet)
            self._write_triplet(best_triplet)
            self._move_window(best_triplet[1]+1)
            self.triplets_count += 1
        
        self.write_buffer.tofile(self.output_file)
        
        self.file.close()
        self.output_file.close()
        return triplets
        
    def _move_window(self, size):
        chunk = self.file.read(size)
        for byte in struct.unpack(str(len(chunk)) + 'c', chunk): # iterate over chunk withou conversion to int
            self.window.append(byte)
            
        # append empty bytes when EOF
        for _ in range(size - len(chunk)):
            self.window.append(b'')
            
    def _write_triplet(self, triplet):
        offset, size, char = triplet
        
        offset_bin = bin(offset)[2:].zfill(self.offset_bit_size)
        size_bin = bin(size)[2:].zfill(self.size_bit_size)
        
        self.write_buffer.extend(offset_bin)
        self.write_buffer.extend(size_bin)
        self.write_buffer.frombytes(char)
        
        if len(self.write_buffer) > self.write_buffer_size:
            self.write_buffer[:self.write_buffer_size].tofile(self.output_file)
            self.write_buffer = self.write_buffer[self.write_buffer_size:]
            
encoder = LZ77Encoder(4096, 16)
%time triplets = encoder.encode(dataset_paths[2], './output/lz77.bin')

Wall time: 1min 36s


# Decoder

In [11]:
class LZ77Decoder(object):
    def __init__(self, search_buffer_size, offset_bit_size, size_bit_size, min_read_buffer_size=1024):
        self.search_buffer_size = search_buffer_size
        self.window = SliceableDeque([b'']*search_buffer_size, search_buffer_size)

        self.offset_bit_size = offset_bit_size
        self.size_bit_size = size_bit_size
        
        if min_read_buffer_size % 8 != 0:
            raise Exception('min_read_buffer_size should be multiple of 8')
        
        self.min_read_buffer_size = min_read_buffer_size
        self.read_buffer = bitarray()

#         self.decoded = b''

    def decode(self, input_file_path, output_file_path):
        self.input_file = open(input_file_path, 'rb')
        self.output_file = open(output_file_path, 'wb')
        
        for offset, size, char in self._iter_triplets():
            i = self.search_buffer_size - offset # actual offset position

            for _ in range(size):
                self._append_char(self.window[i])

            self._append_char(char)

        # write remaining characters
        for char in self.window:
            if char == b'' or char == b'\x00':
                break
            
#             self.decoded += char
            self.output_file.write(char)

            
        self.input_file.close()
        self.output_file.close()
        
#         return self.decoded

    def _append_char(self, char):
        if self.window[0] != b'\x00':
#             self.decoded += self.window[0]
            self.output_file.write(self.window[0])

        self.window.append(char)
        
    def _iter_triplets(self):
        triplets_count = 0
#         self._load_to_buffer()
        self.read_buffer.fromfile(self.input_file)
        
        while len(self.read_buffer) > 0:
            offset_bin = self.read_buffer[:self.offset_bit_size]
            size_bin = self.read_buffer[self.offset_bit_size:self.offset_bit_size+self.size_bit_size]
            char_bin = self.read_buffer[self.offset_bit_size+self.size_bit_size:self.offset_bit_size+self.size_bit_size+8]
            self.read_buffer = self.read_buffer[self.offset_bit_size+self.size_bit_size+8:]
            
            offset = int(offset_bin.to01(), 2)
            size = int(size_bin.to01(), 2)
            char = char_bin.tobytes()
            
            yield (offset, size, char)
            triplets_count += 1
            
#             self._load_to_buffer()
            
        print('Triplets read:', triplets_count)
            
    def _load_to_buffer(self):
        try:
            self.read_buffer.fromfile(self.input_file, self.min_read_buffer_size // 8)
        except EOFError:
            self.read_buffer.fromfile(self.input_file)

decoder = LZ77Decoder(encoder.search_buffer_size, encoder.offset_bit_size, encoder.size_bit_size)
%time decoded = decoder.decode('./output/lz77.bin', './output/lz77.txt')

Triplets read: 28546
Wall time: 1min 5s


# Results

In [12]:
window_sizes = [
    (4096, 16),
    (16384, 32),
    (32768, 64)
]

In [13]:
df = pd.DataFrame(columns=['File', 'Triplets', 'FileSize', 'SearchSize', 'LookaheadSize', 'EncodedSize', 'OverallTime'])

for dataset_label in labels:
    dataset_path = root_path + dataset_label + extension

    for search_size, lookahead_size in window_sizes:
        t0 = time()
        encoder = LZ77Encoder(search_size, lookahead_size)
        encoder.encode(dataset_path, './output/lz77.bin')
        
        decoder = LZ77Decoder(encoder.search_buffer_size, encoder.offset_bit_size, encoder.size_bit_size)
        decoded = decoder.decode('./output/lz77.bin', './output/lz77.txt')
        
        if not compare_content(dataset_path, './output/lz77.txt'):
            print(dataset_label, 'unpacked file has different content')
            
        df = df.append({
            'File': dataset_label,
            'Triplets': encoder.triplets_count,
            'FileSize': get_file_size(dataset_path),
            'SearchSize': search_size,
            'LookaheadSize': lookahead_size,
            'EncodedSize': get_file_size('./output/lz77.bin'),
            'OverallTime': int(time() - t0)
        }, ignore_index=True)
        
        print(dataset_label, int(time() - t0))

Triplets read: 28735
czech 150
Triplets read: 23979
czech 381
Triplets read: 22411
czech 785
Triplets read: 44041
german 293
Triplets read: 36625
german 636
Triplets read: 34117
german 1355
Triplets read: 28546
english 156
Triplets read: 23701
english 394
Triplets read: 21646
english 784
Triplets read: 28202
french 154
Triplets read: 23010
french 376
Triplets read: 21429
french 737
Triplets read: 44711
hungarian 302
Triplets read: 37368
hungarian 672
Triplets read: 34750
hungarian 1383


In [14]:
df

Unnamed: 0,File,Triplets,FileSize,SearchSize,LookaheadSize,EncodedSize,OverallTime
0,czech,28735,150849,4096,16,86204,150
1,czech,23979,150849,16384,32,80929,381
2,czech,22411,150849,32768,64,81239,785
3,german,44041,232542,4096,16,132122,293
4,german,36625,232542,16384,32,123609,636
5,german,34117,232542,32768,64,123674,1355
6,english,28546,150266,4096,16,85637,156
7,english,23701,150266,16384,32,79990,394
8,english,21646,150266,32768,64,78466,784
9,french,28202,150767,4096,16,84605,154
