In [79]:
from os import listdir, makedirs, stat
from os.path import isfile, join, exists
import subprocess

In [68]:
# Names of the regular files located directly in "dataset" (sub-directories are skipped),
# in whatever order listdir() reports them.
test_file_names = list(filter(lambda name: isfile(join("dataset", name)), listdir("dataset")))

In [69]:
# Display the collected dataset file names as the cell output.
test_file_names

['obj1',
 'obj2',
 'geo',
 'trans',
 'progp',
 'progl',
 'book2',
 'paper1',
 'news',
 'progc',
 'pic',
 'bib',
 'paper2',
 'book1']

In [4]:
def test_encode(executable: str, file_in: str, file_out: str, additional_params: str):
    subprocess.run(executable + " -i {} -o {} {}".format(file_in, file_out, additional_params),
                   shell=True, check=True)

In [75]:
def do_tests(executable: str, inDir: str, filenames: list, outDir: str,
             outPostfix: str, numThreads: int, additionalParams = None):
    """Run ``executable`` once per file, at most ``numThreads`` processes at a time.

    The files are handled in consecutive batches of ``numThreads``. For every
    file the command ``executable -i inDir/name -o outDir/name+outPostfix
    [additionalParams...]`` is started; the next batch begins only after all
    processes of the current batch have finished.

    Parameters:
        executable: path of the program to launch.
        inDir: directory prefix of the input files.
        filenames: names of the files to process.
        outDir: directory for the results; created if it does not exist.
        outPostfix: text appended to each file name to form the output name.
        numThreads: maximum number of simultaneously running processes (>= 1).
        additionalParams: extra command-line arguments appended to every
            command (sequence of strings); ``None`` means no extra arguments.

    Raises:
        ValueError: ``numThreads`` is smaller than 1.

    A process that exits with a non-zero code does not stop the run; a
    warning line is printed for it instead.
    """
    if numThreads < 1:
        raise ValueError("numThreads must be at least 1, got {}".format(numThreads))
    # Copy so that neither a shared default nor the caller's list is ever aliased.
    extraArgs = list(additionalParams) if additionalParams is not None else []
    # exist_ok avoids the check-then-create race of a separate exists() test.
    makedirs(outDir, exist_ok=True)

    for lowI in range(0, len(filenames), numThreads):
        batch = filenames[lowI:lowI + numThreads]
        print("Process files: ", *batch)
        # Fresh list per batch: only the processes started here are waited on.
        procs = []
        try:
            for filename in batch:
                inFile = inDir + "/" + filename
                outFile = outDir + "/" + filename + outPostfix
                print("In file: {}. \t Out file: {}".format(inFile, outFile))
                procs.append((inFile, subprocess.Popen(
                    [executable, "-i", inFile, "-o", outFile] + extraArgs)))
            for _, p in procs:
                p.wait()
        except BaseException:
            # Interrupted (e.g. KeyboardInterrupt) or a launch failed: do not
            # leave already started children running in the background.
            for _, p in procs:
                if p.poll() is None:
                    p.kill()
                    p.wait()
            raise
        for inFile, p in procs:
            if p.returncode != 0:
                print("WARNING: processing {} finished with exit code {}".format(
                    inFile, p.returncode))

In [6]:
# Encode every dataset file with the "d" archiver using "-b 8 -l off",
# running at most six encoder processes at a time.
do_tests(
    executable="./exe/archiever_d_encoder",
    inDir="dataset",
    filenames=test_file_names,
    outDir="out/dataset-d-out",
    outPostfix="-encoded",
    numThreads=6,
    additionalParams=["-b", "8", "-l", "off"],
)

Process files:  paper5 obj1 obj2 geo trans progp
In file: dataset/paper5. 	 Out file: out/dataset-d-out/paper5-encoded
In file: dataset/obj1. 	 Out file: out/dataset-d-out/obj1-encoded
In file: dataset/obj2. 	 Out file: out/dataset-d-out/obj2-encoded
In file: dataset/geo. 	 Out file: out/dataset-d-out/geo-encoded
In file: dataset/trans. 	 Out file: out/dataset-d-out/trans-encoded
In file: dataset/progp. 	 Out file: out/dataset-d-out/progp-encoded
Process files:  progl book2 paper1 news progc paper4
In file: dataset/progl. 	 Out file: out/dataset-d-out/progl-encoded
In file: dataset/book2. 	 Out file: out/dataset-d-out/book2-encoded
In file: dataset/paper1. 	 Out file: out/dataset-d-out/paper1-encoded
In file: dataset/news. 	 Out file: out/dataset-d-out/news-encoded
In file: dataset/progc. 	 Out file: out/dataset-d-out/progc-encoded
In file: dataset/paper4. 	 Out file: out/dataset-d-out/paper4-encoded
Process files:  pic bib paper2 book1 paper3 paper6
In file: dataset/pic. 	 Out file: o

In [10]:
# Encode the dataset once per "-b" value in the range (currently only 29),
# writing each run into its own "out/dataset-d-out-1502-<bits>" directory.
for bits in range(29, 30):
    strBits = str(bits)
    do_tests("./exe/archiever_d_encoder", "dataset", test_file_names,
             "out/dataset-d-out-1502-" + strBits, "-encoded", 18,
             ["-b", strBits, "-l", "off"])

Process files:  paper5 obj1 obj2 geo trans progp progl book2 paper1 news progc paper4 pic bib paper2 book1 paper3 paper6
In file: dataset/paper5. 	 Out file: out/dataset-d-out-1502-29/paper5-encoded
In file: dataset/obj1. 	 Out file: out/dataset-d-out-1502-29/obj1-encoded
In file: dataset/obj2. 	 Out file: out/dataset-d-out-1502-29/obj2-encoded
In file: dataset/geo. 	 Out file: out/dataset-d-out-1502-29/geo-encoded
In file: dataset/trans. 	 Out file: out/dataset-d-out-1502-29/trans-encoded
In file: dataset/progp. 	 Out file: out/dataset-d-out-1502-29/progp-encoded
In file: dataset/progl. 	 Out file: out/dataset-d-out-1502-29/progl-encoded
In file: dataset/book2. 	 Out file: out/dataset-d-out-1502-29/book2-encoded
In file: dataset/paper1. 	 Out file: out/dataset-d-out-1502-29/paper1-encoded
In file: dataset/news. 	 Out file: out/dataset-d-out-1502-29/news-encoded
In file: dataset/progc. 	 Out file: out/dataset-d-out-1502-29/progc-encoded
In file: dataset/paper4. 	 Out file: out/dataset-

KeyboardInterrupt: 

In [9]:
# Decode the contents of "out/dataset-d-out-8" ... "out/dataset-d-out-13".
# The input names are the dataset names with the "-encoded" postfix; the list
# does not depend on the loop variable, so it is built once up front.
encodedFileNames = [filename + "-encoded" for filename in test_file_names]
for bits in range(8, 14):
    strBits = str(bits)
    do_tests("./exe/archiever_d_decoder", "out/dataset-d-out-" + strBits,
             encodedFileNames, "out/dataset-d-decoded-" + strBits,
             "-decoded", 18, ["-l", "off"])

Process files:  paper5-encoded obj1-encoded obj2-encoded geo-encoded trans-encoded progp-encoded progl-encoded book2-encoded paper1-encoded news-encoded progc-encoded paper4-encoded pic-encoded bib-encoded paper2-encoded book1-encoded paper3-encoded paper6-encoded
In file: out/dataset-d-out-8/paper5-encoded. 	 Out file: out/dataset-d-decoded-8/paper5-encoded-decoded
In file: out/dataset-d-out-8/obj1-encoded. 	 Out file: out/dataset-d-decoded-8/obj1-encoded-decoded
In file: out/dataset-d-out-8/obj2-encoded. 	 Out file: out/dataset-d-decoded-8/obj2-encoded-decoded
In file: out/dataset-d-out-8/geo-encoded. 	 Out file: out/dataset-d-decoded-8/geo-encoded-decoded
In file: out/dataset-d-out-8/trans-encoded. 	 Out file: out/dataset-d-decoded-8/trans-encoded-decoded
In file: out/dataset-d-out-8/progp-encoded. 	 Out file: out/dataset-d-decoded-8/progp-encoded-decoded
In file: out/dataset-d-out-8/progl-encoded. 	 Out file: out/dataset-d-decoded-8/progl-encoded-decoded
In file: out/dataset-d-out-

Process files:  paper5-encoded obj1-encoded obj2-encoded geo-encoded trans-encoded progp-encoded progl-encoded book2-encoded paper1-encoded news-encoded progc-encoded paper4-encoded pic-encoded bib-encoded paper2-encoded book1-encoded paper3-encoded paper6-encoded
In file: out/dataset-d-out-12/paper5-encoded. 	 Out file: out/dataset-d-decoded-12/paper5-encoded-decoded
In file: out/dataset-d-out-12/obj1-encoded. 	 Out file: out/dataset-d-decoded-12/obj1-encoded-decoded
In file: out/dataset-d-out-12/obj2-encoded. 	 Out file: out/dataset-d-decoded-12/obj2-encoded-decoded
In file: out/dataset-d-out-12/geo-encoded. 	 Out file: out/dataset-d-decoded-12/geo-encoded-decoded
In file: out/dataset-d-out-12/trans-encoded. 	 Out file: out/dataset-d-decoded-12/trans-encoded-decoded
In file: out/dataset-d-out-12/progp-encoded. 	 Out file: out/dataset-d-decoded-12/progp-encoded-decoded
In file: out/dataset-d-out-12/progl-encoded. 	 Out file: out/dataset-d-decoded-12/progl-encoded-decoded
In file: out/

In [20]:
# Encode the dataset with ppmd_encoder (flags: -b 8 -l off -c 2 -q 8),
# four processes at a time.
do_tests(
    executable="./exe/ppmd_encoder",
    inDir="dataset",
    filenames=test_file_names,
    outDir="out/dataset-14-ppmd-improved-out-8-2-8",
    outPostfix="-encoded",
    numThreads=4,
    additionalParams=["-b", "8", "-l", "off", "-c", "2", "-q", "8"],
)

Process files:  obj1 obj2 geo trans
In file: dataset/obj1. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/obj1-encoded
In file: dataset/obj2. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/obj2-encoded
In file: dataset/geo. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/geo-encoded
In file: dataset/trans. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/trans-encoded
Process files:  progp progl book2 paper1
In file: dataset/progp. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/progp-encoded
In file: dataset/progl. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/progl-encoded
In file: dataset/book2. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/book2-encoded
In file: dataset/paper1. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/paper1-encoded
Process files:  news progc pic bib
In file: dataset/news. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/news-encoded
In file: dataset/progc. 	 Out file: out/dataset-14-ppmd-improved-out-8-2-8/progc-encoded
In fil

KeyboardInterrupt: 

In [15]:
# Encode the dataset with ppma_encoder (flags: -b 16 -l off -c 4 -q 2),
# nine processes at a time.
do_tests(
    executable="./exe/ppma_encoder",
    inDir="dataset",
    filenames=test_file_names,
    outDir="out/dataset-ppma-out-16-4-2",
    outPostfix="-encoded",
    numThreads=9,
    additionalParams=["-b", "16", "-l", "off", "-c", "4", "-q", "2"],
)

Process files:  paper5 obj1 obj2 geo trans progp progl book2 paper1
In file: dataset/paper5. 	 Out file: out/dataset-ppma-out-16-4-2/paper5-encoded
In file: dataset/obj1. 	 Out file: out/dataset-ppma-out-16-4-2/obj1-encoded
In file: dataset/obj2. 	 Out file: out/dataset-ppma-out-16-4-2/obj2-encoded
In file: dataset/geo. 	 Out file: out/dataset-ppma-out-16-4-2/geo-encoded
In file: dataset/trans. 	 Out file: out/dataset-ppma-out-16-4-2/trans-encoded
In file: dataset/progp. 	 Out file: out/dataset-ppma-out-16-4-2/progp-encoded
In file: dataset/progl. 	 Out file: out/dataset-ppma-out-16-4-2/progl-encoded
In file: dataset/book2. 	 Out file: out/dataset-ppma-out-16-4-2/book2-encoded
In file: dataset/paper1. 	 Out file: out/dataset-ppma-out-16-4-2/paper1-encoded
Process files:  news progc paper4 pic bib paper2 book1 paper3 paper6
In file: dataset/news. 	 Out file: out/dataset-ppma-out-16-4-2/news-encoded
In file: dataset/progc. 	 Out file: out/dataset-ppma-out-16-4-2/progc-encoded
In file: da

In [12]:
# Inputs of the decoder are the dataset names with the "-encoded" postfix.
test_file_names_decoded = [name + "-encoded" for name in test_file_names]
# Decode "out/dataset-ppma-out-8-6-4" with ppma_decoder, nine processes at a time.
do_tests(
    executable="./exe/ppma_decoder",
    inDir="out/dataset-ppma-out-8-6-4",
    filenames=test_file_names_decoded,
    outDir="out/dataset-ppma-decoded-8-6-4",
    outPostfix="-decoded",
    numThreads=9,
    additionalParams=["-l", "off"],
)

Process files:  paper5-encoded obj1-encoded obj2-encoded geo-encoded trans-encoded progp-encoded progl-encoded book2-encoded paper1-encoded
In file: out/dataset-ppma-out-8-6-4/paper5-encoded. 	 Out file: out/dataset-ppma-decoded-8-6-4/paper5-encoded-decoded
In file: out/dataset-ppma-out-8-6-4/obj1-encoded. 	 Out file: out/dataset-ppma-decoded-8-6-4/obj1-encoded-decoded
In file: out/dataset-ppma-out-8-6-4/obj2-encoded. 	 Out file: out/dataset-ppma-decoded-8-6-4/obj2-encoded-decoded
In file: out/dataset-ppma-out-8-6-4/geo-encoded. 	 Out file: out/dataset-ppma-decoded-8-6-4/geo-encoded-decoded
In file: out/dataset-ppma-out-8-6-4/trans-encoded. 	 Out file: out/dataset-ppma-decoded-8-6-4/trans-encoded-decoded
In file: out/dataset-ppma-out-8-6-4/progp-encoded. 	 Out file: out/dataset-ppma-decoded-8-6-4/progp-encoded-decoded
In file: out/dataset-ppma-out-8-6-4/progl-encoded. 	 Out file: out/dataset-ppma-decoded-8-6-4/progl-encoded-decoded
In file: out/dataset-ppma-out-8-6-4/book2-encoded. 	 O

In [9]:
# Encode the dataset with ppmd_plus_encoder (flags: -b 8 -l off -c 8 -q 4),
# seven processes at a time.
do_tests(
    executable="./exe/ppmd_plus_encoder",
    inDir="dataset",
    filenames=test_file_names,
    outDir="out/dataset-ppmd-plus11-out-8-8-4",
    outPostfix="-encoded",
    numThreads=7,
    additionalParams=["-b", "8", "-l", "off", "-c", "8", "-q", "4"],
)

Process files:  obj1 obj2 geo trans progp progl book2
In file: dataset/obj1. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/obj1-encoded
In file: dataset/obj2. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/obj2-encoded
In file: dataset/geo. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/geo-encoded
In file: dataset/trans. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/trans-encoded
In file: dataset/progp. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/progp-encoded
In file: dataset/progl. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/progl-encoded
In file: dataset/book2. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/book2-encoded
Process files:  paper1 news progc pic bib paper2 book1
In file: dataset/paper1. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/paper1-encoded
In file: dataset/news. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/news-encoded
In file: dataset/progc. 	 Out file: out/dataset-ppmd-plus11-out-8-8-4/progc-encoded
In file: dataset/pic. 	 Out file: out/dataset-ppmd-plus11-o

In [6]:
from collections import Counter
from collections import defaultdict
import math

In [13]:
def estimate_seq_H(seq: list):
    """Estimate the entropy (bits per symbol) of a sequence.

    Symbol probabilities are taken as relative frequencies in ``seq`` and the
    result is ``-sum(p * log2(p))`` over the distinct symbols.

    Parameters:
        seq: iterable of hashable symbols.

    Returns:
        The entropy as a non-negative float; ``0.0`` for an empty sequence or
        a sequence consisting of one repeated symbol.
    """
    cnt = Counter(seq)
    # sum() instead of Counter.total() so the function also runs on Python < 3.10.
    total = sum(cnt.values())
    if total == 0:
        return 0.0
    entropy = -sum(c / total * math.log2(c / total) for c in cnt.values())
    # A single distinct symbol gives -(0.0) == -0.0; adding 0.0 turns that
    # into a plain 0.0 and leaves every other value unchanged.
    return entropy + 0.0

In [14]:
def estimate_H(filename: str, bytesSize: int):
    """Entropy estimate of a file read as consecutive ``bytesSize``-byte words.

    The file is read in binary mode, ``bytesSize`` bytes per read call; each
    chunk (a final chunk may be shorter) is converted to a big-endian integer
    and the resulting symbol list is handed to ``estimate_seq_H``.
    """
    with open(filename, 'rb') as file:
        # iter(callable, sentinel) keeps reading until read() returns b''.
        chunks = iter(lambda: file.read(bytesSize), b'')
        symbols = [int.from_bytes(chunk, 'big') for chunk in chunks]
        return estimate_seq_H(symbols)

In [15]:
# Entropy estimate of book1 when it is read as 2-byte words.
print(estimate_H(filename="dataset/book1", bytesSize=2))

8.109799149888886


In [70]:
# One tab-separated row per file: name, then the entropy estimates for
# 1-, 2- and 3-byte words, each rounded to three decimals.
for filename in sorted(test_file_names):
    path = "dataset/" + filename
    entropies = [round(estimate_H(path, wordSize), 3) for wordSize in (1, 2, 3)]
    print("{}\t{}\t{}\t{}".format(filename, *entropies))

bib	5.201	8.552	10.75
book1	4.527	8.11	10.897
book2	4.793	8.534	11.224
geo	5.646	9.174	12.806
news	5.19	9.268	12.085
obj1	5.948	9.121	10.126
obj2	6.26	8.906	12.153
paper1	4.983	8.61	10.749
paper2	4.601	8.101	10.519
pic	1.21	2.023	2.639
progc	5.199	8.77	10.686
progl	4.77	7.969	9.922
progp	4.869	8.031	9.656
trans	5.533	8.873	10.695


In [55]:
def estimate_H_cond(filename: str, bytesSize: int, condLen: int):
    """Estimate the entropy of a symbol given the ``condLen`` symbols before it.

    The file is read as big-endian words of ``bytesSize`` bytes. Every position
    that has ``condLen`` predecessors forms one case (context tuple, symbol).
    The result is the sum over contexts of
    ``P(context) * H(symbol | context)`` with all probabilities taken as
    relative frequencies; it is 0 when the file yields no case at all.
    """
    with open(filename, 'rb') as file:
        symbols = [int.from_bytes(chunk, 'big')
                   for chunk in iter(lambda: file.read(bytesSize), b'')]

    numCases = len(symbols) - condLen
    # context tuple -> counts of the symbols that followed it; both levels keep
    # first-occurrence order, which fixes the order of the summation below.
    followers = defaultdict(Counter)
    for start in range(numCases):
        context = tuple(symbols[start:start + condLen])
        followers[context][symbols[start + condLen]] += 1

    entropy = 0
    for counter in followers.values():
        counts = list(counter.values())
        total = sum(counts)
        weight = total / numCases  # relative frequency of this context
        entropy -= weight * sum([c / total * math.log2(c / total) for c in counts])
    return entropy

In [65]:
# Conditional entropy estimate of progl: single bytes, conditioned on the two preceding bytes.
estimate_H_cond(filename="dataset/progl", bytesSize=1, condLen=2)

2.043554586666519

In [73]:
# One tab-separated row per file: name, then the byte-level conditional entropy
# estimates for context lengths 1 and 2, each rounded to three decimals.
for filename in sorted(test_file_names):
    path = "dataset/" + filename
    condEntropies = [round(estimate_H_cond(path, 1, contextLen), 3) for contextLen in (1, 2)]
    print("{}\t{}\t{}".format(filename, *condEntropies))

bib	3.364	2.308
book1	3.585	2.814
book2	3.745	2.736
geo	4.264	3.458
news	4.092	2.923
obj1	3.464	1.4
obj2	3.87	2.265
paper1	3.646	2.332
paper2	3.522	2.514
pic	0.824	0.705
progc	3.603	2.134
progl	3.212	2.044
progp	3.188	1.755
trans	3.355	1.93


In [77]:
# Encode the dataset with numerical_encoder ("-l off"), seven processes at a time.
do_tests(
    executable="exe/numerical_encoder",
    inDir="dataset",
    filenames=test_file_names,
    outDir="out/numerical_test_out",
    outPostfix="-numerical-encoded",
    numThreads=7,
    additionalParams=["-l", "off"],
)

Process files:  obj1 obj2 geo trans progp progl book2
In file: dataset/obj1. 	 Out file: out/numerical_test_out/obj1-numerical-encoded
In file: dataset/obj2. 	 Out file: out/numerical_test_out/obj2-numerical-encoded
In file: dataset/geo. 	 Out file: out/numerical_test_out/geo-numerical-encoded
In file: dataset/trans. 	 Out file: out/numerical_test_out/trans-numerical-encoded
In file: dataset/progp. 	 Out file: out/numerical_test_out/progp-numerical-encoded
In file: dataset/progl. 	 Out file: out/numerical_test_out/progl-numerical-encoded
In file: dataset/book2. 	 Out file: out/numerical_test_out/book2-numerical-encoded
Process files:  paper1 news progc pic bib paper2 book1
In file: dataset/paper1. 	 Out file: out/numerical_test_out/paper1-numerical-encoded
In file: dataset/news. 	 Out file: out/numerical_test_out/news-numerical-encoded
In file: dataset/progc. 	 Out file: out/numerical_test_out/progc-numerical-encoded
In file: dataset/pic. 	 Out file: out/numerical_test_out/pic-numerica

In [93]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per original byte, archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/numerical_test_out/" + filename + "-numerical-encoded").st_size
    row = (filename, origSize, round(archSize / origSize * 8, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 5.212 & 72482
book1 & 768771 & 4.529 & 435203
book2 & 610856 & 4.795 & 366132
geo & 102400 & 5.672 & 72601
news & 377109 & 5.194 & 244817
obj1 & 21504 & 6.057 & 16280
obj2 & 246814 & 6.272 & 193488
paper1 & 53161 & 5.007 & 33273
paper2 & 82199 & 4.617 & 47434
pic & 513216 & 1.213 & 77848
progc & 39611 & 5.231 & 25901
progl & 71646 & 4.787 & 42872
progp & 49379 & 4.894 & 30205
trans & 93695 & 5.548 & 64972


In [94]:
# Encode the dataset with archiever_a_encoder for "-b" 8, 16 and 24, seven
# processes at a time; each setting gets its own output directory and the
# output files keep the original names (empty postfix).
for symbolBits in ("8", "16", "24"):
    do_tests("exe/archiever_a_encoder", "dataset", test_file_names,
             "out/arithmetic_a_test_out_" + symbolBits, "",
             7, ["-l", "off", "-b", symbolBits])

Process files:  obj1 obj2 geo trans progp progl book2
In file: dataset/obj1. 	 Out file: out/arithmetic_a_test_out_8/obj1
In file: dataset/obj2. 	 Out file: out/arithmetic_a_test_out_8/obj2
In file: dataset/geo. 	 Out file: out/arithmetic_a_test_out_8/geo
In file: dataset/trans. 	 Out file: out/arithmetic_a_test_out_8/trans
In file: dataset/progp. 	 Out file: out/arithmetic_a_test_out_8/progp
In file: dataset/progl. 	 Out file: out/arithmetic_a_test_out_8/progl
In file: dataset/book2. 	 Out file: out/arithmetic_a_test_out_8/book2
Process files:  paper1 news progc pic bib paper2 book1
In file: dataset/paper1. 	 Out file: out/arithmetic_a_test_out_8/paper1
In file: dataset/news. 	 Out file: out/arithmetic_a_test_out_8/news
In file: dataset/progc. 	 Out file: out/arithmetic_a_test_out_8/progc
In file: dataset/pic. 	 Out file: out/arithmetic_a_test_out_8/pic
In file: dataset/bib. 	 Out file: out/arithmetic_a_test_out_8/bib
In file: dataset/paper2. 	 Out file: out/arithmetic_a_test_out_8/pa

In [96]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per original byte, archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/arithmetic_a_test_out_8/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 8, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 5.21 & 72460
book1 & 768771 & 4.529 & 435180
book2 & 610856 & 4.795 & 366110
geo & 102400 & 5.67 & 72579
news & 377109 & 5.193 & 244794
obj1 & 21504 & 6.048 & 16257
obj2 & 246814 & 6.271 & 193465
paper1 & 53161 & 5.004 & 33251
paper2 & 82199 & 4.614 & 47412
pic & 513216 & 1.213 & 77825
progc & 39611 & 5.227 & 25879
progl & 71646 & 4.785 & 42850
progp & 49379 & 4.89 & 30183
trans & 93695 & 5.546 & 64950


In [97]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per two original bytes
# (ratio scaled by 16), archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/arithmetic_a_test_out_16/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 16, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 8.941 & 62175
book1 & 768771 & 8.181 & 393085
book2 & 610856 & 8.681 & 331410
geo & 102400 & 9.82 & 62850
news & 377109 & 9.583 & 225853
obj1 & 21504 & 13.389 & 17995
obj2 & 246814 & 9.676 & 149254
paper1 & 53161 & 9.422 & 31305
paper2 & 82199 & 8.542 & 43882
pic & 513216 & 2.165 & 69429
progc & 39611 & 9.92 & 24558
progl & 71646 & 8.437 & 37778
progp & 49379 & 8.841 & 27285
trans & 93695 & 9.49 & 55570


In [98]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per three original bytes
# (ratio scaled by 24), archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/arithmetic_a_test_out_24/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 24, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 14.136 & 65531
book1 & 768771 & 11.793 & 377746
book2 & 610856 & 12.666 & 322383
geo & 102400 & 23.571 & 100569
news & 377109 & 15.335 & 240952
obj1 & 21504 & 21.965 & 19681
obj2 & 246814 & 17.592 & 180911
paper1 & 53161 & 15.873 & 35159
paper2 & 82199 & 13.893 & 47584
pic & 513216 & 3.783 & 80889
progc & 39611 & 17.066 & 28167
progl & 71646 & 13.203 & 39415
progp & 49379 & 14.112 & 29034
trans & 93695 & 14.475 & 56508


In [99]:
# Encode the dataset with archiever_d_encoder for "-b" 8, 16 and 24, seven
# processes at a time; each setting gets its own output directory and the
# output files keep the original names (empty postfix).
for symbolBits in ("8", "16", "24"):
    do_tests("exe/archiever_d_encoder", "dataset", test_file_names,
             "out/arithmetic_d_test_out_" + symbolBits, "",
             7, ["-l", "off", "-b", symbolBits])

Process files:  obj1 obj2 geo trans progp progl book2
In file: dataset/obj1. 	 Out file: out/arithmetic_d_test_out_8/obj1
In file: dataset/obj2. 	 Out file: out/arithmetic_d_test_out_8/obj2
In file: dataset/geo. 	 Out file: out/arithmetic_d_test_out_8/geo
In file: dataset/trans. 	 Out file: out/arithmetic_d_test_out_8/trans
In file: dataset/progp. 	 Out file: out/arithmetic_d_test_out_8/progp
In file: dataset/progl. 	 Out file: out/arithmetic_d_test_out_8/progl
In file: dataset/book2. 	 Out file: out/arithmetic_d_test_out_8/book2
Process files:  paper1 news progc pic bib paper2 book1
In file: dataset/paper1. 	 Out file: out/arithmetic_d_test_out_8/paper1
In file: dataset/news. 	 Out file: out/arithmetic_d_test_out_8/news
In file: dataset/progc. 	 Out file: out/arithmetic_d_test_out_8/progc
In file: dataset/pic. 	 Out file: out/arithmetic_d_test_out_8/pic
In file: dataset/bib. 	 Out file: out/arithmetic_d_test_out_8/bib
In file: dataset/paper2. 	 Out file: out/arithmetic_d_test_out_8/pa

In [100]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per original byte, archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/arithmetic_d_test_out_8/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 8, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 5.211 & 72471
book1 & 768771 & 4.529 & 435196
book2 & 610856 & 4.795 & 366128
geo & 102400 & 5.664 & 72495
news & 377109 & 5.193 & 244814
obj1 & 21504 & 6.015 & 16169
obj2 & 246814 & 6.268 & 193389
paper1 & 53161 & 5.004 & 33250
paper2 & 82199 & 4.614 & 47410
pic & 513216 & 1.213 & 77786
progc & 39611 & 5.227 & 25882
progl & 71646 & 4.785 & 42852
progp & 49379 & 4.89 & 30184
trans & 93695 & 5.546 & 64956


In [101]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per two original bytes
# (ratio scaled by 16), archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/arithmetic_d_test_out_16/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 16, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 8.805 & 61227
book1 & 768771 & 8.157 & 391947
book2 & 610856 & 8.62 & 329104
geo & 102400 & 9.563 & 61205
news & 377109 & 9.438 & 222438
obj1 & 21504 & 10.955 & 14723
obj2 & 246814 & 9.229 & 142372
paper1 & 53161 & 9.105 & 30251
paper2 & 82199 & 8.384 & 43071
pic & 513216 & 2.098 & 67309
progc & 39611 & 9.445 & 23384
progl & 71646 & 8.276 & 37059
progp & 49379 & 8.53 & 26326
trans & 93695 & 9.245 & 54139


In [102]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per three original bytes
# (ratio scaled by 24), archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/arithmetic_d_test_out_24/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 24, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 12.856 & 59601
book1 & 768771 & 11.451 & 366787
book2 & 610856 & 12.065 & 307077
geo & 102400 & 18.295 & 78058
news & 377109 & 13.85 & 217620
obj1 & 21504 & 17.206 & 15417
obj2 & 246814 & 14.965 & 153900
paper1 & 53161 & 13.993 & 30996
paper2 & 82199 & 12.694 & 43475
pic & 513216 & 3.28 & 70148
progc & 39611 & 14.703 & 24267
progl & 71646 & 12.071 & 36035
progp & 49379 & 12.538 & 25796
trans & 93695 & 13.064 & 51003


In [105]:
# Encode the dataset with the "a" and then the "d" contextual encoder using the
# same flags (-l off -b 8 -c 6 -q 4), seven processes at a time; the output
# files keep the original names (empty postfix).
for variant in ("a", "d"):
    do_tests("exe/" + variant + "_contextual_encoder", "dataset", test_file_names,
             "out/contextual_" + variant + "_test_out_8-6-4", "",
             7, ["-l", "off", "-b", "8", "-c", "6", "-q", "4"])

Process files:  obj1 obj2 geo trans progp progl book2
In file: dataset/obj1. 	 Out file: out/contextual_a_test_out_8-6-4/obj1
In file: dataset/obj2. 	 Out file: out/contextual_a_test_out_8-6-4/obj2
In file: dataset/geo. 	 Out file: out/contextual_a_test_out_8-6-4/geo
In file: dataset/trans. 	 Out file: out/contextual_a_test_out_8-6-4/trans
In file: dataset/progp. 	 Out file: out/contextual_a_test_out_8-6-4/progp
In file: dataset/progl. 	 Out file: out/contextual_a_test_out_8-6-4/progl
In file: dataset/book2. 	 Out file: out/contextual_a_test_out_8-6-4/book2
Process files:  paper1 news progc pic bib paper2 book1
In file: dataset/paper1. 	 Out file: out/contextual_a_test_out_8-6-4/paper1
In file: dataset/news. 	 Out file: out/contextual_a_test_out_8-6-4/news
In file: dataset/progc. 	 Out file: out/contextual_a_test_out_8-6-4/progc
In file: dataset/pic. 	 Out file: out/contextual_a_test_out_8-6-4/pic
In file: dataset/bib. 	 Out file: out/contextual_a_test_out_8-6-4/bib
In file: dataset/pa

In [106]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per original byte, archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/contextual_a_test_out_8-6-4/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 8, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 2.635 & 36649
book1 & 768771 & 2.683 & 257789
book2 & 610856 & 2.525 & 192824
geo & 102400 & 6.462 & 82708
news & 377109 & 3.206 & 151133
obj1 & 21504 & 4.574 & 12295
obj2 & 246814 & 3.21 & 99045
paper1 & 53161 & 3.16 & 21001
paper2 & 82199 & 2.99 & 30727
pic & 513216 & 1.033 & 66269
progc & 39611 & 3.246 & 16073
progl & 71646 & 2.337 & 20932
progp & 49379 & 2.261 & 13954
trans & 93695 & 2.103 & 24626


In [107]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per original byte, archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/contextual_d_test_out_8-6-4/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 8, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 2.596 & 36107
book1 & 768771 & 2.671 & 256667
book2 & 610856 & 2.507 & 191445
geo & 102400 & 5.53 & 70785
news & 377109 & 3.135 & 147789
obj1 & 21504 & 4.273 & 11487
obj2 & 246814 & 3.107 & 95846
paper1 & 53161 & 3.109 & 20657
paper2 & 82199 & 2.956 & 30372
pic & 513216 & 1.0 & 64183
progc & 39611 & 3.167 & 15680
progl & 71646 & 2.316 & 20743
progp & 49379 & 2.23 & 13766
trans & 93695 & 2.082 & 24386


In [115]:
# Encode the dataset with d_contextual_encoder_improved (flags: -l off -b 8 -c 8 -q 4),
# seven processes at a time; output files keep the original names.
do_tests(
    executable="exe/d_contextual_encoder_improved",
    inDir="dataset",
    filenames=test_file_names,
    outDir="out/contextual_d_plus_test_out_8-8-4",
    outPostfix="",
    numThreads=7,
    additionalParams=["-l", "off", "-b", "8", "-c", "8", "-q", "4"],
)

Process files:  obj1 obj2 geo trans progp progl book2
In file: dataset/obj1. 	 Out file: out/contextual_d_plus_test_out_8-8-4/obj1
In file: dataset/obj2. 	 Out file: out/contextual_d_plus_test_out_8-8-4/obj2
In file: dataset/geo. 	 Out file: out/contextual_d_plus_test_out_8-8-4/geo
In file: dataset/trans. 	 Out file: out/contextual_d_plus_test_out_8-8-4/trans
In file: dataset/progp. 	 Out file: out/contextual_d_plus_test_out_8-8-4/progp
In file: dataset/progl. 	 Out file: out/contextual_d_plus_test_out_8-8-4/progl
In file: dataset/book2. 	 Out file: out/contextual_d_plus_test_out_8-8-4/book2
Process files:  paper1 news progc pic bib paper2 book1
In file: dataset/paper1. 	 Out file: out/contextual_d_plus_test_out_8-8-4/paper1
In file: dataset/news. 	 Out file: out/contextual_d_plus_test_out_8-8-4/news
In file: dataset/progc. 	 Out file: out/contextual_d_plus_test_out_8-8-4/progc
In file: dataset/pic. 	 Out file: out/contextual_d_plus_test_out_8-8-4/pic
In file: dataset/bib. 	 Out file: 

In [116]:
# Encode the dataset with a_contextual_encoder_improved (flags: -l off -b 8 -c 8 -q 4),
# seven processes at a time; output files keep the original names.
do_tests(
    executable="exe/a_contextual_encoder_improved",
    inDir="dataset",
    filenames=test_file_names,
    outDir="out/contextual_a_plus_test_out_8-8-4",
    outPostfix="",
    numThreads=7,
    additionalParams=["-l", "off", "-b", "8", "-c", "8", "-q", "4"],
)

Process files:  obj1 obj2 geo trans progp progl book2
In file: dataset/obj1. 	 Out file: out/contextual_a_plus_test_out_8-8-4/obj1
In file: dataset/obj2. 	 Out file: out/contextual_a_plus_test_out_8-8-4/obj2
In file: dataset/geo. 	 Out file: out/contextual_a_plus_test_out_8-8-4/geo
In file: dataset/trans. 	 Out file: out/contextual_a_plus_test_out_8-8-4/trans
In file: dataset/progp. 	 Out file: out/contextual_a_plus_test_out_8-8-4/progp
In file: dataset/progl. 	 Out file: out/contextual_a_plus_test_out_8-8-4/progl
In file: dataset/book2. 	 Out file: out/contextual_a_plus_test_out_8-8-4/book2
Process files:  paper1 news progc pic bib paper2 book1
In file: dataset/paper1. 	 Out file: out/contextual_a_plus_test_out_8-8-4/paper1
In file: dataset/news. 	 Out file: out/contextual_a_plus_test_out_8-8-4/news
In file: dataset/progc. 	 Out file: out/contextual_a_plus_test_out_8-8-4/progc
In file: dataset/pic. 	 Out file: out/contextual_a_plus_test_out_8-8-4/pic
In file: dataset/bib. 	 Out file: 

In [117]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per original byte, archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/contextual_d_plus_test_out_8-8-4/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 8, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 2.504 & 34818
book1 & 768771 & 2.633 & 253031
book2 & 610856 & 2.402 & 183444
geo & 102400 & 4.998 & 63972
news & 377109 & 3.082 & 145265
obj1 & 21504 & 4.3 & 11559
obj2 & 246814 & 3.031 & 93512
paper1 & 53161 & 3.069 & 20397
paper2 & 82199 & 2.917 & 29974
pic & 513216 & 0.944 & 60582
progc & 39611 & 3.107 & 15384
progl & 71646 & 2.332 & 20887
progp & 49379 & 2.233 & 13782
trans & 93695 & 2.158 & 25273


In [118]:
# One " & "-separated row per file (presumably for pasting into a LaTeX table):
# name, original size in bytes, archive bits per original byte, archive size in bytes.
for filename in sorted(test_file_names):
    origSize = stat("dataset/" + filename).st_size
    archSize = stat("out/contextual_a_plus_test_out_8-8-4/" + filename).st_size
    row = (filename, origSize, round(archSize / origSize * 8, 3), archSize)
    print("{} & {} & {} & {}".format(*row))

bib & 111261 & 2.553 & 35502
book1 & 768771 & 2.66 & 255617
book2 & 610856 & 2.431 & 185640
geo & 102400 & 6.011 & 76946
news & 377109 & 3.18 & 149917
obj1 & 21504 & 4.778 & 12842
obj2 & 246814 & 3.195 & 98579
paper1 & 53161 & 3.133 & 20822
paper2 & 82199 & 2.967 & 30489
pic & 513216 & 0.996 & 63903
progc & 39611 & 3.2 & 15842
progl & 71646 & 2.349 & 21033
progp & 49379 & 2.264 & 13975
trans & 93695 & 2.171 & 25423
