In [1]:
import os, sys
import time
import json

from pprint import pprint
from collections import defaultdict

import matplotlib.pyplot as plt

import bz2
import gzip
import zlib

import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

In [2]:
avro_schema = avro.schema.parse(
    open(
        "/home/nano/projects/tree-ensemble-model-management/avro/xgboost.avsc", "rb"
    ).read()
)

In [3]:
xgboost_daily = (
    "/home/nano/projects/tree-ensemble-model-management/models/xgboost/daily"
)

xgboost_5days = (
    "/home/nano/projects/tree-ensemble-model-management/models/xgboost/5days"
)

In [7]:
def encoding_vs_compression(model_folder, compression_func):
    compression_rates = {"avro": [], "compressed": [], "avro+compressed": []}

    for model_file in os.listdir(model_folder):
        model_path = os.path.join(model_folder, model_file)
        with open(model_path, "r") as f:
            model_json = json.load(f)

        with open(model_path, "rb") as f:
            model_binary = f.read()

        original_size = os.path.getsize(model_path)

        # AVRO
        writer = DataFileWriter(open("model.avro", "wb"), DatumWriter(), avro_schema)
        writer.append(model_json)
        writer.close()
        avro_size = os.path.getsize("model.avro")

        # Compression
        compressed_model = compression_func(model_binary)
        with open("model.bin", "wb") as f:
            f.write(compressed_model)
        compressed_size = os.path.getsize("model.bin")

        # Compression + AVRO
        with open("model.bin", "rb") as f:
            avro_binary = f.read()
        compressed_avro = compression_func(avro_binary)
        with open("model_avro_compressed.bin", "wb") as f:
            f.write(compressed_avro)
        compressed_avro_size = os.path.getsize("model_avro_compressed.bin")

        os.remove("model.avro")
        os.remove("model_avro_compressed.bin")
        os.remove("model.bin")

        avro_ratio = original_size / avro_size
        compressed_ratio = original_size / compressed_size
        compressed_avro_ratio = original_size / compressed_avro_size

        compression_rates["avro"].append(avro_ratio)
        compression_rates["compressed"].append(compressed_ratio)
        compression_rates["avro+compressed"].append(compressed_avro_ratio)

    for key, rates in compression_rates.items():
        compression_rates[key] = sum(rates) / len(rates)

    return compression_rates

In [8]:
compression_rates = encoding_vs_compression(xgboost_daily, bz2.compress)
print(compression_rates)

{'avro': 1.8961918401166984, 'compressed': 6.888617001336445, 'avro+compressed': 6.857327289494049}


In [9]:
compression_rates = encoding_vs_compression(xgboost_5days, bz2.compress)
print(compression_rates)

{'avro': 1.9107413001045686, 'compressed': 6.6189130073585565, 'avro+compressed': 6.59103601638846}


In [10]:
compression_rates = encoding_vs_compression(xgboost_daily, gzip.compress)
print(compression_rates)

{'avro': 1.8961918401166984, 'compressed': 5.669686765668213, 'avro+compressed': 5.693787775923223}


In [11]:
compression_rates = encoding_vs_compression(xgboost_5days, gzip.compress)
print(compression_rates)

{'avro': 1.9107413001045686, 'compressed': 5.50347247214127, 'avro+compressed': 5.525603669509486}


In [12]:
compression_rates = encoding_vs_compression(xgboost_daily, zlib.compress)
print(compression_rates)

{'avro': 1.8961918401166984, 'compressed': 5.520476168665319, 'avro+compressed': 5.543701008647218}


In [13]:
compression_rates = encoding_vs_compression(xgboost_5days, zlib.compress)
print(compression_rates)

{'avro': 1.9107413001045686, 'compressed': 5.37702255641472, 'avro+compressed': 5.398432169563723}
