In [42]:
# Install the serialization libraries into the kernel's environment.
# NOTE(review): bson and msgpack-numpy are not imported by any cell shown
# here — confirm they are still needed.
%pip install msgpack-python bson avro msgpack-numpy

Collecting msgpack-numpy
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl (6.9 kB)
Collecting msgpack>=0.5.2
  Using cached msgpack-1.0.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (530 kB)
Installing collected packages: msgpack, msgpack-numpy
Successfully installed msgpack-1.0.7 msgpack-numpy-0.4.8
You should consider upgrading via the '/home/nano/projects/tree-ensemble-model-management/.venv/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [13]:
import json
import os

import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
import msgpack

In [16]:
import math

def mean_and_std(data):
    """Return the arithmetic mean and population standard deviation of ``data``.

    Args:
        data: Any iterable of numbers (list, tuple, generator, ...).

    Returns:
        A ``(mean, std_dev)`` tuple. ``std_dev`` is the population standard
        deviation, i.e. the squared deviations are divided by ``n``, not
        ``n - 1``.

    Raises:
        ValueError: If ``data`` contains no values.
    """
    # Materialize once so generators and other one-shot iterables can be
    # traversed twice (once for the mean, once for the variance).
    values = list(data)
    if not values:
        raise ValueError("mean_and_std() requires at least one data point")

    count = len(values)
    mean = sum(values) / count

    variance = sum((x - mean) ** 2 for x in values) / count
    std_dev = math.sqrt(variance)

    return mean, std_dev

In [17]:
# Root directory of the saved XGBoost models. The default is the path the
# notebook was originally run with; set the XGBOOST_MODELS_DIR environment
# variable to point the notebook at a different checkout or machine.
xgboost_path = os.environ.get(
    "XGBOOST_MODELS_DIR",
    "/home/nano/projects/tree-ensemble-model-management/models/xgboost",
)

# Model directories under "daily", one per "<N>trees" sub-directory.
daily_path = f"{xgboost_path}/daily"
daily_100 = f"{daily_path}/100trees"
daily_1000 = f"{daily_path}/1000trees"
daily_10000 = f"{daily_path}/10000trees"

# Model directories under "5days", laid out the same way.
fivedays_path = f"{xgboost_path}/5days"
fivedays_100 = f"{fivedays_path}/100trees"
fivedays_1000 = f"{fivedays_path}/1000trees"
fivedays_10000 = f"{fivedays_path}/10000trees"

In [18]:
import xgboost as xgb
from time import time
import json
import timeit

import xgboost as xgb
import msgpack
import tempfile

from avro.datafile import DataFileReader
from avro.io import DatumReader
import tempfile

import xgboost as xgb
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

In [19]:
def load_model_json(model_path):
    """Create a fresh ``xgb.Booster`` and populate it from ``model_path``.

    Args:
        model_path: Path of the saved model file to read.

    Returns:
        The loaded ``xgb.Booster``.
    """
    loaded = xgb.Booster()
    loaded.load_model(model_path)
    return loaded

def save_model_json(booster, model_path):
    """Write ``booster`` to ``model_path`` via ``booster.save_model``.

    Args:
        booster: The model object to persist; must provide ``save_model``.
        model_path: Destination path handed straight to ``save_model``.
    """
    booster.save_model(model_path)

In [57]:
def save_model_avro(booster, model_path_avro, path_avro_schema="xgboost.avsc"):
    """Write an XGBoost booster to an Avro container file.

    The booster is exported to JSON through ``booster.save_model``, the JSON
    document is parsed, and the resulting object is appended as a single
    record to an Avro data file.

    Args:
        booster: The model to persist; must provide ``save_model``.
        model_path_avro: Destination path of the Avro file (overwritten).
        path_avro_schema: Path of the Avro schema (``.avsc``) describing the
            model document.
    """
    # save_model writes to a path, so go through a scratch file. A private
    # temporary directory replaces the deprecated, race-prone
    # tempfile.mktemp() and is removed again when the block exits.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_model_path = os.path.join(scratch_dir, "model.json")
        booster.save_model(temp_model_path)

        with open(temp_model_path, "r") as f:
            model_json = json.load(f)

    # Read the schema inside a context manager so the handle is closed.
    with open(path_avro_schema, "rb") as schema_file:
        avro_schema = avro.schema.parse(schema_file.read())

    # Write the model document as the only record of the Avro file.
    with open(model_path_avro, "wb") as avro_file:
        with DataFileWriter(avro_file, DatumWriter(), avro_schema) as writer:
            writer.append(model_json)


def load_model_avro(model_path_avro):
    """Load an XGBoost booster from an Avro file written by ``save_model_avro``.

    Args:
        model_path_avro: Path of the Avro container file to read.

    Returns:
        An ``xgb.Booster`` built from the last record stored in the file.

    Raises:
        ValueError: If the Avro file contains no records.
    """
    model = None
    with open(model_path_avro, "rb") as avro_file:
        with DataFileReader(avro_file, DatumReader()) as reader:
            # If the file holds several records, the last one wins.
            for record in reader:
                model = record

    if model is None:
        raise ValueError(f"Avro file {model_path_avro!r} contains no model record")

    # Booster.load_model accepts an in-memory bytearray, so the JSON document
    # is handed over directly instead of being written to a temporary file
    # (the previous tempfile.mktemp() file was never removed).
    booster = xgb.Booster()
    booster.load_model(bytearray(json.dumps(model), "utf-8"))
    return booster

In [53]:
def save_model_msgpack(booster, model_path_msgpack):
    """Store ``booster`` at ``model_path_msgpack`` as one MessagePack value.

    The payload is whatever ``booster.save_raw()`` returns, wrapped by
    ``msgpack.packb``.

    Args:
        booster: The model to persist; must provide ``save_raw``.
        model_path_msgpack: Destination file path (opened in ``"wb"`` mode).
    """
    # Build the full payload first so the destination is only opened once
    # serialization has succeeded.
    payload = msgpack.packb(booster.save_raw())

    with open(model_path_msgpack, "wb") as sink:
        sink.write(payload)

def load_model_msgpack(model_path_msgpack):
    """Load an XGBoost booster from a file written by ``save_model_msgpack``.

    Args:
        model_path_msgpack: Path of the MessagePack file to read.

    Returns:
        An ``xgb.Booster`` restored from the raw model bytes in the file.
    """
    # Read and unpack the MessagePack data
    with open(model_path_msgpack, "rb") as f:
        packed_data = f.read()
    model_bytes = msgpack.unpackb(packed_data)

    # Booster.load_model accepts an in-memory bytearray, so the raw model is
    # loaded directly. This removes the tempfile.mktemp() scratch file, which
    # was both race-prone and never deleted.
    booster = xgb.Booster()
    booster.load_model(bytearray(model_bytes))
    return booster

In [58]:
def _timed_call(func, *args):
    """Run ``func(*args)`` and return ``(result, elapsed_seconds)``."""
    # Local import: the notebook only imports ``time.time``; perf_counter is
    # the monotonic, high-resolution clock meant for measuring intervals.
    from time import perf_counter

    start = perf_counter()
    result = func(*args)
    return result, perf_counter() - start


def get_metrics(model_dir):
    """Benchmark JSON, Avro and MessagePack serialization of saved models.

    Every regular file in ``model_dir`` is loaded as an XGBoost model and then
    saved and re-loaded in each format. Load time, save time (both in seconds)
    and file size (in bytes) are collected per format and printed as three
    lines, in that order, each showing ``mean +- std`` for JSON, AVRO and
    MsgPack.

    All files produced by the benchmark are written to a private temporary
    directory, so the models in ``model_dir`` are only read, never rewritten.

    Args:
        model_dir: Directory containing the saved model files.

    Raises:
        ValueError: If ``model_dir`` contains no regular files.
    """
    # Skip sub-directories and sort for a reproducible processing order.
    model_paths = sorted(
        path
        for path in (os.path.join(model_dir, name) for name in os.listdir(model_dir))
        if os.path.isfile(path)
    )
    if not model_paths:
        raise ValueError(f"no model files found in {model_dir!r}")

    formats = ("json", "avro", "msgpack")
    load_times = {fmt: [] for fmt in formats}
    save_times = {fmt: [] for fmt in formats}
    sizes = {fmt: [] for fmt in formats}

    # (format, save function, load function, scratch file name)
    round_trip_codecs = (
        ("avro", save_model_avro, load_model_avro, "test.avro"),
        ("msgpack", save_model_msgpack, load_model_msgpack, "test.msgpack"),
    )

    # The scratch directory is removed even if a step raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for model_path in model_paths:
            # JSON: load the stored model, then time a save to a scratch copy
            # rather than overwriting the input file.
            booster, elapsed = _timed_call(load_model_json, model_path)
            load_times["json"].append(elapsed)

            json_path = os.path.join(scratch_dir, "test.json")
            _, elapsed = _timed_call(save_model_json, booster, json_path)
            save_times["json"].append(elapsed)
            sizes["json"].append(os.path.getsize(json_path))
            os.remove(json_path)

            # Avro and MessagePack: save, load back, then measure the file.
            for fmt, save_model, load_model, file_name in round_trip_codecs:
                out_path = os.path.join(scratch_dir, file_name)

                _, elapsed = _timed_call(save_model, booster, out_path)
                save_times[fmt].append(elapsed)

                _, elapsed = _timed_call(load_model, out_path)
                load_times[fmt].append(elapsed)

                sizes[fmt].append(os.path.getsize(out_path))
                os.remove(out_path)

    # Evaluation: one line each for load time, save time and size.
    for samples in (load_times, save_times, sizes):
        json_mean, json_std = mean_and_std(samples["json"])
        avro_mean, avro_std = mean_and_std(samples["avro"])
        msgpack_mean, msgpack_std = mean_and_std(samples["msgpack"])
        print(
            f"JSON: {json_mean} +- {json_std}; AVRO: {avro_mean} +- {avro_std}: MsgPack: {msgpack_mean} +- {msgpack_std}"
        )

In [59]:
# Benchmark the models under daily_100. get_metrics prints three unlabeled
# lines, in this order: load time, save time, file size — each as
# "mean +- std" for JSON, AVRO and MsgPack.
get_metrics(daily_100)




JSON: 0.006000472653296686 +- 0.0031863884951864475; AVRO: 0.05663604890146563 +- 0.006889361100828236: MsgPack: 0.0009072365299347908 +- 0.00011541351150931733
JSON: 0.00447799313452936 +- 0.003593799125430099; AVRO: 0.06876660931494928 +- 0.02166959174812385: MsgPack: 0.00060287598640688 +- 0.0004617794673667119
JSON: 106012.87096774194 +- 3172.877606516098; AVRO: 58621.67741935484 +- 2272.6995655599894: MsgPack: 64061.967741935485 +- 2147.8749210274727




In [34]:
# Benchmark the models under daily_1000.
# NOTE(review): the recorded output has two lines instead of three, and this
# cell's execution count (34) predates the get_metrics definition cell (58) —
# presumably it came from an earlier version of the function; re-run to refresh.
get_metrics(daily_1000)

JSON: 0.024463799691969348 +- 0.004038995372208485; AVRO: 0.676363468170166 +- 0.09830436931241641: MsgPack: 0.17586958792901808 +- 0.0396373045414616
JSON: 1027167.7419354839 +- 65128.09421636705; AVRO: 543157.5161290322 +- 49329.47228052968: MsgPack: 848130.3548387097 +- 54502.096297978205


In [35]:
# Benchmark the models under daily_10000.
# NOTE: the recorded run ended in KeyboardInterrupt, so no results are
# recorded for this directory.
get_metrics(daily_10000)

KeyboardInterrupt: 