In [None]:
import os
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# --- Benchmark layout ---------------------------------------------------------
# Names of the benchmarked datasets, and how many numbered result files
# (compression_benchmark_<i>.csv, i = 1..benchmark_count) are read per dataset.
dataset_names = ['embryo', 'tissue_on', 'tissue_off']
benchmark_count = 2

# --- Compressors --------------------------------------------------------------
# Compressor name -> levels; the two are combined elsewhere as '<name>-<level>'.
compression_options = dict([
    ('gzip', (1, 5, 9)),
    ('lzma', (1, 5, 9)),
    ('bzip2', (1, 5, 9)),
    ('zlib', (1, 5, 9)),
    ('lz4', (1,)),
    ('zstd', (1, 11, 22)),
    ('blosc-zlib', (1, 5, 9)),
    ('blosc-lz4', (1, 5, 9)),
    ('blosc-zstd', (1, 5, 9)),
])

# --- Filters ------------------------------------------------------------------
# Alternative filter set (currently disabled):
#   ['none', 'Shuffle', 'FixedScaleOffset', 'FixedScaleOffset-Shuffle']
filter_options = ['none', 'Shuffle', 'BitRound-14', 'BitRound-14-Shuffle']


def _shades(*rgb255):
    """Turn 0-255 RGB triples into a fresh list of 0-1 float RGB tuples."""
    return [tuple(channel / 255 for channel in rgb) for rgb in rgb255]


# Filter name -> three shades (original note: "color used: Paired_10").
# The shade is picked by the position of the level within a compressor's tuple.
filter_display_colors = dict([
    ('none', _shades((31, 120, 180), (99, 163, 204), (166, 206, 227))),  # blue
    ('Shuffle', _shades((51, 160, 44), (115, 192, 91), (178, 223, 138))),  # green
    ('BitRound-14', _shades((227, 26, 28), (239, 90, 91), (251, 154, 153))),  # red
    ('BitRound-14-Shuffle', _shades((255, 127, 0), (254, 159, 56), (253, 191, 111))),  # orange
    ('FixedScaleOffset', _shades((227, 26, 28), (239, 90, 91), (251, 154, 153))),  # red
    ('FixedScaleOffset-Shuffle', _shades((255, 127, 0), (254, 159, 56), (253, 191, 111))),  # orange
])

1. Load tables

For each dataset name, collect the CSV result files of all `benchmark_count` benchmark runs.
Then read those files into tables and merge them into a single averaged table per dataset name.

In [None]:
"""
Load all the table data and return a single table.
"""
def load_chunked_table(table_paths: list[str]) -> pd.DataFrame:
    """Read several CSV files and stack them into a single table.

    Parameters
    ----------
    table_paths : list[str]
        Paths of the CSV files to read, in the order they should be stacked.

    Returns
    -------
    pd.DataFrame
        The row-wise concatenation of all files. Each file keeps its own row
        labels, so the resulting index may contain duplicates.

    Raises
    ------
    ValueError
        If ``table_paths`` is empty (e.g. a glob pattern matched no file).
    """
    paths = list(table_paths)
    if not paths:
        # pd.concat([]) would raise the same exception type, but with a message
        # that gives no hint about the missing input files.
        raise ValueError(
            "load_chunked_table: no table paths given "
            "(did the glob pattern match any file?)"
        )
    return pd.concat([pd.read_csv(path) for path in paths])

In [None]:
# Build one averaged benchmark table per dataset.
group_keys = ['compression option', 'filter option']
table_dataset = {}
for name in dataset_names:
    # One table per benchmark run; each is tagged with its run number in a
    # temporary 'index' column.
    run_tables = [
        load_chunked_table(
            glob(f'./output/bench_{name}_8G*/compression_benchmark_{run_idx}.csv')
        ).assign(index=run_idx)
        for run_idx in range(1, benchmark_count + 1)
    ]
    # Average every column over all rows that share the same
    # (compression option, filter option), then drop the run tag again.
    table_dataset[name] = (
        pd.concat(run_tables)
        .groupby(group_keys)
        .mean()
        .drop(columns='index')
    )

## Estimate speed

In [None]:
from scipy.linalg import solve

In [None]:
# NOTE(review): scratch cell. In the visible notebook `a` is first assigned in a
# later cell (the least-squares setup), so this raises NameError on a fresh
# kernel (Restart & Run All).
a.shape

In [None]:
# NOTE(review): scratch cell. `comp_key` is a loop variable of a later cell, so
# this raises NameError on a fresh kernel (Restart & Run All).
comp_key

In [None]:
# NOTE(review): scratch cell. `arg_size` is first assigned in a later cell, so
# this raises NameError on a fresh kernel (Restart & Run All).
arg_size

In [None]:
# Scratch check of matrix-product shapes: (2, 3) @ (3, 4) gives a (2, 4) array.
# It does not touch the benchmark data.
np.zeros((2,3)) @ np.zeros((3,4))

In [None]:
# NOTE(review): scratch cell. `comp_idx` is first assigned in a later cell, so
# this raises NameError on a fresh kernel (Restart & Run All).
comp_idx

In [None]:
# NOTE(review): duplicate scratch cell. `arg_size` is first assigned in a later
# cell, so this raises NameError on a fresh kernel (Restart & Run All).
arg_size

In [None]:
# Least-squares model of the decompression time per MiB. One equation is set up
# for every (dataset, compressor-level, filter) combination:
#
#   sec/MiB = x[0] * [filter name ends with 'Shuffle']
#           + x[1] * [filter name starts with 'BitRound']
#           + x[2 + k]                    (k: running (dataset, compressor-level) index)
#           + x[-1] / compression ratio   (memory write time, sec/MiB)
#
# The system is solved through its normal equations (a.T @ a) x = a.T @ b.
comp_size = sum(len(levels) for levels in compression_options.values())
filt_size = 2 # number of filter indicator columns (Shuffle, BitRound)
sp_size = len(table_dataset)

arg_size = filt_size + comp_size * sp_size + 1 # one for the memory write time (sec/MiB)
# One row per iteration of the innermost loop below. Using len(filter_options)
# keeps the row count right even when the filter list is not exactly the
# 2 x 2 on/off combinations.
comb_size = len(filter_options) * comp_size * sp_size
a = np.zeros((comb_size,arg_size))
b = np.zeros((comb_size,))

comp_dict = dict()  # NOTE(review): never filled or read in the visible cells
comb_idx = 0  # row of the current equation
comp_idx = 0  # not reset per dataset: every (dataset, compressor-level) gets its own column
for name, df in table_dataset.items():
    for comp_key, comp_degree in compression_options.items():
        for d in comp_degree:
            comp_name = f'{comp_key}-{d}'
            for filt_name in filter_options:
                a[comb_idx, 0] = filt_name.endswith('Shuffle')
                a[comb_idx, 1] = filt_name.startswith('BitRound')
                a[comb_idx, comp_idx + 2] = 1
                a[comb_idx, -1] = 1/df.loc[(comp_name, filt_name), 'compression ratio']
                b[comb_idx] = 2 ** 20/df.loc[(comp_name, filt_name), 'decompression speed (bytes/sec)'] # sec/MiB
                comb_idx += 1
            comp_idx += 1
aa = a.T @ a
ab = a.T @ b
x = solve(aa, ab)

        

In [None]:
# Fitted coefficients (sec/MiB): x[0] Shuffle term, x[1] BitRound term,
# x[2:-1] one term per (dataset, compressor-level), x[-1] factor on 1/compression ratio.
x

In [None]:
# Right-hand side of the normal equations (a.T @ b).
ab

In [None]:
# Left-hand side of the normal equations evaluated at the solution; compare with `ab`.
aa @ x

In [None]:
# Residual of the normal equations; entries should be close to zero if `solve` found an accurate solution.
aa @ x - ab

## Ranking

In [None]:
def rank_data(table_dataset : dict[str, pd.DataFrame], compression_options: dict[str, tuple[int, ...]], filter_options: list[str], top_n: int = 3):
    """Print the highest-valued (compression option, filter option) rows per column.

    For every dataset table, the rows are restricted to the requested
    compressor levels and filters. Then, for each column, the index entries of
    the ``top_n`` largest values are printed. Sorting is always descending, so
    "best" means "largest" for every column.

    Parameters
    ----------
    table_dataset : dict[str, pd.DataFrame]
        Dataset name -> table whose index entries are
        (compression option, filter option) pairs.
    compression_options : dict[str, tuple[int, ...]]
        Compressor name -> levels; combined into '<name>-<level>' labels.
    filter_options : list[str]
        Filter names to keep.
    top_n : int, default 3
        Number of leading index entries printed per column.

    Returns
    -------
    None
        Results are printed, one line per (dataset, column).
    """
    # Sets make the per-row membership tests O(1).
    wanted_comp = {
        f'{comp}-{d}'
        for comp, comp_degree in compression_options.items()
        for d in comp_degree
    }
    wanted_filt = set(filter_options)
    for name, df in table_dataset.items():
        indices = [idx for idx in df.index if idx[0] in wanted_comp and idx[1] in wanted_filt]
        df_target = df.loc[indices]
        for col_name in df_target.columns:
            ranked = df_target[col_name].sort_values(ascending = False)
            print(f"{name}, {col_name}: {ranked.index[:top_n]}")

In [None]:
# Rank the compressor levels using only the rows without a filter ('none').
rank_data(table_dataset, compression_options, ['none'])

## Visualization

In [None]:
def plot_data(table_dataset, compression_options, filter_options, filter_display_colors, col_name, save_dir = None, show_none_value = False,**kwargs):
    """Draw one bar chart per dataset for a single benchmark column.

    For every compressor level ('<name>-<level>') the values of all requested
    filters are drawn as bars at the same x position, largest first, so the
    bars overlap instead of sitting side by side. Consecutive compressors are
    separated by one empty slot on the x axis.

    Parameters
    ----------
    table_dataset : dict[str, pd.DataFrame]
        Dataset name -> table indexed by (compression option, filter option).
    compression_options : dict[str, tuple[int, ...]]
        Compressor name -> levels to plot.
    filter_options : list[str]
        Filter names plotted for every compressor level.
    filter_display_colors : dict[str, list]
        Filter name -> colors. The color at position i is used for the i-th
        level of a compressor, so every list needs at least as many entries as
        the longest level tuple.
    col_name : str
        Column to plot; also used for the title and the output file name.
    save_dir : str, optional
        If given, each figure is saved there as
        '<col_name up to the first "(">-<dataset name>.svg'. The directory is
        created when it does not exist yet.
    show_none_value : bool, default False
        If True, draw a dashed horizontal line at the value of the
        ('none', 'none') row.
    **kwargs
        Forwarded to ``ax.set`` (e.g. ``ylim``, ``yscale``).
    """
    file_stem = None
    if save_dir is not None:
        # savefig does not create missing directories.
        os.makedirs(save_dir, exist_ok=True)
        # Computed outside the f-string: reusing the same quote character inside
        # an f-string expression is a SyntaxError before Python 3.12.
        # The text before '(' is used unchanged (including a trailing space, if
        # any) so previously generated file names stay the same.
        file_stem = col_name.split('(')[0]
    for name, df in table_dataset.items():
        fig, ax = plt.subplots()
        ax.set_title(f"{col_name} ({name})")
        ax.set(**kwargs)
        # All filters of one compressor level share a single x position.
        x = np.zeros(len(filter_options))
        tick_loc = []
        tick_label = []
        for comp_name, comp_degree in compression_options.items():
            for d_idx, deg in enumerate(comp_degree):
                comp_opt = f'{comp_name}-{deg}'
                # Largest value first, so smaller bars are drawn afterwards.
                y = (df
                     .loc[comp_opt]
                     .loc[filter_options, col_name]
                     .sort_values(ascending=False)
                )
                c = [filter_display_colors[filt_name][d_idx] for filt_name in y.index]
                ax.bar(x, y, color=c)
                tick_loc.append(x[0])
                tick_label.append(comp_opt)
                x += 1
            # Leave one empty slot between compressors.
            x += 1
        ax.set_xticks(tick_loc, tick_label, rotation=90)
        # Guard against an empty compression_options mapping.
        x_right = tick_loc[-1] + 1 if tick_loc else 0
        ax.set_xlim(-1, x_right)
        if show_none_value:
            y_val = df.loc[('none', 'none'), col_name]
            ax.hlines(y_val, -1, x_right, colors='black', linestyles='dashed')
        if save_dir is not None:
            fig.savefig(os.path.join(save_dir, f'{file_stem}-{name}.svg'))


In [None]:
# Compression ratio per compressor level and filter; one SVG per dataset is written to ./figure.
plot_data(table_dataset, compression_options, filter_options, filter_display_colors, 'compression ratio', save_dir='./figure', ylim = (1,6))

In [None]:
# Compression speed on a log y axis; the dashed line marks the ('none', 'none') row.
plot_data(table_dataset, compression_options, filter_options, filter_display_colors, 'compression speed (bytes/sec)', save_dir='./figure', show_none_value=True, yscale = 'log', ylim = (10**5,10**8.5))

In [None]:
# Decompression speed on a log y axis; the dashed line marks the ('none', 'none') row.
plot_data(table_dataset, compression_options, filter_options, filter_display_colors, 'decompression speed (bytes/sec)', save_dir='./figure', show_none_value=True, yscale = 'log', ylim = (10**7,10**9.5))