In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colormaps as cm

In [None]:
# Merged benchmark CSV for every dataset, keyed by dataset name.
benchmark_path = {
    dataset: f'../output/merged-{dataset}-8G.csv'
    for dataset in ('bacteria', 'HeLa', 'embryo', 'organoid', 'tissue-on', 'tissue-off')
}
# Compressor name -> tuple of levels that were benchmarked; the plotting code
# combines both into "<name>-<level>" labels. Insertion order is the plotting order.
compression_options = dict([
    ('gzip', (1, 5, 9)),
    ('zlib', (1, 5, 9)),
    ('bzip2', (1, 5, 9)),
    ('lzma', (1, 5, 9)),
    ('lz4', (1,)),
    ('Snappy', (0,)),
    ('zstd', (1, 11, 22)),
    ('PCodec', (0, 6, 12)),
    ('blosc-zlib', (1, 5, 9)),
    ('blosc-lz4', (1, 5, 9)),
    ('blosc-zstd', (1, 5, 9)),
    ('LOSSLESS_ZFP', (0,)),
    ('LOSSY_ZFP', (0,)),
])
# Filter names as they are looked up in the "filter option" column.
filter_options = [
    'none',
    'Shuffle',
    'BitRound-14',
    'BitRound-14-Shuffle',
]


###  Load data

In [None]:
# Read every merged benchmark CSV once; keys are the dataset names of benchmark_path.
benchmark_table = {
    dataset: pd.read_csv(csv_path)
    for dataset, csv_path in benchmark_path.items()
}

### Visualization 1: overall

Show benchmark results for each dataset and benchmark parameter.

In [None]:
# Three shades (dark -> light) per filter, written as 0-255 RGB triples and
# scaled to the 0-1 floats matplotlib expects. The shade index is chosen by the
# plotting functions from the position of the compression level.
filter_display_colors = {
    filt: [tuple(channel / 255 for channel in rgb) for rgb in shades]
    for filt, shades in {
        'none': [(31, 120, 180), (99, 163, 204), (166, 206, 227)],  # blue
        'Shuffle': [(51, 160, 44), (115, 192, 91), (178, 223, 138)],  # green
        'BitRound-14': [(227, 26, 28), (239, 90, 91), (251, 154, 153)],  # red
        'BitRound-14-Shuffle': [(255, 127, 0), (254, 159, 56), (253, 191, 111)],  # orange
    }.items()
}

In [None]:
# Axis settings forwarded to ax.set(), looked up as kwargs_list[dataset][benchmark column].
# Each dataset gets its own freshly built dicts so the overrides below stay local to it.
kwargs_list = {
    dataset: {
        'compression ratio': dict(ylim=(1,8), yscale='linear'),
        'compression speed (bytes/sec)': dict(ylim=(10**7.5,10**10), yscale='log'),
        'decompression speed (bytes/sec)': dict(ylim=(10**7,10**9.5), yscale='log'),
    }
    for dataset in ('bacteria', 'HeLa', 'embryo', 'organoid', 'tissue-on', 'tissue-off')
}

# Per-dataset overrides of the compression-ratio axis range.
for dataset, ratio_ylim in (('organoid', (1, 10)), ('tissue-off', (1, 2))):
    kwargs_list[dataset]['compression ratio']['ylim'] = ratio_ylim


In [None]:
def view_compression_ratio(save_path, df, compression_options, filter_options, filter_display_colors, **kwargs):
    """Plot one stack of overlaid bars per "<compressor>-<level>" and save the figure.

    Parameters
    ----------
    save_path : path handed to ``fig.savefig``.
    df : Series indexed first by "<compressor>-<level>" and then by filter name.
    compression_options : mapping of compressor name -> iterable of levels.
    filter_options : filter names to draw; names missing for a level are skipped.
    filter_display_colors : mapping of filter name -> sequence of shades, indexed
        by the position of the level inside its ``compression_options`` entry.
    **kwargs : forwarded to ``ax.set``.
    """
    fig, ax = plt.subplots()
    ax.set(**kwargs)
    # Every entry holds the current x position; a slice of it supplies one x per bar.
    slot = np.zeros(len(filter_options))
    tick_loc, tick_label = [], []
    for comp_name, comp_degree in compression_options.items():
        for shade, level in enumerate(comp_degree):
            label = f'{comp_name}-{level}'
            per_filter = df.loc[label]
            present = per_filter.index.tolist()
            wanted = [filt for filt in filter_options if filt in present]
            # Tallest first, so the shorter bars drawn afterwards stay visible on top.
            heights = per_filter.loc[wanted].sort_values(ascending=False)
            colors = [filter_display_colors[filt][shade] for filt in heights.index]
            ax.bar(slot[:len(heights)], heights, color=colors)
            tick_loc.append(slot[0])
            tick_label.append(label)
            slot += 1
        # Leave one empty position between compressors.
        slot += 1
    ax.set_xticks(tick_loc, tick_label, rotation=90)
    ax.set_xlim(-1, tick_loc[-1] + 1)
    fig.savefig(save_path)


In [None]:
def view_throughput(save_path, df, compression_options, filter_options, filter_display_colors, **kwargs):
    """Plot one stack of overlaid bars per compressor (one bar per level) and save it.

    For every level only the first entry of ``filter_options`` that exists for that
    level is drawn, so the order of ``filter_options`` expresses the preference.

    All levels of a compressor share one x position. The bars are drawn from the
    tallest to the shortest so that every level stays visible; drawing them in
    level order would let a faster high level cover a slower low level.

    Parameters
    ----------
    save_path : path handed to ``fig.savefig``.
    df : Series indexed first by "<compressor>-<level>" and then by filter name.
    compression_options : mapping of compressor name -> iterable of levels.
    filter_options : filter names in order of preference.
    filter_display_colors : mapping of filter name -> sequence of shades, indexed
        by the position of the level inside its ``compression_options`` entry.
    **kwargs : forwarded to ``ax.set``.
    """
    fig, ax = plt.subplots()
    ax.set(**kwargs)
    x = 0
    tick_loc = []
    tick_label = []
    for comp_name, comp_degree in compression_options.items():
        heights = []
        colors = []
        for d_idx, deg in enumerate(comp_degree):
            sub_df = df.loc[f'{comp_name}-{deg}']
            avail_f_opt = [f for f in filter_options if f in sub_df.index]
            if not avail_f_opt:
                # None of the requested filters was benchmarked for this level.
                continue
            filt_name = avail_f_opt[0]
            heights.append(sub_df.loc[filt_name])
            colors.append(filter_display_colors[filt_name][d_idx])
        # Tallest bar first; the sort is stable, so ties keep the level order.
        order = sorted(range(len(heights)), key=lambda i: heights[i], reverse=True)
        if order:
            ax.bar(
                [x] * len(order),
                [heights[i] for i in order],
                color=[colors[i] for i in order],
            )
        tick_loc.append(x)
        tick_label.append(comp_name)
        x += 1
    ax.set_xticks(tick_loc, tick_label, rotation=90)
    if tick_loc:
        ax.set_xlim(-1, tick_loc[-1] + 1)
    fig.savefig(save_path)


In [None]:
# Visualization 1 driver: per dataset, one compression-ratio figure and one figure
# for each of the two throughput columns.
for name, df in benchmark_table.items():
    # Mean of every benchmark column per (compression option, filter option) pair.
    grp_df = (df
        .groupby(['compression option', 'filter option'])
        .mean()
    )
    # plot compression ratio
    bench_param = 'compression ratio'
    bench_filter_options = ['none', 'Shuffle', 'BitRound-14', 'BitRound-14-Shuffle']
    kwargs = kwargs_list[name][bench_param]
    srs = grp_df.loc[:,bench_param]
    save_path = f'../figure/view-bench-1-overall/{name}-{bench_param.split("(")[0].strip()}.svg'
    view_compression_ratio(save_path, srs, compression_options, bench_filter_options, filter_display_colors, **kwargs)

    # plot throughput
    for bench_param in ['compression speed (bytes/sec)', 'decompression speed (bytes/sec)']:
        # Order is a preference: view_throughput draws the first filter that exists.
        bench_filter_options = ['Shuffle', 'none']
        kwargs = kwargs_list[name][bench_param]
        srs = grp_df.loc[:,bench_param]
        # The file name drops the "(bytes/sec)" unit suffix. Double quotes inside the
        # f-string keep this valid on Python versions older than 3.12.
        save_path = f'../figure/view-bench-1-overall/{name}-{bench_param.split("(")[0].strip()}.svg'
        view_throughput(save_path, srs, compression_options, bench_filter_options, filter_display_colors, **kwargs)

### Table 1: relative performance to Gzip

In [None]:
# Table 1 driver: per dataset, average every benchmark column per
# (compressor type, filter) and express it relative to gzip with the same filter.
for name, df in benchmark_table.items():
    perf_df = df.copy()
    # "gzip-9" -> "gzip": strip the trailing "-<level>" so all levels are averaged together.
    perf_df['compressor type'] = df['compression option'].map(lambda x: x.rsplit('-',1)[0])
    perf_df.pop('compression option')
    grp_df = (perf_df
        .groupby(['compressor type', 'filter option'])
        .mean()
    )
    for filt in filter_options:
        # Copy the reference row so it cannot change while rows are rewritten.
        ref_bench = grp_df.loc[('gzip',filt)].copy()
        # Not every compressor was benchmarked with every filter; dividing only the
        # rows that exist avoids a KeyError for the missing combinations.
        rows = [(comp, filt) for comp in compression_options if (comp, filt) in grp_df.index]
        grp_df.loc[rows] /= ref_bench
    # Rows whose compressor type is not a key of compression_options are written unscaled.
    save_path = f'../figure/table-1-relative-bench/{name}.csv'
    grp_df.to_csv(save_path)

### Visualization 2: two-axis plot

Show benchmark results for each dataset, plotting two benchmark parameters against each other, w/ pure compression data

In [None]:
# Compressors that have a line/marker style in the two-axis plot.
comp_idx = [
    'gzip', 'lzma', 'bzip2', 'zlib',
    'lz4',
    'zstd',
    'blosc-zlib', 'blosc-lz4', 'blosc-zstd',
]
# One row per compressor; each row is unpacked as keyword arguments of ax.plot.
# Compressors sharing a marker shape are told apart by the grey level of the face.
compression_display_markers = pd.DataFrame(
    data={
        'marker': ['o'] * 4 + ['^', 's'] + ['p'] * 3,
        'markerfacecolor': [
            (0,0,0), (1/3,1/3,1/3), (2/3,2/3,2/3), (1,1,1),
            (0,0,0),
            (0,0,0),
            (0,0,0), (1/2,1/2,1/2), (1,1,1),
        ],
        'markeredgecolor': [(0,0,0)] * len(comp_idx),
        'linestyle': [':'] * len(comp_idx),
        'color': [(0,0,0)] * len(comp_idx),
    },
    index=comp_idx,
)

In [None]:
def view_data2(save_path, df, compression_options, compression_display_markers,**kwargs):
    """Plot the second column of ``df`` against the first, one line per compressor.

    Only rows benchmarked with the 'Shuffle' filter are used; the points of one
    compressor (one per level) are joined into a single line.

    Compressors without a row in ``compression_display_markers``, and levels
    without a ``('<compressor>-<level>', 'Shuffle')`` row in ``df``, are skipped
    instead of raising ``KeyError``.

    Parameters
    ----------
    save_path : path handed to ``fig.savefig``.
    df : DataFrame indexed by (compression option, filter option); column 0 is
        drawn on the x axis and column 1 on the y axis.
    compression_options : mapping of compressor name -> iterable of levels.
    compression_display_markers : DataFrame indexed by compressor name whose
        columns are keyword arguments of ``ax.plot``.
    **kwargs : forwarded to ``ax.set``.
    """
    fig, ax = plt.subplots()
    ax.set(**kwargs)
    x_col, y_col = df.columns[0], df.columns[1]
    for comp_name, comp_opt in compression_options.items():
        if comp_name not in compression_display_markers.index:
            # No style defined for this compressor, so it is not part of this plot.
            continue
        idx_name_list = [(f'{comp_name}-{o}','Shuffle') for o in comp_opt]
        idx_name_list = [key for key in idx_name_list if key in df.index]
        if not idx_name_list:
            continue
        x = df.loc[idx_name_list, x_col]
        y = df.loc[idx_name_list, y_col]
        ax.plot(x,y,**dict(compression_display_markers.loc[comp_name]))
    fig.savefig(save_path)

In [None]:
# Visualization 2 driver: per dataset, plot compression ratio against compression
# speed and against decompression speed.
for name, df in benchmark_table.items():
    # Mean of every benchmark column per (compression option, filter option) pair.
    grp_df = (df
        .groupby(['compression option', 'filter option'])
        .mean()
    )
    for bench_params, xyscale in zip(
            [('compression speed (bytes/sec)', 'compression ratio'),('decompression speed (bytes/sec)', 'compression ratio')],
            [('log','linear'),('log','linear')]
        ):
        # File name: both column names without their "(bytes/sec)" suffix, joined by "-".
        # Built outside the f-string so no quote is nested (a SyntaxError before Python 3.12).
        param_stem = '-'.join(p.split('(')[0].strip() for p in bench_params)
        save_path = f'../figure/view-bench-2-two-axis/{name}-{param_stem}.svg'
        # A list makes it unambiguous that two columns are selected (x first, y second).
        sub_df = grp_df.loc[:, list(bench_params)]
        view_data2(save_path, sub_df, compression_options, compression_display_markers,
                   xscale = xyscale[0], yscale = xyscale[1])

### Visualization 3

Show compression improvement for each filter

In [None]:
def view_data3(save_path, bench_by_filter, **kwargs):
    """Draw one box per filter, print the per-filter medians and save the figure.

    Parameters
    ----------
    save_path : path handed to ``fig.savefig``.
    bench_by_filter : mapping of filter name -> values offering ``.median()``;
        the keys become the tick labels, in mapping order.
    **kwargs : forwarded to ``ax.set``.
    """
    fig, ax = plt.subplots()
    ax.set(**kwargs)
    ax.boxplot(
        bench_by_filter.values(),
        tick_labels=bench_by_filter.keys(),
    )
    medians = {}
    for filt_name, values in bench_by_filter.items():
        medians[filt_name] = values.median()
    print(medians)
    fig.savefig(save_path)

In [None]:
# Visualization 3 driver: per dataset, a box plot of the compression ratio grouped by filter.
for name, df in benchmark_table.items():
    # Rows whose compression option is 'none' are left out of every group.
    is_compressed = df['compression option'] != 'none'
    bench_by_filter = {}
    for filt_name in filter_options:
        bench_by_filter[filt_name] = df[(df['filter option'] == filt_name) & is_compressed]

    bench_param = 'compression ratio'
    sub_bench_by_filter = {
        filt_name: frame[bench_param] for filt_name, frame in bench_by_filter.items()
    }
    view_data3(
        f'../figure/view-bench-3-filter/{name}-{bench_param.split("(")[0].strip()}.svg',
        sub_bench_by_filter,
        **kwargs_list[name][bench_param],
    )
    # Only 'compression ratio' is plotted here. An earlier, disabled variant looped
    # the same three steps over 'compression speed (bytes/sec)' and
    # 'decompression speed (bytes/sec)' as well.
    

### Visualization 4: bar plot for w/ filters

In [None]:
# Axis settings for Visualization 4, looked up as kwargs_list[dataset][benchmark column].
# NOTE: this rebinds the kwargs_list defined for Visualization 1 and covers only three
# datasets; the datasets differ only in the compression-ratio axis range.
kwargs_list = {
    dataset: {
        'compression ratio': {
            'ylim': ratio_ylim,
            'yscale': 'linear',
        },
        'compression speed (bytes/sec)': {
            'ylim': (10**7.5,10**9.5),
            'yscale': 'log',
        },
        'decompression speed (bytes/sec)': {
            'ylim': (10**7,10**9.5),
            'yscale': 'log',
        },
    }
    for dataset, ratio_ylim in (
        ('embryo', (1,6)),
        ('tissue-on', (1,6)),
        ('tissue-off', (1,2)),
    )
}

In [None]:
def view_data4(save_path, df, compression_options, filter_options, filter_display_colors, **kwargs):
    """Plot one stack of overlaid bars per (compressor, filter) pair and save it.

    All levels of a compressor share one x position for a given filter. Bars are
    drawn from the tallest to the shortest so that every level stays visible.

    The shade of a bar is chosen by the position of its level inside the
    ``compression_options`` entry, so a level keeps the same shade whatever its
    height rank is. (``np.argsort`` of the levels in drawing order gives the
    sorting permutation, not each bar's own rank, and mixed shades up.)

    (filter, "<compressor>-<level>") pairs missing from ``df`` are skipped
    instead of raising ``KeyError``.

    Parameters
    ----------
    save_path : path handed to ``fig.savefig``.
    df : Series indexed first by filter name and then by "<compressor>-<level>".
    compression_options : mapping of compressor name -> sequence of levels.
    filter_options : filter names to draw, in drawing order.
    filter_display_colors : mapping of filter name -> sequence of shades.
    **kwargs : forwarded to ``ax.set``.
    """
    fig, ax = plt.subplots()
    ax.set(**kwargs)
    x = 0
    tick_loc = []
    tick_label = []
    available = set(df.index)

    for comp_name, comp_degree in compression_options.items():
        # "<compressor>-<level>" -> shade index of that level.
        shade_of = {f'{comp_name}-{deg}': d_idx for d_idx, deg in enumerate(comp_degree)}
        drew_any = False
        for filt in filter_options:
            comp_opt = [opt for opt in shade_of if (filt, opt) in available]
            if not comp_opt:
                # This compressor was not benchmarked with this filter.
                continue
            y = (df
                 .loc[filt]
                 .loc[comp_opt]
                 .sort_values(ascending=False)
            )
            c = [filter_display_colors[filt][shade_of[opt]] for opt in y.index]
            ax.bar([x] * len(y), y, color=c)
            tick_loc.append(x)
            tick_label.append(f'{comp_name}-{filt}')
            x += 1
            drew_any = True
        if drew_any:
            # Leave one empty position between compressors.
            x += 1
    ax.set_xticks(tick_loc, tick_label, rotation=90)
    if tick_loc:
        ax.set_xlim(-1, tick_loc[-1] + 1)
    fig.savefig(save_path)


In [None]:
# Visualization 4 driver: per configured dataset, one filter-bar figure per benchmark column.
for name, df in benchmark_table.items():
    if name not in kwargs_list:
        # kwargs_list covers only the datasets selected for this visualization;
        # indexing it with any other dataset name would raise KeyError.
        continue
    # Filter first, compression option second: the index order view_data4 expects.
    grp_df = (df
        .groupby(['filter option', 'compression option'])
        .mean()
    )
    for bench_param in ['compression ratio', 'compression speed (bytes/sec)', 'decompression speed (bytes/sec)']:
        bench_filter_options = filter_options
        kwargs = kwargs_list[name][bench_param]
        srs = grp_df.loc[:,bench_param]
        # The file name drops the "(bytes/sec)" unit suffix. Double quotes inside the
        # f-string keep this valid on Python versions older than 3.12.
        save_path = f'../figure/view-bench-4-filter-bar/{name}-{bench_param.split("(")[0].strip()}.svg'
        view_data4(save_path, srs, compression_options, bench_filter_options, filter_display_colors, **kwargs)

### Visualization 5: bottleneck graph

In [None]:
def cm_map(v):
    """Return the 'PiYG' colormap colour for ``v`` as an RGB array darkened to 80 %.

    The last component returned by the colormap (alpha) is dropped and the
    remaining components are each multiplied by 0.8.
    """
    rgba = cm['PiYG'](v)
    rgb = rgba[:-1]
    return np.array([0.8 * channel for channel in rgb])

In [None]:
def bottleneck(comp_ratio, comp_speed, decomp_speed, transfer_rate = 2**7): # 1Gbps = 128MB/s
    """Return the smallest of the three pipeline stage rates (higher is better).

    The stages are the transfer stage, taken as ``comp_ratio * transfer_rate``,
    the compression speed and the decompression speed.

    NOTE(review): the default ``transfer_rate`` is annotated as 128 MB/s, while
    the speed columns passed in elsewhere in this notebook are named bytes/sec.
    Callers should pass ``transfer_rate`` in the same unit as the two speeds.
    """
    effective_transfer = comp_ratio * transfer_rate
    stage_rates = (effective_transfer, comp_speed, decomp_speed)
    return min(stage_rates)

In [None]:
def view_data5(save_path, rank_df, **kwargs):
    """Plot each compression option's rank against the transfer rate and save it.

    Parameters
    ----------
    save_path : path handed to ``fig.savefig``.
    rank_df : DataFrame indexed by compression option with columns named
        "rank-<e>"; the integer ``e`` is read as an exponent, so that column is
        plotted at x = 2**e. The line colour comes from the last column of the frame.
    **kwargs : forwarded to ``ax.set``.
    """
    fig, ax = plt.subplots()
    ax.set(**kwargs)

    # Exponents of the tested transfer rates, recovered from the "rank-<e>" column names.
    exponents = sorted(
        int(col.rsplit('-',1)[-1]) for col in rank_df.columns if col.startswith('rank')
    )
    lowest, highest = exponents[0], exponents[-1]
    rates = [2**e for e in exponents]
    rank_columns = [f'rank-{e}' for e in exponents]
    # Labels sit one power of two outside the plotted range on either side.
    left_label_x = 2**(lowest-1)
    right_label_x = 2**(highest+1)

    # Colour every option by its rank in the last column, scaled to [0, 1].
    final_ranks = rank_df.iloc[:,-1]
    top_rank = len(final_ranks)-1
    compression_options_color = {}
    for option, final_rank in zip(rank_df.index.tolist(), final_ranks):
        compression_options_color[option] = cm_map(final_rank/top_rank)

    for option, colour in compression_options_color.items():
        ranks = rank_df.loc[option,rank_columns]
        ax.plot(rates, ranks, color=colour,marker='o')
        # Name the line at both of its ends.
        ax.text(left_label_x, ranks[f'rank-{lowest}'], option, ha = 'right')
        ax.text(right_label_x, ranks[f'rank-{highest}'], option)
    fig.savefig(save_path)

In [None]:
# Stack the benchmark rows of every dataset into one frame.
concat_df = pd.concat(list(benchmark_table.values()),axis=0,join='outer')
# remove 'compression option' == 'none' case
concat_df = concat_df[concat_df['compression option'] != 'none']
# remove 'BitRound' used cases
concat_df = concat_df[concat_df['filter option'].map(lambda x: not x.startswith('BitRound'))]
# remove nvcomp and zlibng used cases
concat_df = concat_df[concat_df['compression option'].map(lambda x: not x.startswith('Nvcomp'))]
concat_df = concat_df[concat_df['compression option'].map(lambda x: not x.startswith('Zlibng'))]
# Keep every remaining row of the PCodec / ZFP options (whatever its filter) and,
# for all options, the rows measured with the 'Shuffle' filter.
# NOTE(review): a PCodec/ZFP row that also uses 'Shuffle' would match both
# selections and be counted twice in the mean below — confirm no such rows exist.
concat_df_none_only = concat_df[concat_df['compression option'].map(lambda x: ('PCodec' in x) or ('ZFP' in x))]
concat_df_shuffle_only = concat_df[concat_df['filter option'].map(lambda x: x == 'Shuffle')]
filter_df = pd.concat([concat_df_none_only, concat_df_shuffle_only], axis=0, join='outer')

# evaluate rank based on the bottleneck measurement
# Drop the filter column so that the mean is taken per compression option only
# (over all datasets, since the frames were concatenated above).
filter_df.pop('filter option')
rank_df = (filter_df
    .groupby(['compression option'])
    .mean()
)

# Add one "rank-<e>" column per tested transfer rate 2**e, for e = 18, 21, ..., 33.
base_transfer_rate_range = range(18,36,3)
for base_transfer_rate in base_transfer_rate_range: # NOTE(review): the original note gave the unit as MB/s (0.1 Gbps, 1 Gbps, 10 Gbps, 100 GB/s), but the speed columns are named bytes/sec — confirm the intended unit
    transfer_rate = 2**base_transfer_rate
    # Bottleneck value per compression option, sorted ascending.
    rank = (rank_df
        .apply(lambda x: bottleneck(x['compression ratio'],x['compression speed (bytes/sec)'],x['decompression speed (bytes/sec)'],transfer_rate=transfer_rate),axis=1)
        .sort_values()
    )
    # Rank 0 goes to the smallest bottleneck value, so a larger rank means a better option.
    rank_df.loc[rank.index, f'rank-{base_transfer_rate}'] = np.arange(len(rank))
save_path = f'../figure/view-bench-5-bottlneck/ranking.svg'
view_data5(save_path, rank_df, xscale = 'log')
# Check all-rounder
# For every option take its lowest (worst) rank over all transfer rates, then
# print the seven options whose worst rank is highest.
rank = (rank_df[[f'rank-{t}' for t in base_transfer_rate_range]]
    .apply(min, axis=1))
print(rank.sort_values(ascending=False)[:7])
