# Our Amazing Benchmark Evaluation

### Set path to log directory here.

In [None]:
# Root directory that is walked recursively for benchmark log files.
# Change this path to wherever the benchmark results live on your machine.
LOG_DIR = "/Users/law/drive/msc/m3/adb/bm_results"

##  Run all cells and all the plots will magically appear :)

### Functions

In [None]:
import re

class BMRunInfo:
    """Everything extracted from a single benchmark log file.

    Attributes:
        bm_type: label of the operation section seen in the log; the parser
            in this file sets it to "INSERT", "READ" or "" (none seen).
        bm_infos: the periodic status entries of the run, in file order.
        final_info: dict mapping a summary metric name to its value.
    """

    def __init__(self, bm_type, bm_infos, final_info):
        self.bm_type = bm_type
        self.bm_infos = bm_infos
        self.final_info = final_info

    def __repr__(self):
        # The notebook prints whole collections of runs; without a repr those
        # prints only show object addresses. The status entries are
        # summarised by count to keep the output readable.
        return (
            f"{type(self).__name__}(bm_type={self.bm_type!r}, "
            f"bm_infos=<{len(self.bm_infos)} entries>, "
            f"final_info={self.final_info!r})"
        )

# Example status line (a single line in the log; wrapped here for readability):
# 2018-08-12 18:56:45:761 10 sec: 4262 operations; 426.2 current ops/sec; est completion in 1 hour 18 minutes
# [INSERT: Count=4266, Max=951807, Min=2522, Avg=12826.31, 90=16719, 99=58399, 99.9=947199, 99.99=951807]

# The decimal point is escaped so that only a real "." separates the integer
# and fractional digits (an unescaped "." would accept any character there).
# The two decimal fields accept "12", "12.5", "12." and ".5".
BM_INFO_RE = re.compile(
    r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}:\d{3} '
    r'(\d{2,}) sec: (\d+) operations; (\d+\.?\d*|\.\d+) current ops/sec; .* '
    r'\[[A-Z]+: Count=(\d*), Max=(\d*), Min=(\d*), Avg=(\d*\.?\d*), '
    r'90=(\d*), 99=(\d*), 99\.9=(\d*), 99\.99=(\d*)\]'
)


class BMInfo:
    """One periodic status line of a benchmark log, split into its fields.

    Every attribute holds the text captured from the line (a ``str``); no
    numeric conversion is done here.

    Attributes:
        seconds: elapsed seconds printed before "sec:".
        total_ops: operation count printed before "operations;".
        ops_per_sec: value printed before "current ops/sec;".
        ops_per_interval: the "Count=" field of the bracketed section.
        max_latency, min_latency, avg_latency: the "Max=", "Min=" and
            "Avg=" fields.
        p90, p99, p999, p9999: the "90=", "99=", "99.9=" and "99.99=" fields.

    Raises:
        ValueError: if ``log_line`` does not match ``BM_INFO_RE``.
    """

    def __init__(self, log_line):
        match = BM_INFO_RE.match(log_line)
        if match is None:
            raise ValueError("Bad log line: " + log_line)
        self.seconds = match.group(1)
        self.total_ops = match.group(2)
        self.ops_per_sec = match.group(3)
        self.ops_per_interval = match.group(4)
        self.max_latency = match.group(5)
        self.min_latency = match.group(6)
        self.avg_latency = match.group(7)
        # Misspelled legacy name, kept so existing readers of it still work.
        self.avg_latecny = self.avg_latency
        self.p90 = match.group(8)
        self.p99 = match.group(9)
        self.p999 = match.group(10)
        self.p9999 = match.group(11)

In [None]:
def get_bm_run_from_file(file_name):
    """Parse one benchmark log file into a ``BMRunInfo``.

    Two kinds of lines are used:

    * status lines, recognised by a leading "2018-" (the initial
      "0 sec: 0 operations;" line is skipped), each parsed into a ``BMInfo``;
    * summary lines starting with "[INSERT],", "[OVERALL]," or "[READ],",
      e.g. ``[OVERALL], Throughput(ops/sec), 8293.62759124027``.

    Summary values are truncated to ``int`` and stored under the metric name
    only, so a later line with the same metric name overwrites an earlier one.

    Args:
        file_name: path of the log file to read.

    Returns:
        A ``BMRunInfo`` whose ``bm_type`` is "INSERT" or "READ" according to
        the last such section seen, or "" if there was none.

    Raises:
        ValueError: if a status line does not have the expected format.
    """
    bm_infos = []
    final_info = {}
    bm_type = ""
    # The context manager closes the file even when a malformed line raises.
    with open(file_name) as log_file:
        for line in log_file:
            # NOTE: status lines are detected by the year prefix, so only logs
            # written in 2018 contribute status entries.
            if line.startswith("2018-") and "0 sec: 0 operations;" not in line:
                bm_infos.append(BMInfo(line))

            # [INSERT], AverageLatency(us), 1851.142024
            # [OVERALL], Throughput(ops/sec), 8293.62759124027
            # [READ], AverageLatency(us), 1948.265838
            elif line.startswith(("[INSERT],", "[OVERALL],", "[READ],")):
                split_line = line.split(', ')
                final_info[split_line[1]] = int(float(split_line[2].strip()))

            if line.startswith('[INSERT]'):
                bm_type = "INSERT"
            elif line.startswith('[READ]'):
                bm_type = "READ"

    return BMRunInfo(bm_type, bm_infos, final_info)

In [None]:
import os
from collections import defaultdict

# Optional "run_<n>-" prefix (one or two digits) that numbers repeated runs.
RUN_FILE_RE = re.compile(r"^(run_\d{1,2}-)")


def get_bm_runs(log_dir):
    """Collect the parsed runs of every log file found below ``log_dir``.

    Only ``.txt`` and ``.log`` files are read, and files ending in
    ``FAILED.txt`` are ignored. A leading ``run_<n>-`` is stripped from the
    file name to form the key, so repetitions of one configuration end up in
    the same list.

    Args:
        log_dir: directory that is walked recursively.

    Returns:
        A ``defaultdict(list)`` mapping file key to the list of parsed runs.
    """
    print(log_dir)
    runs = defaultdict(list)
    for dir_path, _subdirs, file_names in os.walk(log_dir):
        for file_name in file_names:
            is_log_file = file_name.endswith((".txt", ".log"))
            if not is_log_file or file_name.endswith("FAILED.txt"):
                continue

            prefix = RUN_FILE_RE.match(file_name)
            # The pattern is anchored at the start, so the end of the match
            # is exactly the length of the prefix to drop.
            file_key = file_name if prefix is None else file_name[prefix.end():]

            full_path = os.path.join(dir_path, file_name)
            runs[file_key].append(get_bm_run_from_file(full_path))
    return runs
    

In [None]:
from collections import defaultdict

def get_final_info(runs, file_type):
    """Gather the ``final_info`` of every run whose file key has a given prefix.

    Args:
        runs: mapping of file key to a list of run objects.
        file_type: prefix a file key must start with to be included.

    Returns:
        A ``defaultdict(list)`` mapping each selected file key to the
        ``final_info`` of its runs; keys without any run are not created.
    """
    selected = defaultdict(list)
    for file_key, file_runs in runs.items():
        if not file_key.startswith(file_type):
            continue
        for run in file_runs:
            selected[file_key].append(run.final_info)

    return selected

In [None]:
# Parse every log below LOG_DIR once, then split the end-of-run summaries by
# file-name prefix: LOADS for files starting with 'load', READS for the rest
# of the experiment.
RUNS = get_bm_runs(LOG_DIR)
LOADS = get_final_info(RUNS, 'load')
# NOTE(review): the quorum helpers in this notebook look up "run_..." file
# keys in READS, which only exist when the prefix below is 'run'.
READS = get_final_info(RUNS, 'read')  # or 'run'

In [None]:
# Debug output: dump the parsed data to check that the expected files were found.
print(RUNS)
print(LOADS)
print(READS)

In [None]:
def capacity_from_file(file):
    """Extract the capacity encoded in a log file name.

    Example: ``read_capacity_1000-num_stores_1.txt`` -> ``1000``.

    Args:
        file: file name containing ``capacity_<digits>``.

    Returns:
        The capacity as an ``int``.

    Raises:
        ValueError: if the name contains no ``capacity_<digits>`` part.
    """
    # At least one digit is required: with zero digits allowed, a name such
    # as "capacity_x" would match with an empty group and fail inside int()
    # with an unrelated message instead of the one below.
    match = re.search(r'capacity_(\d+)', file)
    if match is None:
        raise ValueError("bad file name: " + file)

    return int(match.group(1))

def r_w_from_file(file):
    """Extract the ``r`` and ``w`` numbers encoded in a log file name.

    Example: ``load_n50_r17_w34.log`` -> ``(17, 34)``.

    Args:
        file: file name containing ``r<digits>_w<digits>``.

    Returns:
        A ``(r, w)`` tuple of ints.

    Raises:
        ValueError: if the name contains no ``r<digits>_w<digits>`` part.
    """
    found = re.search(r'r(\d+)_w(\d+)', file)
    if found is not None:
        r_text, w_text = found.groups()
        return int(r_text), int(w_text)

    raise ValueError("bad file name: " + file)

## Plots

In [None]:
from matplotlib import rcParams
# Global matplotlib settings for every figure created below.
# NOTE(review): 'figure.autolayout' presumably makes figures fit their labels
# automatically, and 'pgf.rcfonts': False presumably keeps the .pgf exports
# from taking their fonts from rcParams - confirm against the matplotlib docs.
rcParams.update({'figure.autolayout': True, 'pgf.rcfonts' : False})

In [None]:
def print_metric_single_line(runs, metric, capacity, name):
    """Print the sorted values of ``metric`` for one capacity on one line.

    The values are read from ``runs`` under the key
    ``<name>_capacity_<capacity>-num_stores_1.txt`` and printed after the
    capacity, which is right-aligned in eight columns.
    """
    label = '%8d' % capacity
    file_key = f"{name}_capacity_{capacity}-num_stores_1.txt"
    values = [run[metric] for run in runs[file_key]]
    values.sort()
    print(label, values)

def print_metric_single(runs, metric, name):
    """Print ``metric`` for each of the fixed partition capacities.

    Output starts with a blank gap and a ``<name> <metric>`` header, followed
    by one line per capacity in ascending order.
    """
    print("\n")
    print(name, metric)
    for capacity in (1000, 10000, 50000, 100000, 250000, 500000, 1000000, 10000000):
        print_metric_single_line(runs, metric, capacity, name)

def print_metrics(metrics):
    """Print every metric per capacity, first for LOADS and then for READS."""
    for metric in metrics:
        for final_infos, name in ((LOADS, "load"), (READS, "read")):
            print_metric_single(final_infos, metric, name)

In [None]:
def print_quorum_metric_line(runs, metric, r, w, name):
    """Print the sorted values of ``metric`` for one (r, w) pair on one line.

    The values are read from ``runs`` under the key
    ``<name>_n50_r<r>_w<w>.log``, e.g. ``load_n50_r17_w34.log``.
    """
    label = 'R: %2d, W: %2d' % (r, w)
    file_key = f"{name}_n50_r{r}_w{w}.log"
    values = [run[metric] for run in runs[file_key]]
    values.sort()
    print(label, values)
    
def print_quorum_metric_single(runs, metric, name):
    """Print ``metric`` for each of the fixed (r, w) quorum configurations.

    Output starts with a blank gap and a ``<name> <metric>`` header, followed
    by one line per configuration.
    """
    print("\n")
    print(name, metric)
    for r_value, w_value in ((1, 50), (13, 38), (17, 34), (25, 26)):
        print_quorum_metric_line(runs, metric, r_value, w_value, name)

def print_quorum_metrics(metrics):
    """Print every metric per quorum configuration.

    LOADS is queried with the file-name prefix "load" and READS with the
    prefix "run".
    """
    for metric in metrics:
        for final_infos, name in ((LOADS, "load"), (READS, "run")):
            print_quorum_metric_single(final_infos, metric, name)

In [None]:
# (metric key, output file stem) pairs that drive the plotting cells below:
# the first element is the key looked up in each run's summary and used as
# the y-axis label, the second becomes part of the saved figure file names.
# Commented-out entries are metrics that are currently not plotted.
INFO_NAMES = [
    ('Throughput(ops/sec)', 'throughput'),
#     ('RunTime(ms)', 'runtime'),
    ('AverageLatency(us)', 'avg_latency'),
#     ('MinLatency(us)', 'min_latency'),
#     ('MaxLatency(us)', 'max_latency'),
#     ('95thPercentileLatency(us)', '95p_latency'),
    ('99thPercentileLatency(us)', '99p_latency')
]

In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict

def get_nested_max(a, b):
    """Return the largest value found in two sequences of sequences.

    Raises:
        ValueError: if ``a`` or ``b`` is empty or contains an empty sequence.
    """
    largest_in_a = max(map(max, a))
    largest_in_b = max(map(max, b))
    return max(largest_in_a, largest_in_b)

def get_info_per_capacity(runs, info_name):
    """Group one summary metric by capacity, ordered by ascending capacity.

    Args:
        runs: mapping of file name to a list of summary dicts.
        info_name: key of the metric to pick out of every summary dict.

    Returns:
        ``(caps, infos)``: a tuple of capacities and a parallel tuple holding,
        for each capacity, the list of metric values of its runs.

    Raises:
        ValueError: if a file name carries no capacity, or ``runs`` is empty
            (there is then nothing to unpack into the two results).
    """
    per_capacity = sorted(
        (
            (capacity_from_file(file_name), [summary[info_name] for summary in summaries])
            for file_name, summaries in runs.items()
        ),
        key=lambda entry: entry[0],
    )
    # Transpose the (capacity, values) pairs into two parallel tuples.
    caps, infos = zip(*per_capacity)
    return caps, infos

def get_info_quorum(runs, info_name):
    """Group one summary metric by (r, w) configuration, ordered ascending.

    Args:
        runs: mapping of file name to a list of summary dicts.
        info_name: key of the metric to pick out of every summary dict.

    Returns:
        ``(quorums, infos)``: a tuple of ``(r, w)`` pairs and a parallel tuple
        holding, for each pair, the list of metric values of its runs.

    Raises:
        ValueError: if a file name carries no r/w part, or ``runs`` is empty
            (there is then nothing to unpack into the two results).
    """
    per_quorum = sorted(
        (
            (r_w_from_file(file_name), [summary[info_name] for summary in summaries])
            for file_name, summaries in runs.items()
        ),
        key=lambda entry: entry[0],
    )
    # Transpose the ((r, w), values) pairs into two parallel tuples.
    quorums, infos = zip(*per_quorum)
    return quorums, infos

def _boxplot_per_capacity(infos, caps, info_name, max_ylim, legend_label, out_stem, save_fig):
    """Draw, optionally save, and show one box plot with a box per capacity."""
    # whis=(0, 100) places the whiskers at the minimum and maximum of the
    # data. It replaces the string "range", which current matplotlib
    # releases no longer accept.
    plt.boxplot(infos, labels=[str(x) for x in caps], showfliers=False, whis=(0, 100))
    plt.ylabel(info_name)
    plt.xlabel("Partition capacity")
    plt.ylim(ymin=0, ymax=max_ylim)
    plt.xticks(rotation=90)
    plt.legend([legend_label])

    if save_fig:
        for extension in ("svg", "pgf", "png"):
            plt.savefig(f"{out_stem}.{extension}")
    plt.show()


def plot_info_per_capacity(info_name, out_file_name, save_fig = False):
    """Plot one metric against partition capacity for LOADS and for READS.

    Two box plots are shown, first the load ("INSERT") data and then the read
    ("READ") data. Both use the same y-axis range, 0 to 110% of the largest
    value in either data set, so they can be compared directly.

    Args:
        info_name: metric key to plot; also used as the y-axis label.
        out_file_name: stem of the saved files, which are named
            ``load_<stem>`` and ``read_<stem>`` with the extensions
            ``.svg``, ``.pgf`` and ``.png``.
        save_fig: when true, write the figures to the working directory.
    """
    print(info_name, "\n" + ("=") * len(info_name))
    load_caps, load_infos = get_info_per_capacity(LOADS, info_name)
    read_caps, read_infos = get_info_per_capacity(READS, info_name)

    max_ylim = get_nested_max(load_infos, read_infos) * 1.1

    _boxplot_per_capacity(
        load_infos, load_caps, info_name, max_ylim, "INSERT", f"load_{out_file_name}", save_fig
    )
    _boxplot_per_capacity(
        read_infos, read_caps, info_name, max_ylim, "READ", f"read_{out_file_name}", save_fig
    )
    
def _bar_per_quorum(infos, quorums, info_name, max_ylim, legend_label, out_stem, save_fig):
    """Draw, optionally save, and show one bar chart with a bar per quorum."""
    # The per-quorum value lists are flattened into the bar heights.
    # NOTE(review): this assumes exactly one value per quorum configuration,
    # otherwise heights and bar positions differ in length - confirm.
    heights = [value for per_quorum in infos for value in per_quorum]
    tick_labels = [f"r={r},\nw={w}" for r, w in quorums]
    plt.bar(list(range(len(quorums))), heights, tick_label=tick_labels, width=0.3, color="white", edgecolor="black")
    plt.ylabel(info_name)
    plt.xlabel("Quorum configuration")
    plt.ylim(ymin=0, ymax=max_ylim)
    plt.legend([legend_label])

    if save_fig:
        for extension in ("svg", "pgf", "png"):
            plt.savefig(f"{out_stem}.{extension}")
    plt.show()


def plot_quorum_size(info_name, out_file_name, save_fig = False):
    """Plot one metric against the quorum configuration for LOADS and READS.

    Two bar charts are shown, first the load ("INSERT") data and then the
    read ("READ") data. Both use the same y-axis range, 0 to 110% of the
    largest value in either data set. Each chart is labelled with its own
    quorum configurations.

    Args:
        info_name: metric key to plot; also used as the y-axis label.
        out_file_name: stem of the saved files, which are named
            ``multi_load_<stem>`` and ``multi_read_<stem>`` with the
            extensions ``.svg``, ``.pgf`` and ``.png``.
        save_fig: when true, write the figures to the working directory.
    """
    print(info_name, "\n" + ("=") * len(info_name))
    load_quorums, load_infos = get_info_quorum(LOADS, info_name)
    read_quorums, read_infos = get_info_quorum(READS, info_name)

    max_ylim = get_nested_max(load_infos, read_infos) * 1.1

    _bar_per_quorum(
        load_infos, load_quorums, info_name, max_ylim, "INSERT", f"multi_load_{out_file_name}", save_fig
    )
    # The read chart is labelled from read_quorums; taking the labels from
    # the load data would mislabel it whenever the two sets differ.
    _bar_per_quorum(
        read_infos, read_quorums, info_name, max_ylim, "READ", f"multi_read_{out_file_name}", save_fig
    )


In [None]:
# Capacity experiment: one pair of box plots (load and read) per configured
# metric, also written to disk.
for info_name, out_file_name in INFO_NAMES:
    plot_info_per_capacity(info_name, out_file_name, save_fig=True)

In [None]:
# Quorum experiment: one pair of bar charts (load and read) per configured
# metric, also written to disk.
for info_name, out_file_name in INFO_NAMES:
    plot_quorum_size(info_name, out_file_name, save_fig=True)

In [None]:
# Summary metric keys whose raw values are printed per quorum configuration.
METRICS = [
    'Throughput(ops/sec)',
    'AverageLatency(us)',
    '99thPercentileLatency(us)'
]

print_quorum_metrics(METRICS)

In [None]:
import numpy as np

# Scratch calculation on a hand-entered sample: its standard deviation
# (np.std with default arguments) and that deviation relative to the mean.
data = [1545, 1579, 1597, 1621, 1636, 1668, 1700, 1764, 1807, 1834, 1907, 1957]
dev = np.std(data)
mean = np.mean(data)

print(dev)
print(dev / mean)