In [None]:
pip install pandas matplotlib scipy numpy seaborn 

In [None]:
import copy
import pandas
import warnings
import requests
import scipy
from scipy import stats
import subprocess
import numpy as np
from matplotlib import pyplot as plt
import json
import pprint
import seaborn
import os
import sys
from pathlib import Path
from datetime import timedelta

sys.path.insert(0, "../src")
from perf_tools.analysis import make_differential_frame, get_data, get_summary_statistics
from perf_tools.analysis import check_are_close, make_latency_plot, plot_latency_stats

In [None]:
class YCSBLikeWorkload:
    """Lazy accessor and plotting front-end for one YCSB-like benchmark run.

    A run is identified by ``(patch_id, variant, task_name, execution)``; its
    metric files are looked up at
    ``<workdir>/<patch_id>/<variant>/<task_name>/<execution>/<metric>.json``.
    Each metric is read through ``get_data`` on first access and then cached on
    the instance, so repeated plotting does not re-read the file.

    Constructing an instance creates the plot output directory (see
    ``plots_path``) if it does not already exist.
    """

    def __init__(self, workdir, patch_id, variant, execution, task_name):
        """Record the run identity and make sure the plot directory exists.

        Args:
            workdir: Root directory holding the data set and the ``plots`` tree.
            patch_id: Patch identifier; used as a path component.
            variant: Build variant name; a variant containing ``"shard"`` is
                treated as sharded, anything else as a replica set.
            execution: Execution number of the task (converted with ``str``
                when paths are built).
            task_name: Name of the benchmark task; used as a path component and
                in plot titles.
        """
        self.workdir = workdir
        self.patch_id = patch_id
        self.variant = variant
        self.execution = execution
        self.task_name = task_name
        # Per-metric caches, filled on first use by the get_*_data accessors.
        self.load_data = None
        self.readonly_data = None
        self.updateonly_data = None
        self.evensplit_read_data = None
        self.evensplit_write_data = None
        self.readheavy_read_data = None
        self.readheavy_write_data = None
        self.sharded = "shard" in variant
        Path(self.plots_path()).mkdir(parents=True, exist_ok=True)

    def json_path(self, metric):
        """Return the path of the ``<metric>.json`` file belonging to this run."""
        return os.path.join(self.workdir, self.patch_id, self.variant,
            self.task_name, str(self.execution), metric + ".json")

    def plots_path(self):
        """Return the directory plots are saved to (``sharded`` or ``replset``)."""
        return os.path.join(self.workdir, "plots", "sharded" if self.sharded else "replset")

    def _cached(self, attr, metric):
        """Return the data cached in attribute ``attr``, loading ``metric`` on first use."""
        data = getattr(self, attr)
        if data is None:
            data = get_data(self.json_path(metric))
            setattr(self, attr, data)
        return data

    def get_load_data(self):
        """Return the load-phase insert data."""
        return self._cached("load_data", "YCSBLike.load.inserts")

    def get_100read_data(self):
        """Return the read data of the 100read phase."""
        return self._cached("readonly_data", "YCSBLike.100read.reads")

    def get_100update_data(self):
        """Return the write data of the 100update phase."""
        return self._cached("updateonly_data", "YCSBLike.100update.writes")

    def get_50read50update_data(self):
        """Return the ``(reads, writes)`` data pair of the 50read50update phase."""
        reads = self._cached("evensplit_read_data", "YCSBLike.50read50update.reads")
        writes = self._cached("evensplit_write_data", "YCSBLike.50read50update.writes")
        return reads, writes

    def get_95read5update_data(self):
        """Return the ``(reads, writes)`` data pair of the 95read5update phase."""
        reads = self._cached("readheavy_read_data", "YCSBLike.95read5update.reads")
        writes = self._cached("readheavy_write_data", "YCSBLike.95read5update.writes")
        return reads, writes

    def _plot_line_or_scatter(self, df, x, y, line=False, start=None, end=None, **kwargs):
        """Plot column ``y`` against ``x`` for the row slice ``df[start:end]``.

        Args:
            df: Frame to plot; it is sliced with ``df[start:end]`` first.
            x: Column used for the x-axis.
            y: Column used for the y-axis.
            line: Truthy for a line plot, otherwise a scatter plot.
            start: Lower slice bound, or ``None`` for an open bound.
            end: Upper slice bound, or ``None`` for an open bound.
            **kwargs: Forwarded to the plotting call. ``figsize`` defaults to
                ``(20, 20)`` and may be overridden here.

        Returns:
            Whatever the underlying ``plot`` / ``plot.scatter`` call returns.
        """
        # A default rather than a fixed keyword: a caller-supplied figsize used
        # to collide with the hard-coded one and raise TypeError.
        kwargs.setdefault("figsize", (20, 20))
        rows = df[start:end]
        plotter = rows.plot if line else rows.plot.scatter
        return plotter(x=x, y=y, **kwargs)

    def _plot_phase(self, data, label, x, y, line, start, end, kwargs):
        """Plot ``data.diff_data`` with a default title built from ``label`` and ``y``."""
        # The title is only a default so that callers may pass their own.
        kwargs.setdefault("title", f"{self.variant}-{self.task_name} {label} {y}")
        return self._plot_line_or_scatter(data.diff_data, x, y, line, start, end, **kwargs)

    def plot_load_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the load-phase insert data; see ``_plot_line_or_scatter`` for arguments."""
        return self._plot_phase(self.get_load_data(), "load phase insert",
                                x, y, line, start, end, kwargs)

    def plot_100read_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the 100read read data; see ``_plot_line_or_scatter`` for arguments."""
        return self._plot_phase(self.get_100read_data(), "100read read",
                                x, y, line, start, end, kwargs)

    def plot_100update_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the 100update write data; see ``_plot_line_or_scatter`` for arguments."""
        return self._plot_phase(self.get_100update_data(), "100update update",
                                x, y, line, start, end, kwargs)

    def plot_95read5update_read_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the 95read5update read data; see ``_plot_line_or_scatter`` for arguments."""
        reads, _ = self.get_95read5update_data()
        return self._plot_phase(reads, "95read5update read",
                                x, y, line, start, end, kwargs)

    def plot_95read5update_write_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the 95read5update write data; see ``_plot_line_or_scatter`` for arguments."""
        _, writes = self.get_95read5update_data()
        return self._plot_phase(writes, "95read5update update",
                                x, y, line, start, end, kwargs)

    def plot_50read50update_read_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the 50read50update read data; see ``_plot_line_or_scatter`` for arguments."""
        reads, _ = self.get_50read50update_data()
        return self._plot_phase(reads, "50read50update read",
                                x, y, line, start, end, kwargs)

    def plot_50read50update_write_data(self, x, y, line=False, start=None, end=None, **kwargs):
        """Plot the 50read50update write data; see ``_plot_line_or_scatter`` for arguments."""
        _, writes = self.get_50read50update_data()
        return self._plot_phase(writes, "50read50update update",
                                x, y, line, start, end, kwargs)


In [None]:
def save_plot(wld, axes, filename):
    """Save the figure that owns ``axes`` into the workload's plot directory.

    Args:
        wld: Object exposing ``plots_path()``, the destination directory.
        axes: Plot axes whose ``figure`` attribute is written out.
        filename: File name (not a full path) of the image to write.
    """
    figure = axes.figure
    destination = os.path.join(wld.plots_path(), filename)
    # "tight" is passed as the bounding-box option of savefig.
    figure.savefig(destination, bbox_inches="tight")

In [None]:
# Build-variant names (used as path components), keyed by cluster topology.
VARIANTS = {
    "replset": "linux-3-node-replSet-qebench",
    "sharded": "linux-shard-lite-qebench",
}
# Root directory of the baseline data set; the variations below append a suffix.
WORKDIR = "../datasets/genny/ycsblike"

# Patch ids (path components under WORKDIR) of the baseline runs.
patch_id = "63370d281e2d174e799c1141"
sharded_patch_id = "634404703066152024a287dc"
# Position in each task's execution list that selects the run to analyse.
exec_idx = 0

# Task names shared by every data-set variation in this cell.
_QE_TASKS = (
    "genny_qebench_qe_1enc_cf16",
    "genny_qebench_qe_1enc_cf32",
    "genny_qebench_qe_1enc_cfdefault",
    "genny_qebench_qe_5enc_cf16",
    "genny_qebench_qe_5enc_cf32",
    "genny_qebench_qe_5enc_cfdefault",
)
# The baseline runs additionally cover the unencrypted and FLE tasks.
_ALL_TASKS = (
    "genny_qebench_unencrypted",
    "genny_qebench_fle_1enc",
    "genny_qebench_fle_5enc",
) + _QE_TASKS

# Execution numbers to use per task (each task gets its own list object).
replset_executions = {task: [0, 1, 2, 3, 4] for task in _ALL_TASKS}
sharded_executions = {task: [0, 1, 2, 3, 4] for task in _ALL_TASKS}
# Sharded tasks whose execution numbers differ from 0..4.
# NOTE(review): presumably the skipped executions had no usable data - confirm.
sharded_executions.update({
    "genny_qebench_fle_5enc": [0, 1, 3, 5, 6],
    "genny_qebench_qe_1enc_cfdefault": [1, 2, 3, 4, 5],
    "genny_qebench_qe_5enc_cf32": [0, 1, 3, 4, 5],
})

replset_workloads = {
    task: YCSBLikeWorkload(WORKDIR, patch_id, VARIANTS["replset"], str(runs[exec_idx]), task)
    for task, runs in replset_executions.items()
}
sharded_workloads = {
    task: YCSBLikeWorkload(WORKDIR, sharded_patch_id, VARIANTS["sharded"], str(runs[exec_idx]), task)
    for task, runs in sharded_executions.items()
}

## 8-thread variation
replset_8thread_executions = {task: [0] for task in _QE_TASKS}
sharded_8thread_executions = {task: [0] for task in _QE_TASKS}
replset_8thread_workloads = {
    task: YCSBLikeWorkload(WORKDIR + "_8threads", "6352e6ea61837d2bb04547c1",
                           VARIANTS["replset"], str(runs[0]), task)
    for task, runs in replset_8thread_executions.items()
}
sharded_8thread_workloads = {
    task: YCSBLikeWorkload(WORKDIR + "_8threads", "6352e6ea61837d2bb04547c1",
                           VARIANTS["sharded"], str(runs[0]), task)
    for task, runs in sharded_8thread_executions.items()
}

## 60gb variation
replset_60gb_executions = {task: [0] for task in _QE_TASKS}
sharded_60gb_executions = {task: [0] for task in _QE_TASKS}
replset_60gb_workloads = {
    task: YCSBLikeWorkload(WORKDIR + "_60gb", "635fe75cd1fe0752e2e05ac1",
                           VARIANTS["replset"], str(runs[0]), task)
    for task, runs in replset_60gb_executions.items()
}
sharded_60gb_workloads = {
    task: YCSBLikeWorkload(WORKDIR + "_60gb", "635fe75cd1fe0752e2e05ac1",
                           VARIANTS["sharded"], str(runs[0]), task)
    for task, runs in sharded_60gb_executions.items()
}

# "_short" data set: a single-node replica-set variant, execution 0 for every task.
replset_short_workloads = {
    task: YCSBLikeWorkload(WORKDIR + "_short", "636d40c25623432c2407e2a0",
                           "linux-1-node-replSet-qebench", str(execution), task)
    for task, execution in dict.fromkeys(_QE_TASKS, 0).items()
}


In [None]:
# Column used as the x-axis of the latency plots and handed to plot_latency_stats.
row = "total_ops"
# Slice bounds forwarded to every plot helper; None leaves that bound open.
start, end = None, None
pp = pprint.PrettyPrinter()

# Workload set analysed by the cells below. Swap in one of the alternatives to
# analyse a different data set.
workloads = replset_short_workloads
# workloads = replset_workloads
# workloads = sharded_workloads
# workloads = replset_8thread_workloads
# workloads = replset_60gb_workloads
# workloads = sharded_60gb_workloads

# When True, try_save_plot writes each figure to the workload's plot directory.
save_plots = True

def try_save_plot(wld, ax, filename):
    """Save ``ax``'s figure via ``save_plot`` unless saving is switched off.

    Reads the notebook-level ``save_plots`` flag at call time; when it is falsy
    the call does nothing.
    """
    if not save_plots:
        return
    save_plot(wld, ax, filename)

In [None]:
# Load phase: insert latency, insert throughput over time, latency statistics
# and a printed summary for every selected workload.
for task, wld in workloads.items():
    ax = wld.plot_load_data(x=row, y="pure_latency(ms)", line=True, start=start, end=end)
    try_save_plot(wld, ax, f"{task}_load_latency.png")

    ax = wld.plot_load_data(
        x="ts", y="throughput", line=True, start=start, end=end,
        ylabel="throughput (ops/sec)",
    )
    try_save_plot(wld, ax, f"{task}_load_throughput_vs_ts.png")

    dfs = wld.get_load_data()
    title = f"{wld.variant} {wld.task_name} insert stats"
    ax = plot_latency_stats(
        dfs.diff_data, row,
        title=title, regr="log", start=start, end=end,
    )
    try_save_plot(wld, ax, f"{task}_load_latency_stats.png")
    pp.pprint(
        get_summary_statistics(dfs.diff_data, dfs.fixed_data, dfs.raw_data)
    )

In [None]:
# 100read phase: read latency (scatter), read throughput over time, latency
# statistics and a printed summary for every selected workload.
for task, wld in workloads.items():
    # start/end must be passed by keyword: the third positional parameter of the
    # plot_* helpers is `line`, so passing them positionally would bind `start`
    # to `line` and `end` to `start`.
    ax = wld.plot_100read_data(row, "pure_latency(ms)", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_100read_latency.png")

    ax = wld.plot_100read_data("ts", "throughput", line=True, start=start, end=end,
                               ylabel="throughput (ops/sec)")
    try_save_plot(wld, ax, f"{task}_100read_throughput_vs_ts.png")

    dfs = wld.get_100read_data()
    title = f"{wld.variant} {wld.task_name} 100read stats"
    ax = plot_latency_stats(dfs.diff_data, row, title=title, regr="line", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_100read_latency_stats.png")
    pp.pprint(get_summary_statistics(dfs.diff_data, dfs.fixed_data, dfs.raw_data))

In [None]:
# 95read5update phase: read and update latency (scatter), latency statistics
# and printed summaries for every selected workload.
for task, wld in workloads.items():
    # start/end must be passed by keyword: the third positional parameter of the
    # plot_* helpers is `line`, so passing them positionally would bind `start`
    # to `line` and `end` to `start`.
    ax = wld.plot_95read5update_read_data(row, "pure_latency(ms)", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_95read5update_read_latency.png")

    ax = wld.plot_95read5update_write_data(row, "pure_latency(ms)", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_95read5update_write_latency.png")

    # The accessor returns the (reads, writes) pair; unpack it once.
    rdfs, wdfs = wld.get_95read5update_data()
    title = f"{wld.variant} {wld.task_name} 95read5update read stats"
    ax = plot_latency_stats(rdfs.diff_data, row, title=title, regr="log", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_95read5update_read_latency_stats.png")

    title = f"{wld.variant} {wld.task_name} 95read5update update stats"
    ax = plot_latency_stats(wdfs.diff_data, row, title=title, regr="line", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_95read5update_write_latency_stats.png")

    pp.pprint(get_summary_statistics(rdfs.diff_data, rdfs.fixed_data, rdfs.raw_data))
    pp.pprint(get_summary_statistics(wdfs.diff_data, wdfs.fixed_data, wdfs.raw_data))

In [None]:
# 95read5update phase: read and update throughput over time (line plots).
for task, wld in workloads.items():
    ax = wld.plot_95read5update_read_data(
        x="ts", y="throughput", line=True, start=start, end=end,
        ylabel="throughput (ops/sec)",
    )
    try_save_plot(wld, ax, f"{task}_95read5update_read_throughput_vs_ts.png")

    ax = wld.plot_95read5update_write_data(
        x="ts", y="throughput", line=True, start=start, end=end,
        ylabel="throughput (ops/sec)",
    )
    try_save_plot(wld, ax, f"{task}_95read5update_write_throughput_vs_ts.png")

In [None]:
# 100update phase: update latency (scatter), update throughput over time,
# latency statistics and a printed summary for every selected workload.
for task, wld in workloads.items():
    # start/end must be passed by keyword: the third positional parameter of the
    # plot_* helpers is `line`, so passing them positionally would bind `start`
    # to `line` and `end` to `start`.
    ax = wld.plot_100update_data(row, "pure_latency(ms)", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_100update_latency.png")

    ax = wld.plot_100update_data("ts", "throughput", line=True, start=start, end=end,
                                 ylabel="throughput (ops/sec)")
    try_save_plot(wld, ax, f"{task}_100update_throughput_vs_ts.png")

    dfs = wld.get_100update_data()
    title = f"{wld.variant} {wld.task_name} 100update update stats"
    ax = plot_latency_stats(dfs.diff_data, row, title=title, regr="line", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_100update_latency_stats.png")

    pp.pprint(get_summary_statistics(dfs.diff_data, dfs.fixed_data, dfs.raw_data))

In [None]:
# 50read50update phase: read and update latency (scatter), latency statistics
# and printed summaries for every selected workload.
for task, wld in workloads.items():
    # start/end must be passed by keyword: the third positional parameter of the
    # plot_* helpers is `line`, so passing them positionally would bind `start`
    # to `line` and `end` to `start`.
    ax = wld.plot_50read50update_read_data(row, "pure_latency(ms)", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_50read50update_read_latency.png")

    ax = wld.plot_50read50update_write_data(row, "pure_latency(ms)", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_50read50update_write_latency.png")

    # The accessor returns the (reads, writes) pair; unpack it once.
    rdfs, wdfs = wld.get_50read50update_data()
    title = f"{wld.variant} {wld.task_name} 50read50update read stats"
    ax = plot_latency_stats(rdfs.diff_data, row, title=title, regr="line", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_50read50update_read_latency_stats.png")

    title = f"{wld.variant} {wld.task_name} 50read50update update stats"
    ax = plot_latency_stats(wdfs.diff_data, row, title=title, regr="line", start=start, end=end)
    try_save_plot(wld, ax, f"{task}_50read50update_write_latency_stats.png")

    pp.pprint(get_summary_statistics(rdfs.diff_data, rdfs.fixed_data, rdfs.raw_data))
    pp.pprint(get_summary_statistics(wdfs.diff_data, wdfs.fixed_data, wdfs.raw_data))

In [None]:
# 50read50update phase: read and update throughput over time (line plots).
for task, wld in workloads.items():
    ax = wld.plot_50read50update_read_data(
        x="ts", y="throughput", line=True, start=start, end=end,
        ylabel="throughput (ops/sec)",
    )
    try_save_plot(wld, ax, f"{task}_50read50update_read_throughput_vs_ts.png")

    ax = wld.plot_50read50update_write_data(
        x="ts", y="throughput", line=True, start=start, end=end,
        ylabel="throughput (ops/sec)",
    )
    try_save_plot(wld, ax, f"{task}_50read50update_write_throughput_vs_ts.png")