In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook; the driver memory
# setting is "16g" because results are collected into pandas on the driver.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

spark

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.sql.window import Window
from pyspark.sql.functions import col, lit

from common import *

# Notebook-wide matplotlib font sizes, applied one rc group at a time.
for rc_group, rc_sizes in (
    ('axes', {'labelsize': 13, 'titlesize': 13}),
    ('xtick', {'labelsize': 12}),
    ('ytick', {'labelsize': 12}),
    ('legend', {'fontsize': 13}),
):
    plt.rc(rc_group, **rc_sizes)

# Throughput

In [None]:
# Directory prefix of the throughput experiment results.
THRP_PREFIX = "main/cockroach"

# NOTE(review): `from_cache_or_compute` and `get_index` come from `common`;
# presumably ignore_cache=True forces a recompute instead of reading the
# cached parquet -- confirm against `common`.
IGNORE_CACHE = False

thrp_index_df = from_cache_or_compute(
    f'{THRP_PREFIX}/index.parquet',
    # The regex is a raw string: "\." inside a normal literal is an invalid
    # escape sequence and triggers a SyntaxWarning on recent Python versions.
    lambda: get_index(spark, THRP_PREFIX)\
        .withColumn("config_name", F.regexp_replace("config_name", r"\.conf", ""))\
        .toPandas()\
        .convert_dtypes()\
        .astype({
            "wl:hot": "int32",
            "wl:mh": "int32",
        }),
    ignore_cache=IGNORE_CACHE,
)
thrp_index_df

## SLOG throughput

In [None]:
# Passed as `ignore_cache` to the cached throughput computation in this cell.
IGNORE_CACHE = False


def compute_throughput(prefix):
    """Return the throughput measured for the experiment stored under `prefix`.

    Calls `throughput` (from `common`) with start_offset_sec=10 and
    duration_sec=50, takes the `throughput` field of the first result row,
    prints it next to the prefix as a progress trace, and returns it.
    """
    first_row = throughput(spark, prefix, start_offset_sec=10, duration_sec=50).first()
    res = first_row.throughput
    print(prefix, res)
    return res


def compute_all_throughputs(index_df):
    """Compute the throughput of every prefix listed in `index_df`.

    Args:
        index_df: pandas DataFrame with a "prefix" column plus experiment metadata.

    Returns:
        A DataFrame with "prefix" and "throughput" followed by the metadata
        columns of `index_df`, joined on "prefix".
    """
    def _row_throughput(row):
        return compute_throughput(row["prefix"])

    # Work on the prefix column only, then re-attach the metadata with a merge.
    prefixes = index_df.loc[:, ["prefix"]]
    prefixes["throughput"] = prefixes.apply(_row_throughput, axis=1)
    return pd.merge(prefixes, index_df, on="prefix")


# Throughput per prefix, stored as parquet under the experiment prefix.
throughput_df = from_cache_or_compute(
    f'{THRP_PREFIX}/throughput.parquet',
    lambda: compute_all_throughputs(thrp_index_df),
    ignore_cache=IGNORE_CACHE,
)

## CockroachDB throughput

In [None]:
# Schema of main/crdb.csv. StructType/StructField are taken from the `T`
# alias imported at the top of the notebook, like the field types, instead of
# relying on names leaked by `from common import *`.
crdb_schema = T.StructType([
    T.StructField("wl:mh", T.IntegerType(), False),
    T.StructField("wl:hot", T.IntegerType(), False),
    T.StructField("region", T.StringType(), False),
    T.StructField("type", T.StringType(), False),
    T.StructField("count", T.IntegerType(), False),
    T.StructField("throughput", T.DoubleType(), False),
    T.StructField("avg", T.DoubleType(), False),
    T.StructField("p50", T.DoubleType(), False),
    T.StructField("p95", T.DoubleType(), False),
    T.StructField("p99", T.DoubleType(), False),
    T.StructField("pMax", T.DoubleType(), False),
])

crdb_sdf = spark.read.csv("main/crdb.csv", header=True, schema=crdb_schema)

# Total throughput per (wl:mh, wl:hot): the "throughput" column is summed over
# every row of the group. Built as one chain rather than with inplace
# mutation, so re-running the cell always starts from the Spark result.
crdb_df = (
    crdb_sdf
    .groupBy("wl:mh", "wl:hot")
    .agg(F.sum("throughput").alias("crdb"))
    .toPandas()
    .set_index(["wl:mh", "wl:hot"])
    .sort_index(ascending=[True, False])
)

# Name the column axis like the pivoted throughput table it is joined with.
crdb_df.columns = crdb_df.columns.set_names("config_name")
crdb_df

## Plot

In [None]:
pivot_values = "throughput"
pivot_columns = "config_name"
pivot_index = ["wl:mh", "wl:hot"]

# One row per (wl:mh, wl:hot), one column per configuration, with the
# CockroachDB totals joined in as an extra column.
ddr_ts_df = (
    throughput_df
    .pivot(index=pivot_index, columns=pivot_columns, values=pivot_values)
    .sort_index(ascending=[True, False])
)
combined = ddr_ts_df.join(crdb_df)

_, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

# One bar chart per MH value; only the leftmost panel carries the y label.
for panel, mh_pct in zip(axes, (0, 50, 100)):
    bar_kwargs = dict(ax=panel, title=f"MH = {mh_pct}", xlabel="HOT", rot=45)
    if mh_pct == 0:
        bar_kwargs["ylabel"] = "txn/s"
    combined.loc[(mh_pct, slice(None)), :].plot.bar(**bar_kwargs)

for ax in axes:
    ax.set_xticklabels([1/1000000, 1/100000, 1/10000, 1/1000, 1/100, 1/10])

def disp(x):
    return f'{x:,.0f}'

HEIGHT = 500

# Hand-tuned label positions: (wl:hot, x position, offset above HEIGHT) for
# the three CockroachDB values printed in each panel.
label_spots = {
    0: [(1000, 3, 1800), (100, 4.1, 500), (10, 5.1, 0)],
    50: [(1000, 3, 0.08), (100, 4.1, 0), (10, 5.1, 0)],
    100: [(1000, 3, 0.08), (100, 4.1, 0), (10, 5.1, 0)],
}
for panel, (mh_pct, spots) in zip(axes, label_spots.items()):
    for hot, x_pos, lift in spots:
        panel.annotate(disp(crdb_df.loc[(mh_pct, hot), 'crdb']), (x_pos, HEIGHT + lift))

plt.tight_layout()

In [None]:
def normalize_per_mh(df, baseline_hot=1000000):
    """Divide every MH group by its baseline row, in place.

    Args:
        df: DataFrame indexed by a two-level MultiIndex whose first level is
            the MH value and whose second level is the HOT value. It is
            modified in place, so pass a copy to keep the original.
        baseline_hot: second-level index value of the row each group is
            divided by. Defaults to 1000000, the value previously hard-coded.

    Returns:
        The same `df` object, with every group's baseline row equal to 1.

    Raises:
        KeyError: if a group has no row for `baseline_hot`.
    """
    for mh_pct in df.index.get_level_values(0).unique():
        group_rows = (mh_pct, slice(None))
        # Read the baseline before the group is overwritten below.
        baseline = df.loc[(mh_pct, baseline_hot), :]
        df.loc[group_rows, :] = df.loc[group_rows, :] / baseline
    return df

# Normalise copies so the absolute-throughput tables stay untouched.
crdb_norm_df, slog_norm_df = (
    normalize_per_mh(table.copy()) for table in (crdb_df, ddr_ts_df)
)
combined_norm = slog_norm_df.join(crdb_norm_df)

In [None]:

fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)

xticks = np.array([1/1000000, 1/100000, 1/10000, 1/1000, 1/100, 1/10]).astype('str')
# Bar groups are spaced 1.5 apart so two 0.5-wide bars fit side by side.
x = 1.5 * np.arange(len(xticks))

# (column, horizontal shift, edge colour, hatch, legend label) per system.
series_styles = [
    ('ddr_ts', -0.25, 'r', '\\', 'Detock'),
    ('crdb', 0.25, 'b', 'x', 'CockroachDB'),
]

for i, (ax, mh) in enumerate(zip(axes, [0, 50, 100])):
    for column, shift, edge, hatch_style, legend_name in series_styles:
        ax.bar(
            x + shift,
            combined_norm.loc[(mh, slice(None)), column],
            width=0.5,
            fill=False,
            edgecolor=edge,
            hatch=hatch_style,
            # Only the first panel contributes entries to the figure legend.
            label=legend_name if i == 0 else '_nolegend_',
        )
    ax.set_title(f'MH = {mh}%')
    ax.set_xlabel('HOT')
    if i == 0:
        ax.set_ylabel('normalized throughput')
    ax.set_xticks(x)
    ax.set_xticklabels(xticks)

HEIGHT = 0.03

def disp(x):
    return f'{x:,.3f}'

# Hand-tuned vertical offsets of the CockroachDB value labels for
# HOT = 1000, 100, 10 (bar groups 3, 4, 5) in each panel.
lifts_by_mh = {
    0: (0.13, 0.03, 0),
    50: (0.01, 0.01, 0),
    100: (0.02, 0, 0),
}
for panel, (mh_pct, lifts) in zip(axes, lifts_by_mh.items()):
    for slot, hot, lift in zip((3, 4, 5), (1000, 100, 10), lifts):
        panel.annotate(
            disp(crdb_norm_df.loc[(mh_pct, hot), 'crdb']),
            (x[slot]+.04, HEIGHT + lift),
        )

fig.legend(bbox_to_anchor=(0, 1), loc='lower left', ncol=2)
fig.tight_layout()
fig.savefig('output/cockroach-rel-throughput.pdf', bbox_inches='tight')

In [None]:
def normalize_and_pivot(df, value_name):
    """Scale `value_name` by its global maximum and spread MH values into columns.

    Args:
        df: DataFrame whose index levels include "wl:mh" and "wl:hot" and which
            has a `value_name` column.
        value_name: column to normalise; also the prefix of the output columns.

    Returns:
        A DataFrame indexed by "hot" (1 / wl:hot) with one column per MH value;
        the columns for MH 0, 50 and 100 are renamed to "<value_name>_<mh>".
    """
    flat = df.reset_index()
    peak = flat[value_name].max()
    flat[value_name] = flat[value_name] / peak
    flat["hot"] = 1 / flat["wl:hot"]
    wide = flat.pivot(index="hot", columns="wl:mh", values=value_name)
    labels = {mh: f"{value_name}_{mh}" for mh in (0, 50, 100)}
    return wide.rename(columns=labels)

# Each system is normalised by its own maximum before the two are joined on "hot".
ddr_ts_bar_df = normalize_and_pivot(ddr_ts_df, "ddr_ts")
crdb_bar_df = normalize_and_pivot(crdb_df, "crdb")

combined_bar_df = ddr_ts_bar_df.join(other=crdb_bar_df)
combined_bar_df


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
common_args = dict(width=0.14, ax=ax, legend=False, edgecolor="k", rot=0)

# Colour encodes the system, hatch encodes the MH value.
ddr_ts_color = "steelblue"
cockroach_color = "tomato"
hatch_0 = "x"
hatch_50 = "-"
hatch_100 = "o"

# (column, bar position within the HOT group, face colour, hatch).
bar_layout = [
    ("ddr_ts_0", 3, ddr_ts_color, hatch_0),
    ("ddr_ts_50", 2, ddr_ts_color, hatch_50),
    ("ddr_ts_100", 1, ddr_ts_color, hatch_100),
    ("crdb_0", 0, cockroach_color, hatch_0),
    ("crdb_50", -1, cockroach_color, hatch_50),
    ("crdb_100", -2, cockroach_color, hatch_100),
]
for column, slot, face, bar_hatch in bar_layout:
    combined_bar_df[column].plot.bar(
        position=slot,
        color=face,
        hatch=bar_hatch,
        **common_args,
    )

ax.set_yscale("log")
ax.set_xlim(-0.6, 5.6)
ax.set_xlabel("HOT")
ax.set_ylabel('normalized throughput')

from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# Proxy artists: round markers for the two systems, hatched patches for MH.
legend_handles = [
    Line2D([0], [0], marker='o', color='w', label=system, markeredgecolor='k', markersize=12,
        markerfacecolor=face,
    )
    for system, face in (('Detock', ddr_ts_color), ('CockroachDB', cockroach_color))
] + [
    Patch(facecolor='w', edgecolor='k', hatch=mh_hatch*2, label=f'MH = {mh_value}')
    for mh_value, mh_hatch in ((0, hatch_0), (50, hatch_50), (100, hatch_100))
]

fig.legend(
    handles=legend_handles,
    bbox_to_anchor=(0, 1),
    loc='lower left',
    ncol=5,
    columnspacing=1.0,
    handlelength=1.0,
)

fig.tight_layout()
# NOTE(review): this is the same path the per-MH normalized figure is saved to
# earlier in the notebook, so on a full run this figure overwrites that file.
fig.savefig('output/cockroach-rel-throughput.pdf', bbox_inches='tight')

# Deadlock

In [None]:
# Window over all outcome types of one (wl:mh, wl:hot) workload.
per_workload = Window.partitionBy("wl:mh", "wl:hot")

# Transaction counts per workload and outcome type, with "too-old" folded into
# "other", plus each type's share of the workload total in percent.
crdb_cnt_df = (
    crdb_sdf
    .withColumn("type", F.regexp_replace("type", "too-old", "other"))
    .groupBy("wl:mh", "wl:hot", "type")
    .agg(F.sum("count").alias("count"))
    .withColumn("pct", 100*F.col("count") / F.sum("count").over(per_workload))
    .toPandas()
)

crdb_cnt_df

In [None]:
bars = ['write', 'deadlock', 'other']

# Counts per (wl:mh, wl:hot) with one column per outcome type, plus their total.
crdb_cnt_pivot_df = (
    crdb_cnt_df
    .pivot(index=["wl:mh", "wl:hot"], columns="type", values="count")
    .sort_index(ascending=[True, False])
)
crdb_cnt_pivot_df["sum"] = crdb_cnt_pivot_df["write"] + crdb_cnt_pivot_df['deadlock'] + crdb_cnt_pivot_df['other']

_, axes = plt.subplots(1, 3, figsize=(17, 5), sharey=True)

# One stacked bar chart per MH value; only the leftmost panel carries the y label.
for panel, mh_pct in zip(axes, (0, 50, 100)):
    bar_kwargs = dict(y=bars, ax=panel, title=f"MH = {mh_pct}", xlabel="HOT", rot=45, stacked=True)
    if mh_pct == 0:
        bar_kwargs["ylabel"] = "# txn"
    crdb_cnt_pivot_df.loc[(mh_pct, slice(None)), :].plot.bar(**bar_kwargs)

for ax in axes:
    ax.set_xticklabels([1/1000000, 1/100000, 1/10000, 1/1000, 1/100, 1/10])

def disp(x):
    return f'{x:,.0f}'

HEIGHT = 20000

# Hand-tuned label positions: (wl:hot, x position, offset above HEIGHT) for
# the totals printed over the three rightmost bars of each panel.
label_spots = {
    0: [(1000, 2.7, 180000), (100, 3.7, 60000), (10, 4.8, 0)],
    50: [(1000, 2.7, 15000), (100, 3.7, 10000), (10, 4.8, 0)],
    100: [(1000, 2.7, 15000), (100, 3.8, 10000), (10, 4.8, 0)],
}
for panel, (mh_pct, spots) in zip(axes, label_spots.items()):
    for hot, x_pos, lift in spots:
        panel.annotate(disp(crdb_cnt_pivot_df.loc[(mh_pct, hot), 'sum']), (x_pos, HEIGHT + lift))

plt.tight_layout()

In [None]:
# Percentage of each outcome type per (wl:mh, wl:hot).
crdb_pct_pivot_df = (
    crdb_cnt_df
    .pivot(index=["wl:mh", "wl:hot"], columns="type", values="pct")
    .sort_index(ascending=[True, False])
)

fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)

xticks = np.array([1/1000000, 1/100000, 1/10000, 1/1000, 1/100, 1/10]).astype('str')
x = 1.5 * np.arange(len(xticks))

for i, mh in enumerate([0, 50, 100]):
    panel = axes[i]
    mh_rows = crdb_pct_pivot_df.loc[(mh, slice(None)), :]
    write = mh_rows['write'].to_numpy()
    deadlock = mh_rows['deadlock'].to_numpy()
    other = mh_rows['other'].to_numpy()
    # Only the first panel contributes entries to the figure legend.
    in_legend = i == 0

    # Stack: committed at the bottom, then deadlock aborts, then other aborts.
    panel.bar(
        x,
        write,
        fill=False,
        edgecolor='darkgreen',
        hatch='\\',
        label='Committed' if in_legend else '_nolegend_',
    )
    panel.bar(
        x,
        deadlock,
        bottom=write,
        fill=False,
        edgecolor='r',
        hatch='x',
        label='Aborted (deadlock)' if in_legend else '_nolegend_',
    )
    panel.bar(
        x,
        other,
        bottom=write+deadlock,
        color='m',
        edgecolor='m',
        label='Aborted (other)' if in_legend else '_nolegend_',
    )
    panel.set_title(f'MH = {mh}%')
    panel.set_xlabel('HOT')
    if in_legend:
        panel.set_ylabel('% transactions')

for ax in axes:
    ax.set_ylim((0, 100))
    ax.set_xticks(x)
    ax.set_xticklabels(xticks)

fig.legend(bbox_to_anchor=(0, 1), loc='lower left', ncol=3)
fig.tight_layout()
fig.savefig('output/cockroach-pct-abort.pdf', bbox_inches='tight')

In [None]:
# Same percentages, reshaped to one row per HOT value and one column per
# (type, wl:mh) pair so that all MH values can share a single axis.
crdb_pct_pivot_2_df = (
    crdb_cnt_df
    .pivot(index="wl:hot", columns=["type", "wl:mh"], values="pct")
    .sort_index(ascending=False)
)
# Index by 1 / wl:hot, the quantity shown on the x axis.
crdb_pct_pivot_2_df.index = 1 / crdb_pct_pivot_2_df.index
crdb_pct_pivot_2_df

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6, 3))
commit_color = "lightgreen"
deadlock_color = "tomato"
other_color = "violet"
common_args = dict(
    width=0.3,
    ax=ax,
    legend=False,
    edgecolor="k",
    rot=0,
    stacked=True,
    color=[commit_color, deadlock_color, other_color],
)

# One stacked bar per MH value inside every HOT group.
# NOTE(review): hatch_0 / hatch_50 / hatch_100 are defined in the earlier
# throughput bar-chart cell, which must have been run first.
for mh_value, slot, mh_hatch in ((100, -0.5, hatch_100), (50, 0.5, hatch_50), (0, 1.5, hatch_0)):
    stack_columns = [(outcome, mh_value) for outcome in ("write", "deadlock", "other")]
    crdb_pct_pivot_2_df[stack_columns].plot.bar(
        position=slot,
        hatch=mh_hatch,
        **common_args,
    )

ax.set_xlim(-0.7, 5.7)
ax.set_ylim(0, 100)
ax.set_xlabel('HOT')
ax.set_ylabel('% transactions')


from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# Proxy artists: round markers for the outcome colours, hatched patches for MH.
legend_handles = [
    Line2D([0], [0], marker='o', color='w', label=outcome, markeredgecolor='k', markersize=12,
        markerfacecolor=face,
    )
    for outcome, face in (
        ('Committed', commit_color),
        ('Aborted (deadlock)', deadlock_color),
        ('Aborted (other)', other_color),
    )
] + [
    Patch(facecolor='w', edgecolor='k', hatch=mh_hatch*2, label=f'MH = {mh_value}')
    for mh_value, mh_hatch in ((0, hatch_0), (50, hatch_50), (100, hatch_100))
]

fig.legend(
    handles=legend_handles,
    bbox_to_anchor=(1.0, 0.5),
    loc='lower left',
    ncol=1,
    handlelength=1.0,
    fontsize=10,
)

fig.tight_layout()
# NOTE(review): this is the same path the per-MH percentage figure is saved to
# earlier in the notebook, so on a full run this figure overwrites that file.
fig.savefig('output/cockroach-pct-abort.pdf', bbox_inches='tight')

# Latency

In [None]:
# Directory prefix of the latency experiment results.
LAT_PREFIX = "main/cockroach-latency"

# NOTE(review): `from_cache_or_compute` and `get_index` come from `common`;
# presumably ignore_cache=True forces a recompute instead of reading the
# cached parquet -- confirm against `common`.
IGNORE_CACHE = False

lat_index_df = from_cache_or_compute(
    f'{LAT_PREFIX}/index.parquet',
    # The regex is a raw string: "\." inside a normal literal is an invalid
    # escape sequence and triggers a SyntaxWarning on recent Python versions.
    lambda: get_index(spark, LAT_PREFIX)\
        .withColumn("config_name", F.regexp_replace("config_name", r"\.conf", ""))\
        .toPandas()\
        .convert_dtypes()\
        .astype({
            "wl:hot": "int32",
            "wl:mh": "int32",
        }),
    ignore_cache=IGNORE_CACHE,
)
lat_index_df

In [None]:
# Passed as `ignore_cache` to the cached latency computation in this cell.
IGNORE_CACHE = False

# Approximate p50 / p95 / p99 of the "latency" column.
percentile_cols = [
    F.percentile_approx("latency", quantile).alias(alias)
    for alias, quantile in (("p50", 0.5), ("p95", 0.95), ("p99", 0.99))
]

# Latency percentiles per prefix, joined with the index metadata on "prefix".
slog_latency_df = from_cache_or_compute(
    f'{LAT_PREFIX}/latency.parquet',
    lambda: (
        latency(spark, lat_index_df["prefix"])
        .groupBy("prefix")
        .agg(*percentile_cols)
        .toPandas()
        .merge(lat_index_df, on="prefix")
    ),
    ignore_cache=IGNORE_CACHE,
)
slog_latency_df

In [None]:
# Schema of main/crdb-latency.csv; this rebinds `crdb_schema` from the
# throughput section to a layout without the "type" and "count" columns.
# StructType/StructField are taken from the `T` alias imported at the top of
# the notebook instead of relying on names leaked by `from common import *`.
crdb_schema = T.StructType([
    T.StructField("wl:mh", T.IntegerType(), False),
    T.StructField("wl:hot", T.IntegerType(), False),
    T.StructField("region", T.StringType(), False),
    T.StructField("throughput", T.DoubleType(), False),
    T.StructField("avg", T.DoubleType(), False),
    T.StructField("p50", T.DoubleType(), False),
    T.StructField("p95", T.DoubleType(), False),
    T.StructField("p99", T.DoubleType(), False),
    T.StructField("pMax", T.DoubleType(), False),
])

# Each percentile is averaged over all rows of a (wl:mh, wl:hot) group
# (presumably one row per region -- confirm against the CSV). The result is
# tagged with config_name="crdb" so it can be concatenated with the other
# latency table.
crdb_latency_df = (
    spark.read.csv("main/crdb-latency.csv", header=True, schema=crdb_schema)
    .groupBy("wl:mh", "wl:hot")
    .agg(
        F.avg("p50").alias("p50"),
        F.avg("p95").alias("p95"),
        F.avg("p99").alias("p99"),
    )
    .toPandas()
    .assign(config_name="crdb")
)
crdb_latency_df

In [None]:
def plot_latency(df, yscale='log'):
    """Plot p50/p95/p99 latency against HOT, one panel per "wl:mh" value.

    Args:
        df: pandas DataFrame with the columns "config_name", "wl:mh",
            "wl:hot", "p50", "p95" and "p99".
        yscale: matplotlib scale name applied to the y axis of every panel.

    Each configuration gets one colour; the three percentiles are told apart
    by line style (dotted, dashed, solid). Only the first panel keeps its legend.
    """
    configs = df["config_name"].unique()
    mh_pcts = sorted(df["wl:mh"].unique())

    # One panel per MH value actually present (at least one, so an empty frame
    # still yields a figure). squeeze=False keeps `axes` indexable even when
    # there is a single panel.
    _, axes = plt.subplots(
        1, max(len(mh_pcts), 1), figsize=(17, 6), sharey=True, squeeze=False
    )
    axes = axes[0]

    # (column, legend suffix, line style) for the three percentiles.
    percentile_styles = [
        ("p50", "50", "dotted"),
        ("p95", "95", "dashed"),
        ("p99", "99", "solid"),
    ]

    for r, mh_pct in enumerate(mh_pcts):
        panel = axes[r]
        for i, config in enumerate(configs):
            # Cycle through matplotlib's C0..C9 colours so any number of
            # configurations can be drawn.
            color = f'C{i % 10}'
            mask = (df["config_name"] == config) & (df["wl:mh"] == mh_pct)
            filtered = df[mask].sort_values("wl:hot", ascending=False)
            if filtered.empty:
                # Nothing measured for this (config, MH) pair; pandas cannot
                # plot an empty frame.
                continue
            label = config.replace('.conf', '')

            for column, suffix, linestyle in percentile_styles:
                filtered.plot(
                    ax=panel,
                    x="wl:hot",
                    y=column,
                    label=f'{label}_{suffix}',
                    linestyle=linestyle,
                    marker='.',
                    color=color,
                )

        panel.set_title(f"MH_PCT = {mh_pct}")
        panel.set_ylabel("latency (ms)")
        panel.set_xlabel("HOT")
        panel.grid(axis='y')
        panel.set_yscale(yscale)
        panel.set_xscale('log')
        if r != 0:
            panel.legend([])

    plt.tight_layout()

In [None]:
# Stack both latency tables and express HOT as 1 / wl:hot for plotting.
# `assign` replaces the column outright; writing the float result through
# `.loc[:, "wl:hot"]` would set values into the integer column in place,
# which recent pandas versions deprecate.
latency_df = (
    pd.concat([slog_latency_df, crdb_latency_df])
    .assign(**{"wl:hot": lambda frame: 1 / frame["wl:hot"]})
)
plot_latency(latency_df)