In [None]:
from pyspark.sql import SparkSession

# Session shared by every cell below; `getOrCreate()` is called with the
# driver memory option set to 16g.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

# Bare trailing expression so the notebook renders the session object.
spark

In [None]:
%load_ext autoreload
%autoreload 2

import os

import matplotlib.pyplot as plt
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.sql.functions import col, lit

from common import *

In [None]:
# Location of the experiment; used both to read the index (get_index) and as
# the directory part of every cache file name in this notebook.
PREFIX = "main/ycsb-asym"

# Forwarded as `ignore_cache` to from_cache_or_compute.
# NOTE(review): presumably True forces a recompute instead of reading the
# cached parquet -- confirm against the helper's definition.
IGNORE_CACHE = False

# Index of all runs under PREFIX, as a pandas DataFrame.
# The lambda is only a recipe: from_cache_or_compute decides whether to call it.
index_df = from_cache_or_compute(
    f'{PREFIX}/index.parquet',
    lambda: (
        get_index(spark, PREFIX)
        # Digits that follow "asym_ratio" in the prefix (empty string if absent).
        # NOTE(review): the value stays textual, so later sort_values("asym_ratio")
        # calls order it as text, not numerically -- fine for equal-width values.
        .withColumn("asym_ratio", F.regexp_extract("prefix", r"asym_ratio(\d+)", 1))
        # Raw string: "\." is an invalid escape in a normal string literal and
        # triggers a SyntaxWarning on recent Python versions. The regex itself is
        # unchanged (it removes every ".conf" occurrence from config_name).
        .withColumn("config_name", F.regexp_replace("config_name", r"\.conf", ""))
        .toPandas()
        .convert_dtypes()
        # Pin the workload columns to a plain 32-bit integer dtype.
        .astype({
            "wl:hot": "int32",
            "wl:mh": "int32",
            "wl:mp": "int32",
        })
    ),
    ignore_cache=IGNORE_CACHE,
)

index_df

# Throughput

In [None]:
# Forwarded as `ignore_cache` to the from_cache_or_compute call for the
# throughput table in this cell.
# NOTE(review): presumably True forces a recompute -- confirm against the helper.
IGNORE_CACHE = False

def compute_throughput(prefix, start_offset_sec=10, duration_sec=50):
    """Return the throughput measured for a single experiment prefix.

    Calls the `throughput` helper with the notebook's Spark session and
    reads the `throughput` field of the first row of its result. The value
    is also printed next to the prefix as a progress trace.

    Args:
        prefix: Experiment prefix forwarded to `throughput`.
        start_offset_sec: Forwarded as `start_offset_sec` (default 10).
        duration_sec: Forwarded as `duration_sec` (default 50).
            NOTE(review): presumably these delimit the measurement window
            (skip the first N seconds, then measure for M seconds) --
            confirm against the helper's definition.

    Returns:
        The `throughput` attribute of the first result row.
    """
    first_row = throughput(
        spark,
        prefix,
        start_offset_sec=start_offset_sec,
        duration_sec=duration_sec,
    ).first()
    res = first_row.throughput
    print(prefix, res)
    return res


def compute_all_throughputs(index_df):
    # Extract all prefixes in the index
    throughput_df = index_df.loc[:, ["prefix"]]
    # Compute the throughput of each prefix
    throughput_df["throughput"] = throughput_df.apply(lambda r : compute_throughput(r["prefix"]), axis=1)
    # Associate metadata from the index to the throughputs
    return throughput_df.merge(index_df, on="prefix")


# Throughput table for the runs whose "clients" value is 3000, ordered by
# asym_ratio. The lambda keeps the computation lazy: nothing is evaluated
# here unless from_cache_or_compute decides to call it.
throughput_df = from_cache_or_compute(
    f"{PREFIX}/throughput.parquet",
    lambda: (
        compute_all_throughputs(index_df[index_df["clients"] == 3000])
        .sort_values("asym_ratio")
    ),
    ignore_cache=IGNORE_CACHE,
)

# Latency

In [None]:
# Forwarded as `ignore_cache` to the from_cache_or_compute call below.
IGNORE_CACHE = False

# One approximate-percentile aggregate over the "latency" column per
# (alias, fraction) pair, in this order.
percentile_cols = [
    F.percentile_approx("latency", fraction).alias(alias)
    for alias, fraction in (
        ("p50", 0.5),
        ("p90", 0.90),
        ("p95", 0.95),
        ("p99", 0.99),
    )
]

# Latency percentiles per prefix for the runs whose "clients" value is 200,
# joined with the index metadata and ordered by asym_ratio.
latency_df = from_cache_or_compute(
    f"{PREFIX}/latency.parquet",
    lambda: (
        latency(spark, index_df.loc[index_df["clients"] == 200, "prefix"])
        .groupBy("prefix")
        .agg(*percentile_cols)
        .toPandas()
        .merge(index_df, on="prefix")
        .sort_values("asym_ratio")
    ),
    ignore_cache=IGNORE_CACHE,
)

latency_df

# Plot

In [None]:
# Throughput (hatched bars, left axis) and latency percentiles (red lines,
# right axis) against the asymmetry ratio, saved as a PDF.
# The figure is kept in `fig` instead of `_`, which IPython uses for the
# last cell output.
fig, ax = plt.subplots(1, 1, figsize=(7, 3.5))

throughput_df.plot.bar(ax=ax, x="asym_ratio", y="throughput", rot=0, fill=False, hatch='/')
ax.set_ylabel("throughput (txn/s)")
ax.set_xlabel("asymmetry ratio")
ax.legend(loc="lower left")

# Second y-axis on the same x-axis for the latency curves.
ax2 = ax.twinx()

latency_df.plot(ax=ax2, x="asym_ratio", y="p50", label='p50 latency', linestyle='dotted', marker='.', color="red")
latency_df.plot(ax=ax2, x="asym_ratio", y="p90", label='p90 latency', linestyle='dashed', marker='.', color="red")
latency_df.plot(ax=ax2, x="asym_ratio", y="p99", label='p99 latency', marker='.', color="red")
ax2.set_ylabel("latency (ms)")
ax2.set_ylim((0, 350))
# NOTE(review): the labels are hard-coded; this assumes exactly five
# asym_ratio values, plotted in this order -- confirm against the data.
ax2.set_xticklabels(["50:50", "60:40", "70:30", "80:20", "90:10"])

ax2.legend(loc="lower right", ncol=1)

fig.tight_layout()
# savefig does not create missing directories, so make sure the target
# directory exists (a fresh checkout may not have it).
os.makedirs("output", exist_ok=True)
fig.savefig("output/asymmetry.pdf")
