In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

%matplotlib inline

ModuleNotFoundError: No module named 'pandas'

In [None]:
import matplotlib.cm as cm

# Kept so that any later code referring to `cmap` keeps working.
cmap = cm.YlGnBu

# Build the shared palette straight from the colormap name. The previous
# approach registered YlGnBu under the alias "cmap" via
# matplotlib.cm.register_cmap, which is deprecated/removed in recent
# matplotlib releases and is not needed: seaborn resolves matplotlib
# colormap names itself.
color_palette = sns.color_palette("YlGnBu", desat=0.9)

# Skip the first (lightest) entry for the default colour cycle.
sns.set_palette(color_palette[1:])

# Apply the seaborn style first: the style carries its own font.family entry,
# so setting the serif font before this call would be overwritten.
sns.set_style("white")
plt.rcParams["font.family"] = "serif"

# Line/marker sizes shared by the "paper" context used throughout the notebook.
paper_rc = {'lines.linewidth': 3, 'lines.markersize': 12.5}
sns.set_context("paper", rc=paper_rc, font_scale=2.6)

## Total Data

In [None]:
total_data = pd.read_csv("../cleaned_data/total_data.csv")

# Drop the "iShare (w/o unshare)" runs and the doubled ten-query workload.
keep_rows = (
    (total_data["execution_type"] != "iShare (w/o unshare)") &
    (total_data["query_set"] != "Ten TPC-H Queries Twice with Different Parameters")
)
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the raw frame (SettingWithCopyWarning).
total_data = total_data[keep_rows].copy()

# Shorten the remaining iShare label.
total_data["execution_type"] = total_data["execution_type"].replace(
    {"iShare (w/ unshare)": "iShare"}
)

total_data.head()

In [None]:
# Load the batch-execution measurements, discarding incomplete rows.
batch_data = pd.read_csv("../cleaned_data/batch_execution.csv").dropna()
batch_data.columns = ["execution_type", "Share (Batch)", "NoShare (Batch)"]

# Divide every measurement column (all but the first) by 1000.
measurement_columns = list(batch_data.columns[1:])
batch_data[measurement_columns] = batch_data[measurement_columns] / 1000

# Translate the short workload names into the labels used by total_data;
# names that are not listed become NaN.
workload_names = {
    "10 TPC-H": "Ten TPC-H Queries",
    "22 TPC-H": "All TPC-H Queries",
}
batch_data["execution_type"] = batch_data["execution_type"].map(workload_names)
batch_data

In [None]:
# Figure-wide styling: error-bar cap size and the shared "paper" context.
matplotlib.rcParams.update({'errorbar.capsize': 10})
sns.set_context("paper", rc=paper_rc, font_scale=2.6)

# Keep only the non-uniform runs over the full TPC-H query set.
is_nonuniform = total_data["constraint_type"] == "nonUniform"
is_all_tpch = total_data["query_set"] == "All TPC-H Queries"
df = total_data.loc[is_nonuniform & is_all_tpch, ["execution_type", "time", "query_set"]]

# One row per (execution_type, query_set) with the mean, max and min time.
df = (
    df.groupby(["execution_type", "query_set"])["time"]
    .agg(["mean", "max", "min"])
    .reset_index()
)

# NOTE(review): "err" is the full max-min range and is later passed as a
# symmetric yerr, so the whisker spans mean +/- (max - min) -- confirm that
# this is the intended error-bar definition.
df["err"] = df["max"] - df["min"]
df = df.drop(columns=["max", "min"])

# batch_df = batch_data.melt(id_vars=["execution_type"], value_vars=["Share (Batch)", "NoShare (Batch)"])
# batch_df = batch_df[batch_df["execution_type"] == "All TPC-H Queries"]
# batch_df["err"] = 0
# batch_df.columns = ["query_set", "execution_type", "mean", "err"]

# df = pd.concat([df, batch_df])

def grouped_barplot(df, cat, subcat, val, err, subx=None, ax=None, colors=None):
    """Draw a grouped bar chart with error bars.

    Parameters
    ----------
    df : DataFrame with one row per (category, sub-category) pair.
    cat : name of the column whose unique values become the x-axis groups.
    subcat : name of the column that selects the bar inside each group.
    val : name of the column holding the bar heights.
    err : name of the column holding the error-bar sizes.
    subx : optional ordered list of sub-category values to draw. Defaults to
        the four execution types of the non-uniform experiment.
    ax : optional axes to draw on. Defaults to the current pyplot axes.
    colors : optional sequence with one colour per entry of ``subx``. When
        omitted, the colour cycle of the axes is used.

    Bars are aligned to their category by label, so a (category, sub-category)
    pair that is absent from ``df`` leaves a gap instead of shifting the
    remaining bars. Each pair must appear at most once in ``df``.
    """
    if subx is None:
        subx = ["NoShare-Uniform", "NoShare-Nonuniform", "Share-Uniform", "iShare"]
    if ax is None:
        ax = plt.gca()

    u = df[cat].unique()
    x = np.arange(len(u))

    n_sub = len(subx)
    if n_sub:
        positions = np.arange(n_sub)
        # Centre the bars of one group around the group's tick.
        offsets = (positions - positions.mean()) / (n_sub + 1.)
        # Distance between neighbouring bars is 1 / (n_sub + 1); computing the
        # width from that directly also works for a single sub-category.
        width = .75 / (n_sub + 1.)

    for i, gr in enumerate(subx):
        # Align by category label rather than by row order.
        dfg = df[df[subcat] == gr].set_index(cat).reindex(u)
        extra = {} if colors is None else {"color": colors[i]}
        ax.bar(x + offsets[i], dfg[val].values, width=width,
               label="{}".format(gr), yerr=dfg[err].values, ecolor="red", **extra)

    ax.set_xticks(x)
    ax.set_xticklabels(u)

figsize = [9, 7]
fig, ax = plt.subplots(1, figsize=figsize)

grouped_barplot(df, "query_set", "execution_type", "mean", "err")

ax.set_ylabel("CPU Time (s)")
ax.set_yticks(np.arange(0, 2500+1, 500))

# The single group needs no tick label or axis title.
ax.set_xticklabels([])
ax.set_xlabel("")

# Two-column legend placed above the axes. The former mode="extend" argument
# was not a valid legend mode ("expand" is the only recognised value) and had
# no effect, so it is dropped to keep the rendered layout unchanged.
leg = ax.legend(
    bbox_to_anchor=(-0.15, 1.),
    ncol=2,
    loc="lower left",
)
leg.get_frame().set_linewidth(0.0)

plt.tight_layout()
plt.savefig("../img/nonuniform.pdf", bbox_inches="tight")

In [None]:
# Re-read the batch-execution measurements (same steps as the earlier
# batch_data cell), so this section can be run on its own.
batch_csv = "../cleaned_data/batch_execution.csv"
batch_data = pd.read_csv(batch_csv).dropna()
batch_data.columns = ["execution_type", "Share (Batch)", "NoShare (Batch)"]

# Divide each measurement column (everything after the first column) by 1000.
for measurement in batch_data.columns[1:]:
    batch_data[measurement] = batch_data[measurement] / 1000

# Expand the short workload names; unlisted names become NaN.
batch_data["execution_type"] = batch_data["execution_type"].map({
    "10 TPC-H": "Ten TPC-H Queries",
    "22 TPC-H": "All TPC-H Queries",
})
batch_data

In [None]:
sns.set_context("paper", rc=paper_rc, font_scale=3.5)

# Long format: one row per (workload, batch variant).
batch_long = batch_data.melt(
    id_vars=["execution_type"],
    value_vars=["Share (Batch)", "NoShare (Batch)"],
)

# Keep the full TPC-H workload and attach a zero error column. assign() builds
# a new frame, so nothing is written into a filtered slice
# (avoids SettingWithCopyWarning).
batch_df = batch_long[batch_long["execution_type"] == "All TPC-H Queries"].assign(err=0)

# Rename to the column layout that grouped_barplot is called with below.
batch_df.columns = ["query_set", "execution_type", "mean", "err"]

def grouped_barplot(df, cat, subcat, val, err, subx=None, ax=None, colors=None):
    """Draw a grouped bar chart with error bars (batch-execution variant).

    Parameters
    ----------
    df : DataFrame with one row per (category, sub-category) pair.
    cat : name of the column whose unique values become the x-axis groups.
    subcat : name of the column that selects the bar inside each group.
    val : name of the column holding the bar heights.
    err : name of the column holding the error-bar sizes.
    subx : optional ordered list of sub-category values to draw. Defaults to
        the two batch variants.
    ax : optional axes to draw on. Defaults to the current pyplot axes.
    colors : optional sequence with one colour per entry of ``subx``. Defaults
        to ``color_palette[i + 4]`` for the i-th sub-category.

    Bars are aligned to their category by label, so a (category, sub-category)
    pair that is absent from ``df`` leaves a gap instead of shifting the
    remaining bars. Each pair must appear at most once in ``df``.
    """
    if subx is None:
        subx = ["Share (Batch)", "NoShare (Batch)"]
    if ax is None:
        ax = plt.gca()

    u = df[cat].unique()
    x = np.arange(len(u))

    n_sub = len(subx)
    if n_sub:
        positions = np.arange(n_sub)
        # Centre the bars of one group around the group's tick.
        offsets = (positions - positions.mean()) / (n_sub + 1.)
        # Distance between neighbouring bars is 1 / (n_sub + 1); computing the
        # width from that directly also works for a single sub-category.
        width = .75 / (n_sub + 1.)

    for i, gr in enumerate(subx):
        # Align by category label rather than by row order.
        dfg = df[df[subcat] == gr].set_index(cat).reindex(u)
        color = colors[i] if colors is not None else color_palette[i + 4]
        ax.bar(x + offsets[i], dfg[val].values, width=width, color=color,
               label="{}".format(gr), yerr=dfg[err].values, ecolor="red")

    ax.set_xticks(x)
    ax.set_xticklabels(u)
    
figsize = [10, 10]
fig, ax = plt.subplots(1, figsize=figsize)

grouped_barplot(batch_df, "query_set", "execution_type", "mean", "err")

ax.set_ylabel("CPU Time (s)")
ax.set_yticks(np.arange(0, 751, 250))

# Two-column legend placed above the axes. The former mode="extend" argument
# was not a valid legend mode ("expand" is the only recognised value) and had
# no effect, so it is dropped to keep the rendered layout unchanged.
leg = ax.legend(
    bbox_to_anchor=(-0.23, 1.),
    ncol=2,
    loc="lower left",
)
leg.get_frame().set_linewidth(0.0)

# The single group needs no tick label or axis title.
ax.set_xticklabels([])
ax.set_xlabel("")

plt.tight_layout()
plt.savefig("../img/nonuniform_batch.pdf")

In [None]:
paper_rc = {'lines.linewidth': 3, 'lines.markersize': 20}
sns.set_context("paper", rc=paper_rc, font_scale=3)

# Output file stem for each query set.
query_set_filename = {
    "Ten TPC-H Queries": "ten_longest",
    "All TPC-H Queries": "all_tpch",
    "Ten TPC-H Queries Twice with Different Parameters": "ten_longest_twice",
}
execution_types = list(total_data["execution_type"].unique())

# Final work constraint ("goal") -> x position; larger goals sit further left.
label_to_axis = {
    0.1: 3,
    0.2: 2,
    0.5: 1,
    1.0: 0,
    2.0: -1,
    4.0: -2,
}

# Legend placement for the first and the second figure, in plotting order.
bbox_anchors = [
    [-0.2, 1.02, 1.22, 0.2],
    [-0.1, 1.02, 1, 0.2]
]

# Get only relevant data
df = total_data[total_data["constraint_type"] == "uniform"][["execution_type", "time", "query_set", "goal"]]

for i, query_set in enumerate(reversed(df["query_set"].unique())):
    # Work on an independent copy so adding "x" does not write into a slice
    # of df (SettingWithCopyWarning).
    query_set_df = df[df["query_set"] == query_set].copy()
    query_set_df["x"] = query_set_df["goal"].map(label_to_axis)

    figsize = [10, 7]
    fig, ax = plt.subplots(1, figsize=figsize)

    ax = sns.lineplot(x="x", y="time", style="execution_type", ci=None, markers=True, dashes=False, color="black", alpha=0.6, data=query_set_df, ax=ax)

    # Hand-picked y ticks per query set. For an unrecognised query set the
    # automatic ticks are kept; previously `yticks` was undefined (or left
    # over from the previous iteration) in that case.
    yticks = None
    if "All" in query_set:
        yticks = np.arange(0, 2001, 400)
        ax.set_ylim(0, 2150)
    elif "Ten" in query_set:
        yticks = np.arange(0, 1501, 300)

    if yticks is not None:
        ax.set_yticks(yticks)
    ax.set_xticks(np.arange(-2, 4))
    # Ticks run from -2 to 3, so the goal labels are listed in reverse.
    ax.set_xticklabels(reversed(list(label_to_axis.keys())))

    ax.set_ylabel("CPU Time (s)")
    ax.set_xlabel("Final Work Constraint")

    # Each figure shows a different pair of legend entries: entries 1-2 on the
    # first figure and 3-4 on the second (entry 0 is skipped).
    handles, labels = ax.get_legend_handles_labels()
    first, last = 2*i+1, 2*(i+1)+1
    leg = ax.legend(
        bbox_to_anchor=bbox_anchors[i],
        loc="lower left",
        mode="expand",
        ncol=2,
        handles=handles[first:last],
        labels=labels[first:last],
    )
    leg.get_frame().set_facecolor("none")
    leg.get_frame().set_linewidth(0.0)

    fig.tight_layout()
    fig.savefig("../img/uniform_{}.pdf".format(query_set_filename[query_set]), bbox_inches="tight")

    plt.show()
    plt.close(fig)

# Manual data

In [None]:
# Load the manual-execution measurements and display the table.
manual_data = pd.read_csv(filepath_or_buffer="../cleaned_data/manual_data.csv")
manual_data

In [None]:
paper_rc = {'lines.linewidth': 3, 'lines.markersize': 12.5}
sns.set_context("paper", rc=paper_rc, font_scale=2.4)

fig = plt.figure(figsize=(8, 5))

# One bar per execution type.
ax = sns.barplot(data=manual_data, x="execution_type", y="cpu_time")

# Hide x ticks and their labels; the legend identifies the bars instead.
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

ax.set_yticks(np.arange(0, 2501, 500))

ax.set_xlabel("")
ax.set_ylabel("CPU Time (s)")

# Hand-built legend: the i-th label is paired with palette entry i + 1.
labels = manual_data["execution_type"]
legend_handles = []
for i, label in enumerate(labels):
    legend_handles.append(mpatches.Patch(color=color_palette[i+1], label=label))

leg = ax.legend(
    handles=legend_handles,
    bbox_to_anchor=(-0.2, 1.02, 1.2, 1),
    loc="lower left",
    mode="expand",
    ncol=2,
)
legend_frame = leg.get_frame()
legend_frame.set_facecolor("none")
legend_frame.set_linewidth(0.0)

plt.tight_layout()
plt.savefig("../img/manual.pdf")

# Unsharing

In [None]:
# NOTE: this rebinds `total_data`; from here on it holds only the uniform runs
# of the doubled ten-query workload.
total_data = pd.read_csv("../cleaned_data/total_data.csv")

uniform_runs = total_data["constraint_type"] == "uniform"
doubled_workload = total_data["query_set"] == "Ten TPC-H Queries Twice with Different Parameters"
total_data = total_data[uniform_runs & doubled_workload]

total_data.head()

In [None]:
paper_rc = {'lines.linewidth': 3, 'lines.markersize': 20}
sns.set_context("paper", rc=paper_rc, font_scale=2.4)

# Upper bound of the plotted range. Defined before its first use: it used to
# be assigned only after the capping step below, which raises NameError on a
# fresh kernel.
ymax_val = 2500

size = [8, 6]
fig, ax = plt.subplots(1, figsize=size)

# Final work constraint ("goal") -> x position; larger goals sit further left.
label_to_axis = {
        0.1: 3,
        0.2: 2,
        0.5: 1,
        1.0: 0,
        2.0: -1,
        4.0: -2,
    }

# assign() returns a new frame, so the derived columns are not written into a
# filtered slice (avoids SettingWithCopyWarning) and re-running is idempotent.
total_data = total_data.assign(**{
    "x": total_data["goal"].map(label_to_axis),
    "capped time": total_data["time"].clip(upper=ymax_val),
})

ax = sns.lineplot(x="x", y="capped time", style="execution_type", ci=None, markers=True, dashes=False, color="black", alpha=0.4, data=total_data, ax=ax)

# Annotate every goal that has capped measurements with the mean of the real
# (uncapped) times. Only the numeric columns that are needed are averaged, so
# string columns cannot break the aggregation.
capped_rows = total_data[total_data["time"] != total_data["capped time"]]
capped_means = capped_rows.groupby("goal")[["x", "time", "capped time"]].mean()
for goal, row in capped_means.iterrows():
    ax.text(x=row["x"]-0.45, y=row["capped time"]-400, s=str(int(row["time"])))

ax.set_ylim(0, ymax_val+150)

ax.set_yticks(np.arange(0, ymax_val+1, 500))
ax.set_xticks(list(label_to_axis.values()))
ax.set_xticklabels(list(label_to_axis.keys()))

ax.set_ylabel("CPU Time (s)")
ax.set_xlabel("Final Work Constraint")

# Skip the first legend entry and spread the rest over two columns.
handles, labels = ax.get_legend_handles_labels()
leg = ax.legend(
    bbox_to_anchor=(-0.2, 1.05, 1.3, 1.02),
    ncol=2,
    mode="expand",
    loc="lower left",
    handles=handles[1:],
    labels=labels[1:],
)
leg.get_frame().set_facecolor("none")
leg.get_frame().set_linewidth(0.0)

plt.tight_layout()
plt.savefig("../img/unshare.pdf")

# Overhead

In [None]:
# Load the optimization-time measurements and preview the first rows.
opt_data = pd.read_csv(filepath_or_buffer="../cleaned_data/optimization.csv")
opt_data.head(n=5)

In [None]:
paper_rc = {'lines.linewidth': 3, 'lines.markersize': 12.5}
sns.set_context("paper", rc=paper_rc, font_scale=2.4)

size = [7.5, 6]
fig = plt.figure(figsize=size)

# Times above 90 are drawn at 90 and annotated with their real value below.
opt_data["capped time"] = opt_data["time"].clip(upper=90)

ax = sns.lineplot(
    data=opt_data,
    x="max_pace",
    y="capped time",
    style="execution_type",
    ci=None,
    markers=True,
    dashes=False,
    color="black",
)

# Label each capped point. A time of exactly 30*60 is shown as "DNF".
was_capped = opt_data["time"] != opt_data["capped time"]
for i, row in opt_data[was_capped].iterrows():
    text = "DNF" if row["time"] == 30*60 else int(row["time"])
    # Odd row labels place the text above the point, even ones below it.
    top = 5 if i%2 == 1 else -15
    ax.text(x=row["max_pace"]-5, y=row["capped time"]+top, s=text)

# Skip the first legend entry and spread the rest over two columns.
handles, labels = ax.get_legend_handles_labels()
leg = ax.legend(
    handles=handles[1:],
    labels=labels[1:],
    bbox_to_anchor=(-0.25, 1.05, 1.4, 1.02),
    loc="lower left",
    mode="expand",
    ncol=2,
)
legend_frame = leg.get_frame()
legend_frame.set_facecolor("none")
legend_frame.set_linewidth(0.0)

ax.set_ylabel("Optimization time (s)")
ax.set_xlabel("Max Pace")

ax.set_ylim(-5, 105)
ax.set_yticks(np.arange(0, 100+1, 25))

ax.set_xlim(5, 108)

plt.tight_layout()
plt.savefig("../img/overhead.pdf", bbox_inches="tight")
plt.show()
plt.close()

# Overhead Breakdown

In [None]:
paper_rc = {'lines.linewidth': 3, 'lines.markersize': 12.5}
sns.set_context("paper", rc=paper_rc, font_scale=2.4)

size = [7.5, 8]
fig, ax = plt.subplots(1, figsize=size)

# Load the per-component overhead table, drop the "Total" column and index
# the rows by query set.
overhead = (
    pd.read_csv("../cleaned_data/optimization_overhead.csv", index_col=0)
    .drop(columns="Total")
    .set_index("Query Set")
)

print(overhead)

# Stacked bars, one stack per query set, using palette entries 1, 2 and 5.
stack_colors = [color_palette[1], color_palette[2], color_palette[5]]
overhead.plot(kind="bar", stacked=True, color=stack_colors, ax=ax)

ax.tick_params(axis='x', rotation=0)

ax.set_yticks(np.arange(0, 50+1, 10))
ax.set_xticks([0, 0.9, 2])

ax.set_ylabel("CPU Time (s)")
ax.set_xlabel("")

handles, labels = ax.get_legend_handles_labels()
leg = ax.legend(
    loc="lower left",
    bbox_to_anchor=(0, 1),
    mode="expand",
    ncol=1,
)
legend_frame = leg.get_frame()
legend_frame.set_facecolor("none")
legend_frame.set_linewidth(0.0)

plt.tight_layout()
plt.savefig("../img/optimization_overhead.pdf")

# Micro data

In [None]:
micro_data = pd.read_csv("../cleaned_data/micro_data.csv")

# Classify each run before the label is shortened below.
micro_data["share"] = micro_data["execution_type"].map(lambda x: "NoShare" if "NoShare" in x else "Share")

# Strip the parenthesised suffix from the label. regex=True is passed
# explicitly: without it, recent pandas versions treat the pattern as a
# literal string and nothing would be removed.
micro_data["execution_type"] = (
    micro_data["execution_type"]
    .str.replace(r"\(.*\)", "", regex=True)
    .str.strip()
)
micro_data.tail()

In [None]:
paper_rc = {'lines.linewidth': 3, 'lines.markersize': 12.5}
sns.set_context("paper", rc=paper_rc, font_scale=2.4)

# Final work constraint ("goal") -> x position.
goal_to_x = {
    0.1: 3,
    0.2: 2,
    0.5: 1,
    1.0: 0,
}

# Per-pairing y axis: (upper tick, tick step).
pairing_y_axis = {
    "pairA": (100, 20),
    "pairB": (400, 100),
    "pairC": (125, 25),
}

for pairing in micro_data["pairing"].unique():
    fig, ax = plt.subplots(1)
    # Second figure that only carries the shared legend.
    fig_legend, axi = plt.subplots(1)

    # assign() returns a new frame, so "x" is not written into a slice of
    # micro_data (avoids SettingWithCopyWarning).
    df = micro_data[micro_data["pairing"] == pairing]
    df = df.assign(x=df["goal"].map(goal_to_x))

    ax = sns.lineplot(x="x", y="time", style="execution_type", ci=None, markers=True, dashes=False, color="black", data=df, ax=ax)

    ax.set_xticks(range(4))
    ax.set_xticklabels([1.0, 0.5, 0.2, 0.1])

    # Apply the limits to the data axes explicitly. plt.ylim() would act on
    # the most recently created figure, i.e. the legend figure. Pairings that
    # are not listed keep the automatic y axis instead of failing on an
    # undefined ymax.
    if pairing in pairing_y_axis:
        ymax, ysteps = pairing_y_axis[pairing]
        ax.set_ylim(0, ymax+1)
        ax.set_yticks(np.arange(0, ymax+1, ysteps))

    ax.set_ylabel("CPU Time (s)")
    ax.set_xlabel("Final Work Constraint")

    # Move the legend (minus its first entry) onto the separate legend figure.
    handles, labels = ax.get_legend_handles_labels()
    leg = fig_legend.legend(handles=handles[1:], labels=labels[1:], loc="upper left", ncol=4, frameon=False)
    axi.xaxis.set_visible(False)
    axi.yaxis.set_visible(False)
    fig_legend.canvas.draw()
    sns.despine(left=True, bottom=True, right=True, ax=axi)

    # Drop the legend from the data figure.
    legend = ax.legend()
    legend.remove()

    # Lay out both figures explicitly rather than whichever one is current.
    fig.tight_layout()
    fig_legend.tight_layout()
    fig.savefig("../img/micro_{}.pdf".format(pairing), bbox_inches="tight")
    fig_legend.savefig("../img/micro_legend.pdf", bbox_inches="tight")
    plt.show()

    # Close both figures; a bare plt.close() only closes the current one.
    plt.close(fig)
    plt.close(fig_legend)

In [None]:
# Preview the first five rows of the cleaned micro-benchmark data.
micro_data.head(n=5)

In [None]:
# Keep the non-iShare runs of every pairing except pairB, for goals 1.0, 0.1
# and 2.0 only.
selected_rows = (
    (micro_data["execution_type"] != "iShare") &
    (micro_data["pairing"] != "pairB") &
    micro_data["goal"].isin([1.0, 0.1, 2.0])
)
# .copy() makes the selection independent of micro_data, so the relabelling
# below does not write into a slice (SettingWithCopyWarning).
df = micro_data[selected_rows].copy()

# Collapse the execution types into the two classes "NoShare" and "Share".
df["execution_type"] = df["execution_type"].map(lambda x: "NoShare" if "NoShare" in x else "Share")

# Average the numeric columns per (class, pairing, goal). numeric_only=True
# keeps string columns such as "share" out of the mean, which would otherwise
# raise on recent pandas versions.
df = df.groupby(["execution_type", "pairing", "goal"]).mean(numeric_only=True).reset_index()
df

# ax = sns.barplot(x="execution_type", y="time", data=df)

# labels = df["execution_type"]
# legend_labels = [mpatches.Patch(color=color_palette[i], label=label) for i, label in enumerate(labels)]
# plt.legend(handles=legend_labels)

# plt.tick_params(
#     axis='x',          
#     which='both',      
#     bottom=False,
#     top=False,         
#     labelbottom=False)

# ax.set_xlabel("")
# ax.set_ylabel("CPU Time (s)")

# plt.title("1.0")

In [None]:
paper_rc = {'lines.linewidth': 3, 'lines.markersize': 12.5}
sns.set_context("paper", rc=paper_rc, font_scale=2.6)

# Uses the per-(class, pairing, goal) averages held in `df`.
data = df

for pairing in data["pairing"].unique():
    fig, ax = plt.subplots(1)
    # Second figure that only carries the shared legend.
    fig_legend, axi = plt.subplots(1)

    # Reshape to one row per goal and one column per execution class.
    temp_data = data[data["pairing"] == pairing]
    temp_data = temp_data.drop("pairing", axis=1)
    temp_data = temp_data.pivot(columns="execution_type", index="goal")
    temp_data.columns = temp_data.columns.get_level_values(1)
    temp_data = temp_data[["Share", "NoShare"]]
    # Largest goal first.
    temp_data = temp_data.sort_index(ascending=False)
    ax = temp_data.plot(kind="bar", legend=False, ax=ax, color=color_palette[3:5])

    ax.tick_params(axis='x', rotation=0)
    ax.set_xticks([0, 0.9, 2])
    ax.set_xticklabels(["Batch", "High+High", "High+Low"])
    ax.set_xlabel("")

    ax.set_ylabel("CPU Time (s)")

    if pairing == "pairC":
        ax.set_yticks(np.arange(0, 125+1, 25))
    else:
        ax.set_yticks(np.arange(0, 100+1, 25))

    # Put the legend on its own figure.
    handles, labels = ax.get_legend_handles_labels()
    leg = fig_legend.legend(handles=handles[0:], labels=labels[0:], loc="upper left", ncol=2, frameon=False)
    axi.xaxis.set_visible(False)
    axi.yaxis.set_visible(False)
    fig_legend.canvas.draw()
    sns.despine(left=True, bottom=True, right=True, ax=axi)

    # Lay out both figures explicitly. plt.tight_layout() would only act on
    # the current figure, which here is the legend figure, not the bar chart.
    fig.tight_layout()
    fig_legend.tight_layout()

    # NOTE(review): "motiviation" looks like a typo for "motivation"; the name
    # is left unchanged because other files may reference it -- confirm before
    # renaming.
    fig.savefig("../img/motiviation_{}.pdf".format(pairing), bbox_inches="tight")
    fig_legend.savefig("../img/motivation_legend.pdf", bbox_inches="tight")

    plt.show()

    # Close both figures; a bare plt.close() only closes the current one.
    plt.close(fig)
    plt.close(fig_legend)