# Selector runtime: stacked & grouped bar chart

Expected output:
![selector_time](../demo_out/selector_timefmow.png)


In [None]:
# Notebook configuration: dataset to plot, selector-log locations, and output target.
DATASET = "fmow"

# Selector log folders for the two "sgs3" / "sgs4" data locations.
PATH3_SE = f"../clean_data/sgs3/growing/{DATASET}/new_logs_selector_backup"
PATH4_SE = f"../clean_data/sgs4/growing/{DATASET}/new_logs_selector_backup"

# Per-dataset accent colour (used for the axes spines); any dataset not listed
# here falls back to the default colour.
TITLE_COLOR = {
    "fmow": "#6F9074",
    "huffpost": "#745474",
}.get(DATASET, "#FC6A49")

# Folder the generated PDFs are written to.
OUTPUT_FOLDER = "out_clean"


In [None]:
import json
import os

import matplotlib as mpl
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import pandas as pd
import pylab
import scienceplots
import seaborn as sns
from matplotlib.patches import Patch

from chart_style_mpl import set_size

In [None]:
# Load the pre-merged selector log (a JSON document) into a DataFrame.
CONFIG_PARSING_FILE = "../config_histograms/config_selector.json"
FILE_NAME = "/path/to/your/log/file/read/below"  # placeholder: point this at your log file

# Use a context manager so the file handle is closed deterministically
# (the previous `json.load(open(...))` left it open until garbage collection).
with open(FILE_NAME, "r", encoding="utf-8") as log_file:
    file_content = json.load(log_file)
merged_df = pd.DataFrame(file_content)

In [None]:
# Experiment names (the index of merged_df) bucketed into named groups.
groups = {
    "D_25_2": [
        "GradNorm_25pc_2ep_BTS",
        "Margin_25pc_2ep_BTS",
        "SubmodularFL_25pc_2ep_BTS",
        "Kcenter_25pc_2ep_BTS",
        "GradMatch_25pc_2ep_BTS",
    ],
    "D_25_4": [
        "GradNorm_25pc_4ep_BTS",
        "Margin_25pc_4ep_BTS",
        "SubmodularFL_25pc_4ep_BTS",
        "Kcenter_25pc_4ep_BTS",
        "GradMatch_25pc_4ep_BTS",
    ],
    "P_25_1": [
        "random_25pc_1ep",
        "randomNoRep_25pc_1ep",
        "labelBalanced_25pc_1ep",
        "triggerBalanced_25pc_1ep",
    ],
    "P_50_1": [
        "random_50pc_1ep",
        "randomNoRep_50pc_1ep",
        "labelBalanced_50pc_1ep",
        "triggerBalanced_50pc_1ep",
    ],
    "P_25_2": [
        "random_25pc_2ep",
        "randomNoRep_25pc_2ep",
        "labelBalanced_25pc_2ep",
        "triggerBalanced_25pc_2ep",
    ],
    "P_75_1": [
        "random_75pc_1ep",
        "randomNoRep_75pc_1ep",
        "labelBalanced_75pc_1ep",
        "triggerBalanced_75pc_1ep",
    ],
    "P_25_4": [
        "random_25pc_4ep",
        "randomNoRep_25pc_4ep",
        "labelBalanced_25pc_4ep",
        "triggerBalanced_25pc_4ep",
    ],
    "P_50_2": [
        "random_50pc_2ep",
        "randomNoRep_50pc_2ep",
        "labelBalanced_50pc_2ep",
        "triggerBalanced_50pc_2ep",
    ],
    "A": ["all_data"],
    "A+": ["all_data_2ep", "all_data_4ep"],
}

# Reverse lookup experiment -> group. `setdefault` keeps the first group (in dict
# order) that lists an experiment, matching a first-match scan over `groups`.
group_of_experiment = {}
for group_name, members in groups.items():
    for member in members:
        group_of_experiment.setdefault(member, group_name)

# Experiments not listed in any group are labelled "Extra".
labels = [group_of_experiment.get(experiment, "Extra") for experiment in merged_df.index]
merged_df["group"] = labels

In [None]:
# Build a compact display label for every experiment in merged_df's index.
# Full-data runs use a fixed label; all other names are expected to look like
# "<strategy>_<ratio>pc_<epochs>ep[...]" and become "<strategy> <ratio>\%, <epochs>e".
all_data_short_names = {
    "all_data": "all 1e",
    "all_data_2ep": "all 2e",
    "all_data_4ep": "all 4e",
    "all_data_4ep_tail5": "all 4e t5",
}

short_names = []
for el in merged_df.index.tolist():
    if "all_data" not in el:
        spl = el.split("_")
        ratio = spl[1].split("pc")[0]
        epochs = spl[2].split("ep")[0]
        # "\\%" yields a literal backslash + percent sign (an escaped % for LaTeX).
        # The previous bare "\%" was an invalid escape sequence, which triggers a
        # SyntaxWarning on recent Python versions.
        short_name = f"{spl[0]} {ratio}\\%, {epochs}e"
    else:
        short_name = all_data_short_names[el]
    short_names.append(short_name)
merged_df["short_name"] = short_names

## Selector time for presampling strategies

In [None]:
# Restrict the chart data to the presampling groups ("P...") and the full-data
# groups ("A", "A+").
group_names = merged_df.group
chart_df = merged_df[group_names.str.startswith("P") | group_names.str.startswith("A")].copy()

# Derive strategy / ratio / epochs from the experiment name stored in the index.
# Presampling runs are named "<strategy>_<ratio>pc_<epochs>ep"; full-data runs
# are named "all_data" or "all_data_<epochs>ep".
presampling_strategy = []
presampling_ratio = []
training_epochs = []
for experiment in chart_df.index:
    tokens = experiment.split("_")
    if "all" in experiment:
        presampling_strategy.append("No presampling")
        presampling_ratio.append(100)
        # Take the epoch count from a trailing "<n>ep" token when present
        # ("all_data_2ep" -> 2); plain "all_data" stays at 1. Previously every
        # full-data run was recorded as 1 epoch.
        last_token = tokens[-1]
        if last_token.endswith("ep") and last_token[:-2].isdigit():
            training_epochs.append(int(last_token[:-2]))
        else:
            training_epochs.append(1)
    else:
        presampling_strategy.append(tokens[0])
        presampling_ratio.append(int(tokens[1].split("pc")[0]))
        training_epochs.append(int(tokens[2].split("ep")[0]))
chart_df["presampling_strategy"] = presampling_strategy
chart_df["presampling_ratio"] = presampling_ratio
chart_df["training_epochs"] = training_epochs

In [None]:
# Average the selected columns per (strategy, ratio) pair, then turn the grouped
# index back into ordinary columns by round-tripping through a record array.
selected_columns = ["presampling_strategy", "training_epochs", "presampling_ratio", "SQL query", "Store samples", "selector"]
grouping_keys = ["presampling_strategy", "presampling_ratio"]
group_means = chart_df[selected_columns].groupby(by=grouping_keys).mean()
chart_df = pd.DataFrame(group_means.to_records())

In [None]:
# Combined "SQL query" + "Store samples" value per row, stored as column "sum".
chart_df["sum"] = chart_df["SQL query"].add(chart_df["Store samples"])

In [None]:
# Stacked + grouped bar chart: bars are grouped by presampling ratio (hue) within
# each presampling strategy (x axis), and two bar layers are drawn on top of
# each other to get the stacked look.
#
# The style sheet and rcParams are configured BEFORE the figure is created.
# matplotlib reads `figure.figsize` when the figure is instantiated, and
# style-sheet settings when artists are created, so setting them afterwards
# only took effect on a second run of the cell.
plt.style.use(['science'])
tex_fonts = {
    # Use LaTeX to write all text
    "text.usetex": True,
    "font.family": "Helvetica",
    # Font sizes for axis labels and general text
    "axes.labelsize": 8,
    "font.size": 8,
    # Font sizes for the legend and the tick labels
    "legend.fontsize": 10,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8
}
plt.rcParams['figure.figsize'] = set_size(fraction=1.5, subplots=(1.3,3))
plt.rcParams.update(tex_fonts)

fig, ax = plt.subplots()
# Back layer: full bar height ("sum" column), vertical hatch.
sns.barplot(data=chart_df, x='presampling_strategy', y='sum', hue='presampling_ratio', hatch=r"||||", edgecolor="black", linewidth=.1, palette="tab10", ax=ax)
# Front layer: "SQL query" only, horizontal hatch, drawn over the same bars so
# the still-visible upper part of the back layer is the remainder of "sum".
sns.barplot(data=chart_df, x='presampling_strategy', y='SQL query', hue='presampling_ratio', hatch=r"----", edgecolor="white", linewidth=.1, palette="tab10", ax=ax)

# The legend is exported as a separate figure, so drop the automatic one.
ax.get_legend().remove()
ax.set_xlabel("")
ax.set_ylabel("Total Selector Time")
for spine in ["bottom", "top", "right", "left"]:
    ax.spines[spine].set_color(TITLE_COLOR)
ax.set_ylim([0, 25])

# savefig does not create missing directories.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
fig.savefig(f"{OUTPUT_FOLDER}/selector_time{DATASET}.pdf", format='pdf', bbox_inches='tight')


In [None]:
# Stand-alone legend figure: an empty, axis-less figure that only carries a
# legend, saved cropped to the legend's bounding box.
tab10 = mpl.colormaps["tab10"]
fig, ax = plt.subplots()
ax.axis(False)

# Fully transparent handle, used for the two header entries and as an empty filler cell.
blank_handle = Patch(color='white', alpha=0)

# Colour handles for the presampling ratios (tab10 colours 0-3).
ratio_25_handle = Patch(facecolor=tab10(0))
ratio_50_handle = Patch(facecolor=tab10(1))
ratio_75_handle = Patch(facecolor=tab10(2))
no_presampling_handle = Patch(facecolor=tab10(3))

# Hatch handles for the two stacked bar layers.
sql_query_handle = Patch(facecolor="white", hatch="-------", linewidth=0.1, edgecolor="black")
store_samples_handle = Patch(facecolor="black", hatch="|||||||", linewidth=0.01, edgecolor="white")

# Handles and labels are listed pairwise (grouping entry, stacking entry).
# NOTE(review): with ncol=5 each pair presumably ends up in one legend column - confirm on the rendered PDF.
legend_handles = [
    blank_handle, blank_handle,
    ratio_25_handle, sql_query_handle,
    ratio_50_handle, store_samples_handle,
    ratio_75_handle, blank_handle,
    no_presampling_handle,
]
# Raw strings keep the LaTeX backslashes literal; the previous non-raw "25\% ..."
# literals were invalid escape sequences (SyntaxWarning on recent Python versions).
legend_labels = [
    r"\textbf{Grouping}", r"\textbf{Stacking}",
    r"25\% presampling", "SQL query",
    r"50\% presampling", "Store samples",
    r"75\% presampling", "",
    "No presampling",
]

# Attach the legend to this figure explicitly instead of relying on pylab's
# implicit "current figure".
legend = fig.legend(legend_handles, legend_labels, loc="center", bbox_to_anchor=(0.5, 0.5), ncol=5, frameon=False)
# Draw once so the legend's window extent is available for cropping.
fig.canvas.draw()

# savefig does not create missing directories.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
fig.savefig(f"{OUTPUT_FOLDER}/growing_legend_selector.pdf", format='pdf', bbox_inches=legend.get_window_extent().transformed(fig.dpi_scale_trans.inverted()), transparent=True)