# Experiment 07: Beyond Textbook Optimizers

Run the following commands to execute this notebook and generate the PDF output:

```bash
jupyter execute --inplace 05-Beyond-Textbook.ipynb
jupyter nbconvert --to pdf --TagRemovePreprocessor.remove_cell_tags='{"hide"}' 05-Beyond-Textbook.ipynb
```

# Internals

The cells in this section can be ignored in the PDF output. They perform the technical aspects of the data analysis.
This mainly involves creating data shift plots and determining the best replacements for the original plots.

Please take a look at the following sections to see the actual outputs.

In [None]:
import json
import warnings
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import figure, ticker

from postbound.db import postgres
from postbound.experiments import workloads

In [None]:
# Raw experiment results are read from `results_base`; everything this notebook produces goes to `output_dir`.
results_base = Path("/ari/results/experiment-07-beyond-textbook/")
output_dir = Path("/ari/results/eval/experiment-07-beyond-textbook/")
output_dir.mkdir(parents=True, exist_ok=True)
# Point the workloads module at the directory that contains the workload definitions.
workloads.workloads_base_dir = "/ari/postbound/workloads"
# Global plot styling shared by all figures created in this notebook.
plt.rcParams["figure.figsize"] = (7, 5)
# Font type 42 makes matplotlib embed TrueType fonts in PDF/PS output.
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["ps.fonttype"] = 42
sns.set_style("whitegrid")
sns.set_context("talk")

In [None]:
def load_pg_explain(raw_explain: str) -> Optional[postgres.PostgresExplainPlan]:
    """Parse a JSON-serialized EXPLAIN plan into a plan object.

    Args:
        raw_explain: The JSON text of the plan, as stored in the result CSV.

    Returns:
        The parsed plan, or None if the text is blank or decodes to an empty
        JSON value (e.g. ``{}``, ``[]`` or ``null``).

    Raises:
        json.JSONDecodeError: If the text is non-blank but not valid JSON.
    """
    # pandas hands empty CSV fields to converters as "" - treat a missing plan
    # like an empty one instead of letting json.loads() abort the whole load.
    if isinstance(raw_explain, str) and not raw_explain.strip():
        return None

    plan_json = json.loads(raw_explain)
    if not plan_json:
        return None
    return postgres.PostgresExplainPlan(plan_json)


def read_df(workload: workloads.Workload) -> Optional[pd.DataFrame]:
    """Load the data-shift measurements of a workload.

    The measurements are expected in ``<results_base>/<lowercase workload name>/data-shift.csv``.

    Args:
        workload: The workload whose results should be loaded.

    Returns:
        The measurements with parsed query plans and an ordered categorical
        ``label`` column, or None if the result file does not exist.
    """
    csv_path = results_base / workload.name.lower() / "data-shift.csv"
    if csv_path.exists():
        samples = pd.read_csv(csv_path, converters={"query_plan": load_pg_explain})
        # Ordering the labels like the workload does keeps later sorts in workload order.
        ordered_labels = pd.Categorical(samples["label"], categories=workload.labels(), ordered=True)
        samples["label"] = ordered_labels
        return samples

    return None


def make_evolution_plots(df: pd.DataFrame | None, *, workload: workloads.Workload) -> dict[str, figure.Figure]:
    """Plot runtime against database size for each query of a workload.

    One figure per query label is written to
    ``<output_dir>/<lowercase workload name>/data-shift-<label>.pdf`` and closed afterwards.

    Args:
        df: The measurements of the workload. Must provide the columns
            ``label``, ``plan_type``, ``fill_ratio`` and ``total_runtime``.
        workload: The workload that the measurements belong to.

    Returns:
        The (already closed) figures keyed by query label. Empty if `df` is None.
    """
    plots: dict[str, figure.Figure] = {}
    if df is None:
        return plots

    plan_type_names = {"native-fixed": "PG native", "robust-fixed": "Robust (UES)"}

    for label in workload.labels():
        current_samples = df.query("label == @label").copy()
        current_samples["plan_type"] = current_samples["plan_type"].map(plan_type_names)
        # Express the database size relative to a fill ratio of 0.6, which is plotted as 100%.
        current_samples["db_size"] = (current_samples["fill_ratio"] / 0.6) * 100

        fig, ax = plt.subplots()
        sns.lineplot(data=current_samples, x="db_size", y="total_runtime",
                     hue="plan_type", style="plan_type",
                     markers=True, dashes=False, ax=ax)
        # Mark the 100% database size.
        ax.axvline(100.0, color="grey", linestyle=":")
        ax.set_xlabel("Database size")
        ax.set_ylabel("Runtime [s]")
        ax.set_title(f"{workload.name} query {label}")
        ax.xaxis.set_major_formatter(ticker.PercentFormatter())
        ax.legend(title="Optimizer type")
        fig.tight_layout()

        target_dir = output_dir / workload.name.lower()
        target_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(target_dir / f"data-shift-{label}.pdf")
        # Closing releases the figure from pyplot; the object itself is still handed to the caller.
        plt.close(fig)

        plots[label] = fig

    return plots


def linear_approx(fill_ratios: pd.Series, *, min_fr, max_fr, min_rt, max_rt) -> pd.Series:
    """Evaluate the straight line through two (fill ratio, runtime) points.

    Args:
        fill_ratios: The fill ratios at which the line is evaluated.
        min_fr: Fill ratio of the first point.
        max_fr: Fill ratio of the second point.
        min_rt: Runtime at `min_fr`.
        max_rt: Runtime at `max_fr`.

    Returns:
        The runtime on the line for each entry of `fill_ratios`.
    """
    rise = max_rt - min_rt
    run = max_fr - min_fr
    slope = rise / run
    intercept = min_rt - (slope * min_fr)
    return slope * fill_ratios + intercept

def determine_largest_jump(current_sample: pd.DataFrame) -> pd.Series:
    """Locate the point where the runtime deviates most from a linear trend.

    The trend is the straight line between the runtimes measured at the
    smallest and the largest fill ratio of the sample.

    Args:
        current_sample: Measurements with ``fill_ratio`` and ``total_runtime``
            columns. The smallest and largest fill ratio must each occur in
            exactly one row.

    Returns:
        A series with the entries ``jump_point`` (fill ratio with the largest
        absolute deviation), ``slowdown`` (larger divided by smaller of the
        measured and the interpolated runtime at that point), ``min_rt`` and
        ``max_rt`` (runtimes at the smallest and largest fill ratio).
    """
    ordered = current_sample.sort_values(by="fill_ratio").reset_index(drop=True)
    fill_ratios = ordered["fill_ratio"]
    runtimes = ordered["total_runtime"]

    min_fill_ratio = fill_ratios.min()
    max_fill_ratio = fill_ratios.max()
    # .item() insists on exactly one measurement at each end of the range.
    min_rt = runtimes[fill_ratios == min_fill_ratio].item()
    max_rt = runtimes[fill_ratios == max_fill_ratio].item()

    approx_rts = linear_approx(fill_ratios, min_fr=min_fill_ratio, max_fr=max_fill_ratio,
                               min_rt=min_rt, max_rt=max_rt)
    jump_point = (runtimes - approx_rts).abs().idxmax()

    measured_rt = runtimes.loc[jump_point]
    interpolated_rt = approx_rts.loc[jump_point]
    # The ratio is symmetric: speed-ups and slow-downs relative to the trend count alike.
    jump_slowdown = max(measured_rt, interpolated_rt) / min(measured_rt, interpolated_rt)

    return pd.Series({
        "jump_point": fill_ratios.loc[jump_point],
        "slowdown": jump_slowdown,
        "min_rt": min_rt,
        "max_rt": max_rt,
    })


def make_jumps_df(df: pd.DataFrame | None) -> Optional[pd.DataFrame]:
    if df is None:
        return None

    jumps_df = (df
                .groupby(["plan_type", "label"], as_index=False, observed=True)
                .apply(determine_largest_jump, include_groups=False)
                .assign(rt_diff=lambda samples: samples["max_rt"] - samples["min_rt"])
                .sort_values(by=["label", "plan_type"]))

    return jumps_df


def select_underest_jump_replacement(df: pd.DataFrame | None, *, workload: workloads.Workload, plots: dict[str, figure.Figure]) -> Optional[figure.Figure]:
    """Pick the plot of the query with the most pronounced jump at fill ratios of 0.6 or more.

    A query qualifies if its runtime difference is at least 1, its slowdown is
    at least 1.3 and its jump point is at least 0.6. Among the qualifying rows,
    the one with the largest runtime difference wins.

    Args:
        df: The jump statistics, or None if there are none.
        workload: The analyzed workload. Only used for the warning message.
        plots: The available figures keyed by query label.

    Returns:
        The figure of the selected query, or None if `df` is None or no query
        qualifies (in the latter case a warning is issued).
    """
    if df is None:
        return None

    candidates = df.query("rt_diff >= 1 & slowdown >= 1.3 & jump_point >= 0.6")
    if candidates.empty:
        warnings.warn(f"No jumps during underestimation detected for workload {workload.name}")
        return None

    best_candidate = candidates.iloc[candidates["rt_diff"].argmax()]
    return plots[best_candidate["label"]]


def select_overest_jump_replacement(df: pd.DataFrame | None, *, workload: workloads.Workload, plots: dict[str, figure.Figure]) -> Optional[figure.Figure]:
    """Pick the plot of the query with the most pronounced jump at fill ratios below 0.6.

    A query qualifies if its runtime difference is at least 1, its slowdown is
    at least 1.3 and its jump point is below 0.6. Among the qualifying rows,
    the one with the largest runtime difference wins.

    Args:
        df: The jump statistics, or None if there are none.
        workload: The analyzed workload. Only used for the warning message.
        plots: The available figures keyed by query label.

    Returns:
        The figure of the selected query, or None if `df` is None or no query
        qualifies (in the latter case a warning is issued).
    """
    if df is None:
        return None

    candidates = df.query("rt_diff >= 1 & slowdown >= 1.3 & jump_point < 0.6")
    if candidates.empty:
        warnings.warn(f"No jumps during overestimation detected for workload {workload.name}")
        return None

    best_candidate = candidates.iloc[candidates["rt_diff"].argmax()]
    return plots[best_candidate["label"]]


def make_jump_count_summary(df: pd.DataFrame | None) -> Optional[pd.DataFrame]:
    if df is None:
        return None

    df = df.copy()
    df["plan_type"] = df["plan_type"].map({"native-fixed": "PG native", "robust-fixed": "Robust (UES)"})
    relevant_queries = df.query("rt_diff >= 1").groupby("plan_type", as_index=True).size()
    jumping_queries = df.query("rt_diff >= 1 & slowdown >= 1.3").groupby("plan_type", as_index=True).size()
    results = pd.DataFrame({"relevant_queries": relevant_queries, "jumping_queries": jumping_queries}, index=relevant_queries.index)

    return results


def make_jump_pos_summary(df: pd.DataFrame | None) -> Optional[np.float64]:
    jump_pos = (df
            .query("rt_diff >= 1 & slowdown >= 1.3")
            .assign(jump_pos=lambda sample: np.where(sample["jump_point"] <= 0.6, "overest", "underest"))
            .groupby("jump_pos")
            .size())
    underest_share = 1 - jump_pos.loc["underest"] / jump_pos.sum()
    return underest_share


In [None]:
# Workload handles; the helper functions use their names and query labels.
job = workloads.job()
stats = workloads.stats()

In [None]:
# Either frame is None if the corresponding data-shift.csv does not exist.
df_job = read_df(job)
df_stats = read_df(stats)

In [None]:
# Both calls write one PDF per query; only the JOB figures are kept for the selection further down.
job_plots = make_evolution_plots(df_job, workload=job)
_ = make_evolution_plots(df_stats, workload=stats)

In [None]:
# Largest runtime jump per plan type and JOB query (None if no JOB results were found).
jumps_job = make_jumps_df(df_job)

# Suggested replacement plots

These are the plots that might be the best replacements for the original plots based on the current hardware. See the README
for the motivation behind this strategy.

Please note that these plots are determined automatically using a coarse heuristic. In contrast, the plots in the original
paper were selected semi-automatically using the same statistics but with human oversight. If one of the plots differs
significantly from its counterpart in the paper, please take a look at the remaining plots for the other queries: the
heuristic may have selected a poor replacement plot.

In [None]:
# Figure of the JOB query with the largest runtime difference among jumps below a fill ratio of 0.6.
select_overest_jump_replacement(jumps_job, workload=job, plots=job_plots)

In [None]:
# Figure of the JOB query with the largest runtime difference among jumps at a fill ratio of 0.6 or more.
select_underest_jump_replacement(jumps_job, workload=job, plots=job_plots)

# Aggregated statistics

In [None]:
# Number of relevant (rt_diff >= 1) and jumping (additionally slowdown >= 1.3) JOB queries per plan type.
make_jump_count_summary(jumps_job)

In [None]:
# One minus the fraction of JOB jumps with a jump point above a fill ratio of 0.6.
make_jump_pos_summary(jumps_job)