# Experiment 04: Stability of the Statistics Catalog

Run the following commands to execute this notebook and generate the PDF output:

```bash
jupyter execute --inplace 04-Analyze-Stability.ipynb
jupyter nbconvert --to pdf --TagRemovePreprocessor.remove_cell_tags='{"hide"}' 04-Analyze-Stability.ipynb
```

# Internals

The cells in this section can be ignored in the PDF output. They perform the technical aspects of the data analysis.
Please take a look at the following sections to see the actual outputs.

In [None]:
import json
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import figure, ticker

from postbound.db import postgres
from postbound.experiments import workloads

In [None]:
# Input and output locations used by the rest of the notebook.
# NOTE(review): the directories are named "experiment-05" while the notebook
# title says "Experiment 04" -- confirm which number is intended.
results_base = Path("/ari/results/experiment-05-analyze-stability/")
out_dir = Path("/ari/results/eval/experiment-05-analyze-stability/")
out_dir.mkdir(parents=True, exist_ok=True)
workloads.workloads_base_dir = "/ari/postbound/workloads"

# Global plotting defaults.
plt.rcParams.update(
    {
        "figure.figsize": (10, 4),
        # Type 42 (TrueType) fonts, per matplotlib's rcParams documentation
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
    }
)
sns.set_style("whitegrid")
sns.set_context("talk")

In [None]:
def explain_plan(explain_data: str) -> postgres.PostgresExplainPlan | None:
    """Parse the JSON text of an EXPLAIN result into a Postgres plan object.

    Used as a ``read_csv`` converter for the ``query_result`` column.

    Args:
        explain_data: Raw cell content, expected to hold JSON-encoded plan data.

    Returns:
        The wrapped plan, or ``None`` if the cell is empty or decodes to an
        empty/falsy JSON value.
    """
    # Converters receive the raw cell text. An empty cell arrives as an empty
    # string, which json.loads rejects, so treat it like an empty JSON value.
    if not explain_data or not explain_data.strip():
        return None

    json_data = json.loads(explain_data)
    if not json_data:
        return None
    return postgres.PostgresExplainPlan(json_data)


def n_tables(label: str, *, workload: workloads.Workload) -> int:
    """Return how many tables the query stored under *label* in *workload* references."""
    return len(workload[label].tables())


def read_df(
    workload: workloads.Workload[str], *, geqo_thresh: int = 12
) -> Optional[pd.DataFrame]:
    """Load the raw stability measurements of one workload.

    Reads one CSV file per enumerator ("dynprog" and "geqo") from the workload's
    sub-directory of ``results_base``; files that do not exist are skipped.
    Each loaded frame gains three columns:

    - ``plan_hash``: ``hash()`` of the parsed ``query_result`` plan
    - ``geqo``: ``"on"`` for the geqo file, ``"off"`` otherwise
    - ``triggers_geqo``: whether the query references at least *geqo_thresh* tables

    Args:
        workload: The workload whose results should be loaded.
        geqo_thresh: Minimum number of tables for a query to count as
            triggering GEQO.

    Returns:
        All measurements in one frame with ``label`` as an ordered categorical
        following ``workload.labels()``, or ``None`` if no data was found.
    """
    enumerators = ["dynprog", "geqo"]
    workload_name = workload.name.lower()
    results_dir = results_base / workload_name

    # Collect the per-enumerator frames and concatenate once. Growing the result
    # from an empty DataFrame depends on how pandas infers dtypes for empty
    # frames, which is deprecated behaviour.
    frames: list[pd.DataFrame] = []

    for enum in enumerators:
        data_file = results_dir / f"pg-{workload_name}-analyze-stability_{enum}.csv"
        if not data_file.exists():
            continue

        current_df = pd.read_csv(data_file, converters={"query_result": explain_plan})
        current_df["plan_hash"] = current_df["query_result"].map(hash)
        current_df["geqo"] = "on" if enum == "geqo" else "off"
        current_df["triggers_geqo"] = (
            current_df["label"].apply(n_tables, workload=workload) >= geqo_thresh
        )
        frames.append(current_df)

    if not frames:
        return None

    result_df = pd.concat(frames, ignore_index=True)
    if result_df.empty:
        # The files exist but contain no measurements.
        return None

    result_df["label"] = pd.Categorical(
        result_df["label"], categories=workload.labels(), ordered=True
    )
    return result_df


def plans_per_query(df: pd.DataFrame | None) -> Optional[pd.DataFrame]:
    if df is None:
        return None

    raw_ppq = df.groupby(
        ["label", "geqo", "triggers_geqo"], as_index=False, observed=True
    ).agg(
        n_plans=pd.NamedAgg(column="plan_hash", aggfunc=lambda plans: len(set(plans))),
        min_rt=pd.NamedAgg(column="exec_time", aggfunc="min"),
        max_rt=pd.NamedAgg(column="exec_time", aggfunc="max"),
    )

    ppq = (
        raw_ppq.merge(
            raw_ppq.query("geqo == 'on'")[["label", "n_plans"]],
            on="label",
            how="left",
            suffixes=("", "_geqo"),
        )
        .fillna({"n_plans_geqo": 1})
        .assign(
            n_plans=lambda df: np.where(
                ~df["triggers_geqo"],
                np.max([df["n_plans"], df["n_plans_geqo"]], axis=0),
                df["n_plans"],
            )
        )
        .drop(columns="n_plans_geqo")
        .fillna({"n_plans": 1})
    )

    return ppq


def make_ppq_plot(
    ppq: pd.DataFrame | None,
    *,
    workload: workloads.Workload,
    geqo_thresh: int = 12,
) -> Optional[figure.Figure]:
    """Plot the number of distinct plans for all queries with more than one plan.

    Draws one bar per query and GEQO setting. Bars for the GEQO-enabled setting
    are only shown for queries that reference at least *geqo_thresh* tables. The
    figure is saved as ``<workload name>-plan-stability.pdf`` in ``out_dir``.

    Args:
        ppq: Per-query plan counts as produced by ``plans_per_query``.
        workload: Workload the counts belong to. Used to look up the number of
            tables per query and to name the output file.
        geqo_thresh: Minimum number of tables for a query to count as
            triggering GEQO. Should match the value passed to ``read_df``.

    Returns:
        The (already closed) figure, or ``None`` if *ppq* is ``None``.
    """
    if ppq is None:
        return None

    changing_plans = (
        ppq.query("n_plans > 1")
        .pivot(columns="geqo", index="label", values="n_plans")
        # Make sure both settings exist as columns. If only one of them has
        # changing plans, the pivot lacks the other and the steps below fail.
        .reindex(columns=["off", "on"])
        .reset_index()
        .fillna({"off": 1.0, "on": 1.0})
        .assign(
            # Blank out the GEQO bars of queries below the GEQO threshold.
            on=lambda wide: np.where(
                wide["label"].apply(n_tables, workload=workload) >= geqo_thresh,
                wide["on"],
                np.nan,
            )
        )
        .melt(id_vars="label", var_name="geqo", value_name="n_plans")
    )

    g = sns.barplot(
        changing_plans,
        x="label",
        y="n_plans",
        hue="geqo",
        order=changing_plans["label"].unique(),
    )  # restrict to the labels that are actually present
    g.tick_params(axis="x", rotation=90)
    g.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    g.set(xlabel="Query", ylabel="Distinct plans")
    g.legend(title="GEQO")

    out_file = out_dir / f"{workload.name.lower()}-plan-stability.pdf"
    fig = g.get_figure()
    fig.tight_layout()
    fig.savefig(out_file)
    # Close the figure in pyplot; the returned object can still be displayed.
    # NOTE(review): presumably this keeps the notebook from rendering the plot
    # in the defining cell as well -- confirm.
    plt.close(fig)
    return fig


def plan_changes_summary(ppq: pd.DataFrame | None) -> Optional[pd.DataFrame]:
    if ppq is None:
        return None

    return (
        ppq.query("n_plans > 1")
        .groupby("geqo", as_index=False, observed=True)["n_plans"]
        .count()
        .rename(columns={"n_plans": "changing_queries"})
    )


def runtime_changes_summary(ppq: pd.DataFrame | None) -> Optional[pd.DataFrame]:
    if ppq is None:
        return None

    return (
        ppq.query("n_plans > 1")
        .groupby("geqo", as_index=False)[["min_rt", "max_rt"]]
        .sum()
    )


In [None]:
# Load the three workloads that are analyzed below.
job, stats, stack = workloads.job(), workloads.stats(), workloads.stack()

In [None]:
# Raw measurements per workload; an entry is None if no result data was found.
df_job, df_stats, df_stack = (read_df(wl) for wl in (job, stats, stack))

In [None]:
# Distinct plans and runtime range per query; None is passed through unchanged.
job_ppq, stats_ppq, stack_ppq = (
    plans_per_query(raw) for raw in (df_job, df_stats, df_stack)
)

In [None]:
# Render the plan-stability plot for each workload. make_ppq_plot saves every
# plot as a PDF in out_dir. Only the JOB figure is kept for display later in
# the notebook; the other return values are discarded.
job_ppq_plot = make_ppq_plot(job_ppq, workload=job)
_ = make_ppq_plot(stats_ppq, workload=stats)
_ = make_ppq_plot(stack_ppq, workload=stack)

# Replacement plots

**Replacement for Figure 10:** This plot should show that plan changes happen when GEQO is activated as well as when GEQO is
turned off.

This is an aggregated plot that does not rely on a selection heuristic.

In [None]:
# Show the JOB plan-stability figure that was created in the Internals section.
job_ppq_plot

# Aggregated statistics

These numbers roughly correspond to the numbers reported in the paper.

First, we compute how many queries are affected by plan changes caused by the sampled statistics catalog alone (GEQO is off) or
caused by the statistics catalog together with GEQO (GEQO is on):

In [None]:
# Number of JOB rows with more than one distinct plan, per GEQO setting.
plan_changes_summary(job_ppq)

Second, we compute how large the runtime difference is for queries that select different execution plans.
Again, we compute these numbers for plan changes caused by the sampled statistics catalog alone (GEQO is off) and caused by the
statistics catalog together with GEQO (GEQO is on):

In [None]:
# Summed minimum and maximum runtimes of the JOB rows with more than one
# distinct plan, per GEQO setting.
runtime_changes_summary(job_ppq)