# Experiment 02: Ablation Study for Cardinality Distortion

Process this notebook like so to generate the PDF output:

```bash
jupyter execute --inplace 01-Cardinality-Distortion.ipynb
jupyter nbconvert --to pdf 01-Cardinality-Distortion.ipynb
```

# Internals

The cells in this section can be ignored in the PDF output. They perform the technical aspects of the data analysis.
This includes generating a full version of the distortion plots (Figure 5 in the original paper) and selecting the best
replacement queries.
The plots for all queries and both cost models are exported to the `results/eval/experiment-02-distortion-ablation` directory.

Please take a look at the following sections to see the actual outputs.

In [None]:
import itertools
import json
from pathlib import Path
from collections.abc import Iterable
from typing import Literal, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import figure

from postbound.experiments import workloads
from postbound.db import postgres

In [None]:
# Locations of the raw experiment results and of the generated evaluation artifacts.
results_base = Path("/ari/results/experiment-02-distortion-ablation")
output_dir = Path("/ari/results/eval/experiment-02-distortion-ablation/")
output_dir.mkdir(parents=True, exist_ok=True)

# Tell postbound where the workload definitions are stored.
workloads.workloads_base_dir = "/ari/postbound/workloads"

# Global plot appearance: figure size and the font type used for PDF/PS output.
plt.rcParams.update(
    {
        "figure.figsize": (7, 6),
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
    }
)
sns.set_style("whitegrid")
sns.set_context("talk")

In [None]:
# Ablation stages: numeric stage id (as used in the result file names and the
# "stage" column of the result files) -> symbolic stage name.
stages = dict(
    enumerate(
        ("minimal", "textbook", "textbook-opt", "intermediates", "parallel"),
        start=1,
    )
)


def load_pg_explain(raw_explain: str) -> Optional[postgres.PostgresExplainPlan]:
    """Parse a JSON-encoded EXPLAIN plan into a ``PostgresExplainPlan``.

    The function is used as a ``pd.read_csv`` converter for the ``query_plan``
    column, so it also has to cope with cells that carry no plan at all.

    Parameters
    ----------
    raw_explain
        The raw JSON text of the plan.

    Returns
    -------
    The parsed plan, or ``None`` if the text is blank or decodes to an empty
    JSON value (e.g. ``null``, ``{}`` or ``[]``).
    """
    # A blank cell is not valid JSON; treat it like any other "no plan" marker
    # instead of letting json.loads raise.
    if isinstance(raw_explain, str) and not raw_explain.strip():
        return None

    plan_json = json.loads(raw_explain)
    if not plan_json:
        return None
    return postgres.PostgresExplainPlan(plan_json)


def load_results(workload: workloads.Workload) -> Optional[pd.DataFrame]:
    """Load all distortion results that are available for a workload.

    For every combination of ablation stage and cost model ("vanilla" and
    "cout") the file
    ``<results_base>/<workload.name>/<workload.name>-distortion-<cost_model>-cost-stage-<stage>.csv``
    is read if it exists. Missing files are skipped silently.

    Each file must provide the columns ``label``, ``stage`` and ``query_plan``.
    ``label`` and ``stage`` are converted to ordered categoricals, the plans are
    parsed with ``load_pg_explain`` and two columns are added: ``plan_hash``
    (``hash()`` of the parsed plan) and ``cost_model``.

    Returns
    -------
    The concatenated results of all files, or ``None`` if no rows were loaded.
    """
    stage_names = list(stages.values())
    frames: list[pd.DataFrame] = []

    for stage, cost_model in itertools.product(stages, ("vanilla", "cout")):
        data_file = (
            results_base
            / workload.name
            / f"{workload.name}-distortion-{cost_model}-cost-stage-{stage}.csv"
        )
        if not data_file.exists():
            continue

        distortion_df = pd.read_csv(
            data_file, converters={"query_plan": load_pg_explain}
        )
        distortion_df["label"] = pd.Categorical(
            distortion_df["label"], categories=workload.labels(), ordered=True
        )
        distortion_df["stage"] = pd.Categorical(
            distortion_df["stage"].map(stages), categories=stage_names, ordered=True
        )
        distortion_df["plan_hash"] = distortion_df["query_plan"].map(hash)
        distortion_df["cost_model"] = cost_model
        frames.append(distortion_df)

    if not frames:
        return None

    # Concatenate once instead of growing a frame inside the loop: this avoids
    # repeated copying and never mixes an empty, column-less frame into the
    # concatenation.
    e2e_df = pd.concat(frames, ignore_index=True)
    return e2e_df if len(e2e_df) > 0 else None


def count_plan_changes(plans: pd.Series) -> int:
    """Count how often consecutive entries of *plans* differ.

    Parameters
    ----------
    plans
        Plan identifiers (e.g. plan hashes) in the order in which they occurred.

    Returns
    -------
    The number of positions ``i`` with ``plans[i] != plans[i + 1]``. Series with
    fewer than two entries contain no change and yield 0.
    """
    values = plans.to_numpy()
    if len(values) < 2:
        return 0

    # Compare neighbours on the raw values. Shifting the series would introduce
    # a NaN and thereby upcast integer hashes to float64, which cannot represent
    # all 64-bit integers exactly and could make distinct hashes compare equal.
    return int((values[1:] != values[:-1]).sum())


def count_jump_backs(plans: pd.Series) -> int:
    """Count how often the plan sequence switches back to an earlier plan.

    Runs of identical consecutive plans are treated as a single occurrence. A
    jump back is a switch to a plan that differs from the directly preceding
    plan but has already been seen earlier in the sequence.

    Parameters
    ----------
    plans
        Plan identifiers (e.g. plan hashes) in the order in which they occurred.

    Returns
    -------
    The number of jump backs; 0 for an empty series.
    """
    if len(plans) == 0:
        return 0

    jumps = 0
    previous = plans.iloc[0]
    seen = {previous}
    for current in plans.iloc[1:]:
        if current == previous:
            # still the same plan, nothing changed
            continue

        if current in seen:
            jumps += 1
        else:
            seen.add(current)
        previous = current
    return jumps


def make_changes_df(df: pd.DataFrame | None) -> Optional[pd.DataFrame]:
    """Aggregate the raw results into plan-change statistics.

    The ``plan_hash`` column is aggregated per cost model, query label and
    stage into the number of plan changes and the number of jump backs. The
    result is sorted by the grouping columns and carries an additional
    ``plot_label`` column of the form ``"<plan_changes> (<jump_backs>)"``.

    Returns ``None`` if *df* is ``None``.
    """
    if df is None:
        return None

    group_keys = ["cost_model", "label", "stage"]

    def _format_label(row: pd.Series) -> str:
        return f"{row['plan_changes']} ({row['jump_backs']})"

    grouped = df.groupby(group_keys, as_index=False, observed=True)
    changes_df = grouped.agg(
        plan_changes=("plan_hash", count_plan_changes),
        jump_backs=("plan_hash", count_jump_backs),
    )
    changes_df = changes_df.sort_values(by=group_keys)
    changes_df["plot_label"] = changes_df.apply(_format_label, axis=1)
    return changes_df


def make_evolution_plot(
    df: pd.DataFrame | None,
    *,
    workload: str,
    cost_model: Literal["vanilla", "cout"],
    export: bool,
    queries: Optional[Iterable[str]] = None,
) -> Optional[figure.Figure]:
    """Draw a heatmap of the plan changes per query and ablation stage.

    Parameters
    ----------
    df
        Plan-change statistics with the columns ``cost_model``, ``label``,
        ``stage``, ``plan_changes`` and ``plot_label``.
    workload
        Name of the workload; only used (lower-cased) in the export file name.
    cost_model
        The cost model whose rows are plotted. It is also part of the export
        file name.
    export
        Whether the figure is written to
        ``<output_dir>/<workload>-distortion-ablation-<cost_model>.pdf``.
    queries
        Optional labels to restrict the plot to. Rows appear in the given order.
        If omitted or empty, all queries are plotted.

    Returns
    -------
    The (already closed) figure, or ``None`` if *df* is ``None`` or contains no
    rows for the requested cost model.
    """
    if df is None:
        return None

    # The colour scale is derived from the frame as passed in, i.e. before any
    # filtering, so that all plots created from the same frame share one scale.
    vmin, vmax = 0, df["plan_changes"].max()
    cmap = sns.color_palette("Blues", 12, as_cmap=True)

    # Only plot the requested cost model. Without this filter every
    # (label, stage) pair occurs once per cost model and cannot be pivoted.
    df = df[df["cost_model"] == cost_model].copy()
    if df.empty:
        return None

    stages_pretty = {
        "minimal": "SeqScan\nNestLoopJ\n(minimal)",
        "textbook": "+\nIdxScan\nMergeJ.\nHashJ.",
        "textbook-opt": "+\nIdxOnlyS.\nBitmapS.",
        "intermediates": "+\nMemoize\nMaterial",
        "parallel": "+\nParallel.\n(full)",
    }
    df["stage_pretty"] = pd.Categorical(
        df["stage"].map(stages_pretty),
        categories=list(stages_pretty.values()),
        ordered=True,
    )

    data_matrix = df.pivot(index="label", columns="stage_pretty", values="plan_changes")
    annot_matrix = df.pivot(index="label", columns="stage_pretty", values="plot_label")

    # Materialise the selection once: it is needed for the emptiness check and
    # for both lookups, which a one-shot iterable would not survive.
    selected = list(queries) if queries is not None else []
    if selected:
        data_matrix = data_matrix.loc[selected]
        annot_matrix = annot_matrix.loc[selected]

    fig, ax = plt.subplots()
    g = sns.heatmap(
        data_matrix,
        annot=annot_matrix,
        fmt="",
        cmap=cmap,
        vmin=vmin,
        vmax=vmax,
        linewidth=1,
        cbar=False,
        annot_kws={"size": 18, "fontweight": "normal"},
        ax=ax,
    )
    g.set(xlabel="Available physical operators", ylabel="Query")
    g.tick_params(axis="both", which="major", labelsize=16)

    # Thick white separators between the heatmap rows (one line per row boundary).
    for boundary in range(len(data_matrix) + 1):
        ax.axhline(boundary, color="white", lw=7.5)

    if export:
        out_file = (
            output_dir / f"{workload.lower()}-distortion-ablation-{cost_model}.pdf"
        )
        fig.tight_layout()
        fig.savefig(out_file)

    plt.close(fig)
    return fig


def _label_of_max(df: pd.DataFrame, column: str) -> Optional[str]:
    """Return the label of the first row with the largest *column* value, or None if *df* is empty."""
    if df.empty:
        return None
    # Series.argmax() yields a position, so it must be paired with positional
    # access; the frames passed in are usually filtered and have index gaps.
    return df["label"].iloc[df[column].argmax()]


def select_replacement_queries(
    df: pd.DataFrame | None, *, workload: str, export: bool = False
) -> Optional[figure.Figure]:
    """Plot a small set of representative queries for the "cout" cost model.

    Up to four queries are selected (duplicates are only kept once):

    1. the query with the most plan changes at the minimal stage,
    2. the query with the most jump backs at the minimal stage,
    3. the query with the largest absolute difference in plan changes between
       the minimal and the parallel (full) stage,
    4. the query with the most plan changes at an intermediate stage, provided
       that number exceeds the plan changes at both the minimal and the
       parallel stage. If no query qualifies, this criterion is skipped.

    Parameters
    ----------
    df
        Plan-change statistics with the columns ``cost_model``, ``label``,
        ``stage``, ``plan_changes`` and ``jump_backs``.
    workload
        Name of the workload, forwarded to ``make_evolution_plot``.
    export
        Forwarded to ``make_evolution_plot``. Note that the export file name
        used there only depends on the workload and the cost model.

    Returns
    -------
    The figure produced by ``make_evolution_plot``, or ``None`` if *df* is
    ``None`` or no query could be selected.
    """
    if df is None:
        return None

    df = df.query("cost_model == 'cout'")
    df_stage1 = df.query("stage == 'minimal'")
    df_stage5 = df.query("stage == 'parallel'")

    # plan changes at the two extreme stages, side by side per query
    df_extremes = pd.merge(
        df_stage1[["label", "plan_changes"]],
        df_stage5[["label", "plan_changes"]],
        on="label",
        suffixes=("_min", "_full"),
    )
    df_extremes["plan_changes_diff"] = (
        df_extremes["plan_changes_full"] - df_extremes["plan_changes_min"]
    ).abs()
    df_extremes["extremes_changes"] = df_extremes[
        ["plan_changes_min", "plan_changes_full"]
    ].max(axis=1)

    # intermediate stages whose plan changes exceed both extreme stages
    df_inners = pd.merge(
        df[~df["stage"].isin(["minimal", "parallel"])][["label", "plan_changes"]],
        df_extremes[["label", "extremes_changes"]],
        on="label",
    ).query("plan_changes > extremes_changes")

    candidates = [
        _label_of_max(df_stage1, "plan_changes"),
        _label_of_max(df_stage1, "jump_backs"),
        _label_of_max(df_extremes, "plan_changes_diff"),
        _label_of_max(df_inners, "plan_changes"),
    ]

    target_queries: list[str] = []
    for label in candidates:
        if label is not None and label not in target_queries:
            target_queries.append(label)

    if not target_queries:
        return None

    return make_evolution_plot(
        df, workload=workload, cost_model="cout", export=export, queries=target_queries
    )


def make_summary(df: pd.DataFrame | None) -> Optional[pd.DataFrame]:
    if df is None:
        return None

    extremes_df = (
        df.drop(columns=["plot_label", "stage_pretty"], errors="ignore")
        .query("stage == 'minimal' | stage == 'parallel'")
        .pivot(columns=["stage"], index=["cost_model", "label"])
    )
    extremes_df.columns = [
        f"{measure}_{stage[:3]}"
        for measure, stage in extremes_df.columns.to_flat_index()
    ]
    extremes_df.reset_index(inplace=True)

    plans_min_less_full = (
        extremes_df.query("plan_changes_min < plan_changes_par")
        .groupby("cost_model", as_index=False)["label"]
        .count()
    )

    plans_min_larger_full = (
        extremes_df.query("plan_changes_min > plan_changes_par")
        .groupby("cost_model", as_index=False)["label"]
        .count()
    )

    plans_intermediate_max = (
        df.merge(
            df.query("stage == 'minimal'")[["cost_model", "label", "plan_changes"]],
            on=["cost_model", "label"],
            suffixes=("", "_min"),
        )
        .merge(
            df.query("stage == 'parallel'")[["cost_model", "label", "plan_changes"]],
            on=["cost_model", "label"],
            suffixes=("", "_full"),
        )
        .query(
            "stage != 'minimal' & stage != 'parallel' & plan_changes > plan_changes_min & plan_changes > plan_changes_full"
        )
        .groupby(["cost_model", "label"], as_index=False, observed=True)["plan_changes"]
        .agg(lambda xs: 1)
        .groupby("cost_model", as_index=False, observed=True)["label"]
        .count()
    )

    jumps_min_less_full = (
        extremes_df.query("jump_backs_min > jump_backs_par")
        .groupby("cost_model", as_index=False)["label"]
        .count()
    )

    jumps_min_larger_full = (
        extremes_df.query("jump_backs_min < jump_backs_par")
        .groupby("cost_model", as_index=False)["label"]
        .count()
    )

    jumps_intermediate_max = (
        df.merge(
            df.query("stage == 'minimal'")[["cost_model", "label", "jump_backs"]],
            on=["cost_model", "label"],
            suffixes=("", "_min"),
        )
        .merge(
            df.query("stage == 'parallel'")[["cost_model", "label", "jump_backs"]],
            on=["cost_model", "label"],
            suffixes=("", "_full"),
        )
        .query(
            "stage != 'minimal' & stage != 'parallel' & jump_backs > jump_backs_min & jump_backs > jump_backs_full"
        )
        .groupby(["cost_model", "label"], as_index=False, observed=True)["jump_backs"]
        .agg(lambda xs: 1)
        .groupby("cost_model", as_index=False, observed=True)["label"]
        .count()
    )

    summary_df = pd.concat(
        [
            plans_min_less_full,
            plans_min_larger_full,
            plans_intermediate_max,
            jumps_min_less_full,
            jumps_min_larger_full,
            jumps_intermediate_max,
        ],
        names=["statistic"],
        keys=[
            "nPlans / min < full",
            "nPlans / min > full",
            "nPlans / intermediate = max",
            "nJumps / min < full",
            "nJumps / min > full",
            "nJumps / intermediate = max",
        ],
    )
    summary_df.rename(columns={"label": "n_queries"}, inplace=True)
    return summary_df

In [None]:
# Workload objects for the two benchmarks evaluated in this notebook.
job = workloads.job()
stats = workloads.stats()

In [None]:
# Raw distortion results per workload (None if no result files were found).
job_df = load_results(job)
stats_df = load_results(stats)

In [None]:
# Plan changes and jump backs per cost model, query and ablation stage.
job_changes = make_changes_df(job_df)
stats_changes = make_changes_df(stats_df)

In [None]:
# Export the full heatmaps for every workload / cost model combination.
make_evolution_plot(job_changes, workload=job.name, cost_model="cout", export=True)
make_evolution_plot(job_changes, workload=job.name, cost_model="vanilla", export=True)
make_evolution_plot(stats_changes, workload=stats.name, cost_model="cout", export=True)
make_evolution_plot(
    stats_changes, workload=stats.name, cost_model="vanilla", export=True
)


# Replacement plot

The main takeaway from Section 4.2 is that the issue of unstable plan selection remains,
even if the plan enumerator and the cost model are heavily simplified. The example queries shown in Figure 5 try to paint a
representative picture of the potential plan behavior at different ablation stages.

The suggested replacement plot shows the following queries for the simplified cost model:

1. query with the largest number of plan changes at the _minimal_ stage (only sequential scan and nested-loop join)
2. query with the most jumps to earlier plans at the _minimal_ stage (only sequential scan and nested-loop join)
3. query with the largest difference between plans at the _minimal_ stage (only sequential scan and nested-loop join) and
   the _full_ stage (all operators and parallelization)
4. query with the largest number of plan changes at an intermediate stage (i.e. neither _minimal_, nor _full_) where this
   number also exceeds the number of plan changes at the _minimal_ and _full_ stages

In [None]:
# Build the replacement plot for the JOB workload from the plan-change statistics.
select_replacement_queries(job_changes, workload=job.name, export=False)

# Aggregated statistics

In [None]:
# Aggregated plan-change / jump-back statistics for the JOB workload.
make_summary(job_changes)