# Experiment 01: Cardinality Distortion

Run the following commands to execute this notebook and convert it to PDF:

```bash
jupyter execute --inplace 01-Cardinality-Distortion.ipynb
jupyter nbconvert --to pdf --TagRemovePreprocessor.remove_cell_tags='{"hide"}' 01-Cardinality-Distortion.ipynb
```

# Internals

The cells in this section can be ignored in the PDF output. They perform the technical aspects of the data analysis.
This mainly involves creating distortion plots (Figure 3 in the original paper) and determining the best replacements for
the original plots.

Please take a look at the following sections to see the actual outputs.

In [None]:
import json
import sys
from collections.abc import Iterable
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import natsort
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import figure, ticker

from postbound.db import db, postgres
from postbound.experiments import workloads
from postbound.optimizer import jointree

In [None]:
# Directory from which load_results reads the raw experiment CSV files.
results_base = Path("/ari/results/experiment-01-cardinality-distortion/")
# Directory into which make_distortion_plot writes the generated PDF plots.
# NOTE(review): this path says "estimation" while the results path says
# "distortion" -- confirm that the differing names are intended.
output_dir = Path("/ari/results/eval/experiment-01-cardinality-estimation/")
output_dir.mkdir(parents=True, exist_ok=True)  # create it up front so savefig cannot fail on a missing folder
# Location of the workload definitions used by workloads.job() / workloads.stats() -- presumably; verify against PostBOUND.
workloads.workloads_base_dir = "/ari/postbound/workloads"
# Notebook-wide default figure size; make_distortion_plot passes its own figsize.
plt.rcParams["figure.figsize"] = (15, 5)
# Font type 42 requests TrueType font embedding in PDF/PS output (see the
# matplotlib rcParams documentation) instead of the default Type 3 fonts.
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["ps.fonttype"] = 42
# Global seaborn look and feel for all plots of this notebook.
sns.set_style("whitegrid")
sns.set_context("talk")

In [None]:
def label_sort(
    df: pd.DataFrame, *, col_names: str | Iterable[str] = "label"
) -> pd.DataFrame:
    """Return a new data frame whose rows are naturally sorted by *col_names*.

    Args:
        df: The data frame to sort. It is not modified.
        col_names: The column (or columns) to sort by.

    Returns:
        The sorted data frame as produced by ``DataFrame.sort_values``.
    """

    def natural_rank(column: pd.Series) -> np.ndarray:
        # index_natsorted yields the row positions in natural order. The
        # argsort of that permutation is the rank of each row, which
        # sort_values can consume as an ordinary numeric sort key.
        natural_order = natsort.index_natsorted(column)
        return np.argsort(natural_order)

    return df.sort_values(by=col_names, key=natural_rank)


def explain_plan(explain_data: str) -> db.QueryExecutionPlan | None:
    """Parse the JSON text of a Postgres EXPLAIN plan into a query execution plan.

    The function is used as a ``read_csv`` converter for the ``query_plan``
    column, so it has to cope with every raw cell value of that column.

    Args:
        explain_data: The raw JSON text of the plan. May be empty or blank.

    Returns:
        The parsed plan, or ``None`` if the text is empty/blank or decodes to
        an empty JSON value (e.g. ``null``, ``[]`` or ``{}``).

    Raises:
        json.JSONDecodeError: If the text is non-blank but not valid JSON.
    """
    # An empty CSV cell reaches a converter as an empty string, which is not
    # valid JSON. Treat it like any other "no plan available" marker.
    if not explain_data or not explain_data.strip():
        return None

    json_data = json.loads(explain_data)
    if not json_data:
        return None
    pg_plan = postgres.PostgresExplainPlan(json_data)
    return pg_plan.as_query_execution_plan()


def parse_physical_qep(
    sample: pd.Series, *, workload: workloads.Workload[str]
) -> jointree.PhysicalQueryPlan | None:
    """Build the physical query plan for one result row.

    Args:
        sample: A result row providing the ``query_plan`` and ``label`` entries.
        workload: The workload used to look up the query by the row's label.

    Returns:
        The plan loaded via ``PhysicalQueryPlan.load_from_query_plan`` (with
        ``operators_only=True``), or ``None`` if the row has no query plan.
    """
    raw_plan = sample["query_plan"]
    if raw_plan is None:
        return None

    labelled_query = workload[sample["label"]]
    plan_loader = jointree.PhysicalQueryPlan.load_from_query_plan
    return plan_loader(raw_plan, query=labelled_query, operators_only=True)


def make_plan_idxs(sample: pd.Series) -> np.ndarray:
    """Number the plan hashes of one query in order of first appearance.

    The first distinct hash receives index 1, the next new hash index 2, and
    so on. Repeated hashes reuse the index of their first occurrence. All
    missing values (NaN/None) are treated as one and the same plan.

    Hashes are compared exactly, without converting them to floats, so large
    integer hashes that differ only in their low bits stay distinct. Empty
    input yields an empty array.

    Args:
        sample: The plan hashes, in the order in which they should be numbered.

    Returns:
        An ``int64`` array with one 1-based plan index per entry of *sample*.
    """
    missing = object()  # shared key so that every NaN entry maps to the same plan
    plan_lookup: dict = {}
    plan_idxs = np.empty(len(sample), dtype=np.int64)
    for pos, plan_hash in enumerate(sample):
        key = missing if pd.isna(plan_hash) else plan_hash
        # setdefault only stores the new index if the plan has not been seen yet
        plan_idxs[pos] = plan_lookup.setdefault(key, len(plan_lookup) + 1)
    return plan_idxs


def load_results(
    file: str, *, workload: workloads.Workload[str]
) -> Optional[pd.DataFrame]:
    """Load one result CSV and derive the per-query plan indexes.

    Only rows with ``distortion_factor <= 4`` are kept. The returned frame is
    sorted by label (in workload order) and distortion factor and carries the
    additional columns ``plan_hash``, ``plan_idx`` and ``timeout``.

    Args:
        file: Name of the CSV file, relative to ``results_base``.
        workload: The workload whose labels define the label ordering.

    Returns:
        The prepared data frame, or ``None`` if the file does not exist.
    """
    csv_path = results_base / file
    if not csv_path.exists():
        return None

    raw_df = pd.read_csv(csv_path, converters={"query_plan": explain_plan})
    results: pd.DataFrame = label_sort(raw_df.query("distortion_factor <= 4"))

    results["label"] = pd.Categorical(
        results["label"], categories=workload.labels(), ordered=True
    )
    # The row order determines which plan is numbered first, so sort before
    # the plan indexes are assigned.
    results = results.sort_values(by=["label", "distortion_factor"])

    results["plan_hash"] = results["query_plan"].apply(hash)
    per_query = results.groupby("label", as_index=False, observed=True)
    results["plan_idx"] = per_query["plan_hash"].transform(make_plan_idxs)
    results["timeout"] = np.isinf(results["runtime"])

    return results


def make_distortion_plot(
    df: pd.DataFrame | None, *, label: str, workload: str, suffix: str
) -> Optional[figure.Figure]:
    """Plot runtime and selected plan of one query over the distortion factor.

    The figure has two rows that share the x axis: a line plot of the runtime
    and a scatter plot of the plan index. It is saved to ``output_dir`` as
    ``<workload>-cardinality-distortion-<suffix>.pdf`` and then closed.

    Args:
        df: The results as returned by ``load_results``.
        label: Label of the query to plot.
        workload: Name of the workload; used in the title and the file name.
        suffix: Last part of the output file name.

    Returns:
        The (already closed) figure, or ``None`` if *df* is ``None`` or
        contains no rows for *label*.
    """
    if df is None:
        return None

    sns.set_context("talk")

    current_sample = df.query("label == @label").sort_values(by="distortion_factor")
    if current_sample.empty:
        # Nothing to draw; bail out before a figure is allocated.
        return None

    fig, (ax_distortion, ax_plans) = plt.subplots(nrows=2, sharex=True, figsize=(7, 6))

    g_distortion = sns.lineplot(
        current_sample, x="distortion_factor", y="runtime", ax=ax_distortion
    )
    g_distortion.set(title=f"{workload} query {label}", ylabel="Runtime [s]")

    # Timeouts are stored as infinite runtimes, but matplotlib rejects NaN/Inf
    # axis limits. Derive the limits from the finite measurements only and
    # keep the automatic limits if there are none.
    runtimes = current_sample["runtime"]
    finite_runtimes = runtimes[np.isfinite(runtimes)]
    if not finite_runtimes.empty:
        fastest = finite_runtimes.min()
        slowest = finite_runtimes.max()
        g_distortion.set_ylim(
            # Leave room below zero when sub-second and multi-second runtimes
            # share one plot, so that the fast measurements stay visible.
            bottom=-10 if fastest < 1.0 and slowest > 5 else 0,
            top=1.1 * slowest,
        )
    g_distortion.axvline(1.0, color="grey", linestyle="dashed")

    g_plans = sns.scatterplot(
        current_sample, x="distortion_factor", y="plan_idx", ax=ax_plans
    )
    g_plans.set(xlabel="Distortion factor", ylabel="Plan")
    # Plan indexes are integers, so only integer ticks make sense.
    g_plans.yaxis.set_major_locator(
        ticker.MaxNLocator(integer=True, nbins="auto", min_n_ticks=1)
    )
    g_plans.axvline(1.0, color="grey", linestyle="dashed")

    out_file = output_dir / f"{workload.lower()}-cardinality-distortion-{suffix}.pdf"
    fig.tight_layout()
    fig.savefig(out_file)
    plt.close(fig)  # release the figure from pyplot; the object itself stays usable
    return fig


def count_jump_backs(plan_idxs: pd.Series) -> int:
    """Count how often the sequence switches back to an already used plan.

    A jump-back is an entry that differs from its predecessor but has
    occurred earlier in the sequence.

    Args:
        plan_idxs: The plan indexes in the order in which they were selected.

    Returns:
        The number of jump-backs.
    """
    visited = set()
    previous = 0
    n_jumps = 0
    for current in plan_idxs:
        plan_changed = current != previous
        if plan_changed and current in visited:
            n_jumps += 1
        visited.add(current)
        previous = current
    return n_jumps

In [None]:
def generate_all_plots(
    df: pd.DataFrame | None, *, benchmark: workloads.Workload
) -> dict[str, figure.Figure]:
    """Create the distortion plot for every query of a benchmark.

    Args:
        df: The results as returned by ``load_results``.
        benchmark: The workload that provides the query labels and the name
            used for the plots.

    Returns:
        A mapping from query label to its plot. The mapping is empty if *df*
        is ``None``.
    """
    plots = {}
    if df is None:
        return plots

    for query_label in benchmark.labels():
        plots[query_label] = make_distortion_plot(
            df, label=query_label, workload=benchmark.name, suffix=query_label
        )
    return plots


def make_plan_eval_df(df: pd.DataFrame | None) -> Optional[pd.DataFrame]:
    if df is None:
        return None

    plan_selection_df: pd.DataFrame = (
        df.groupby("label", as_index=False, observed=True)
        .agg(
            n_plans=pd.NamedAgg(
                column="plan_idx", aggfunc=lambda plans: len(set(plans))
            ),
            n_jump_backs=pd.NamedAgg(column="plan_idx", aggfunc=count_jump_backs),
        )
        .pipe(label_sort)
    )
    plan_selection_df["n_steps"] = (
        plan_selection_df["n_plans"] - plan_selection_df["n_jump_backs"] - 1
    )
    return plan_selection_df


def select_evolution_replacement(
    df: pd.DataFrame | None, *, plots: dict
) -> Optional[figure.Figure]:
    """Pick the plot of the jump-free query with the most plan changes.

    Args:
        df: The plan statistics as returned by ``make_plan_eval_df``.
        plots: Mapping from query label to its plot.

    Returns:
        The plot of the query that has ``n_jump_backs == 0`` and the highest
        ``n_steps`` (the first one on ties). ``None`` if *df* is ``None`` or
        no query without jump-backs exists.

    Raises:
        KeyError: If *plots* has no entry for the selected label.
    """
    if df is None:
        return None

    evolution_plans = df.query("n_jump_backs == 0")
    if evolution_plans.empty:
        # idxmax raises on an empty selection; there is simply no candidate.
        return None

    most_evolutions = evolution_plans["n_steps"].idxmax()
    selected = evolution_plans.loc[most_evolutions, "label"]
    return plots[selected]


def select_jitter_replacement(
    df: pd.DataFrame | None, *, plots: dict
) -> Optional[figure.Figure]:
    """Pick the plot of the query with the most jump-backs.

    Args:
        df: The plan statistics as returned by ``make_plan_eval_df``.
        plots: Mapping from query label to its plot.

    Returns:
        The plot of the query with the highest ``n_jump_backs`` (the first one
        on ties). ``None`` if *df* is ``None`` or empty.

    Raises:
        KeyError: If *plots* has no entry for the selected label.
    """
    if df is None or df.empty:
        return None

    # The frame has been re-sorted, so its index labels do not necessarily
    # match the row positions. idxmax returns an index label, which is what
    # the label-based .loc lookup expects.
    most_jumps = df["n_jump_backs"].idxmax()
    selected = df.loc[most_jumps, "label"]
    return plots[selected]


def make_summary(df: pd.DataFrame | None) -> Optional[pd.Series]:
    if df is None:
        return None

    same_plan = df.query("n_plans == 1")["label"].count()
    evolution = df.query("n_plans > 1 & n_jump_backs == 0")["label"].count()
    jumps = df["label"].count() - same_plan - evolution
    return pd.Series(
        [same_plan, evolution, jumps], index=["Same plan", "Evolution", "Jumps"]
    )

In [None]:
# The two workloads whose labels drive the loading and plotting below.
job = workloads.job()
stats = workloads.stats()

In [None]:
# Each frame is None if the corresponding CSV file does not exist in results_base.
job_df = load_results("card-distortion-job.csv", workload=job)
stats_df = load_results("card-distortion-stats.csv", workload=stats)

In [None]:
# Maps each query label to its distortion plot (empty dict if no results were
# loaded). As a side effect, every plot is written to output_dir as a PDF.
job_plots = generate_all_plots(job_df, benchmark=job)
stats_plots = generate_all_plots(stats_df, benchmark=stats)

In [None]:
# Per-query plan statistics (n_plans, n_jump_backs, n_steps); None if the JOB results are missing.
job_plan_eval = make_plan_eval_df(job_df)

# Suggested replacement plots

These are the plots that might be the best replacements for the original plots based on the current hardware. See the README
for the motivation behind this strategy.

Please note that these plots are selected automatically using a coarse heuristic. In contrast, the plots in the original
paper were selected semi-automatically, using the same statistics but with human oversight. If one of the plots below
differs significantly from its counterpart in the paper, please take a look at the plots of the remaining queries: the
heuristic may simply have selected a poor replacement plot.

In [None]:
# Shows the plot of the JOB query without jump-backs that has the highest n_steps.
select_evolution_replacement(job_plan_eval, plots=job_plots)

In [None]:
# Intended to show the plot of the JOB query with the most jump-backs.
select_jitter_replacement(job_plan_eval, plots=job_plots)

# Aggregated statistics

The numbers in this section correspond to the bullet list in Section 4.1 of the original paper.

In [None]:
# Number of JOB queries per category: "Same plan", "Evolution" and "Jumps".
make_summary(job_plan_eval)