In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from analytics.app.data.load import list_pipelines, load_pipeline_logs
from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time
from analytics.plotting.common.common import INIT_PLOT

%load_ext autoreload
%autoreload 2

In [None]:
# INPUTS

# Selects which set of aggregated pipeline logs is plotted: the "triggering" logs (True)
# or the "data_selection_50%" logs (False).
drift_pipeline = True

# NOTE: these are machine-specific absolute paths; adjust them to your local checkout
# before running the notebook.
if drift_pipeline:
    pipelines_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-sigmod-data/yearbook/triggering/logs_agg")
else:
    pipelines_dir = Path(
        "/Users/robinholzinger/robin/dev/eth/modyn-sigmod-data/yearbook/data_selection_50%/logs_agg_patch"
    )
output_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots")

# Fail early and name the offending path instead of raising a bare AssertionError.
assert pipelines_dir.exists(), f"pipelines_dir does not exist: {pipelines_dir}"
assert output_dir.exists(), f"output_dir does not exist: {output_dir}"

In [None]:
# Discover the pipelines stored under the configured log directory.
pipelines = list_pipelines(pipelines_dir)

# Largest pipeline id; used further down to zero-pad pipeline references to a common width.
max_pipeline_id = max(pipelines)

pipelines

In [None]:
# Load the logs of every listed pipeline, keyed by pipeline id.
# Only the ids are needed here, so iterate over the mapping's keys directly.
pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in pipelines}

In [None]:
# Quick check of the log object type for the pipeline id plotted below (38 for drift runs, 5 otherwise).
type(pipeline_logs[38 if drift_pipeline else 5])

In [None]:
# mode: which pipeline run to visualise (depends on the log set chosen in the inputs cell)
pipeline_id = 38 if drift_pipeline else 5

# evaluation slice that ends up in the heatmap
dataset_id = "yearbook_test"
eval_handler = "slidingmatrix"
metric = "Accuracy"

# whether the yearbook timestamps get patched before plotting
patch_yearbook = True

# composite model: the variant doesn't do anything unless include_composite_model = True
include_composite_model = False
composite_model_variant = "currently_active_model" if drift_pipeline else "currently_trained_model"

# Wrangle data

In [None]:
pipeline_log = pipeline_logs[pipeline_id]
# Human-readable reference: pipeline id zero-padded to the width of the largest id,
# followed by the first element of the pipeline's entry.
pipeline_ref = f"{pipeline_id}".zfill(len(str(max_pipeline_id))) + f" - {pipelines[pipeline_id][0]}"

df_all = logs_dataframe(pipeline_log, pipeline_ref)

df_logs_models, _, df_eval_single = dfs_models_and_evals(
    # subtracting would interfere with yearbook patching
    pipeline_log,
    df_all["sample_time"].max(),
    pipeline_ref,
)

# Keep only the evaluation rows for the configured dataset / handler / metric.
selection_mask = (
    (df_eval_single["dataset_id"] == dataset_id)
    & (df_eval_single["eval_handler"] == eval_handler)
    & (df_eval_single["metric"] == metric)
)

# Explicit copy: the columns of df_adjusted are modified below and in later cells, which on a
# filtered slice triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether
# df_eval_single is affected.
df_adjusted = df_eval_single.loc[selection_mask].copy()

# in percent (0-100)
df_adjusted["value"] = df_adjusted["value"] * 100

In [None]:
# Inspect the per-model frame returned by dfs_models_and_evals (before any yearbook patching).
df_logs_models

In [None]:
if patch_yearbook:
    # Time columns to patch, per frame. patch_yearbook_time's return value is not used,
    # so it presumably rewrites the given column in place -- TODO confirm.
    frames_and_time_columns = [
        (df_adjusted, ("interval_start", "interval_center", "interval_end")),
        (df_logs_models, ("train_start", "train_end", "real_train_end", "usage_start", "usage_end")),
    ]
    for frame, time_columns in frames_and_time_columns:
        for time_column in time_columns:
            patch_yearbook_time(frame, time_column)

    # correction for -1 second in timestamp format before patching:
    # december (because of -1 second in timestamp format) -> start of year
    usage_end_month = df_logs_models["usage_end"].dt.to_period("M")
    df_logs_models["usage_end"] = (usage_end_month + 1).dt.to_timestamp()

df_logs_models

In [None]:
# Order the evaluation rows chronologically by the centre of their evaluation interval.
df_adjusted = df_adjusted.sort_values("interval_center")

In [None]:
# Add composite model

# The composite series is only meaningful for a single pipeline.
assert df_adjusted["pipeline_ref"].nunique() <= 1

# add the pipeline time series which is the performance of different models stitched together dep.
# w.r.t which model was active
# `.assign` returns a new frame, so the filtered slice of df_adjusted is never written to
# (avoids pandas' SettingWithCopyWarning). Index 0 is reserved for the composite model.
pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]].assign(model_idx=0, id_model=0)

# Display labels keyed by model index; the paired id_model value is not needed for the label.
label_map = {model_idx: f"{model_idx}" for model_idx, _id_model in df_adjusted[["model_idx", "id_model"]].values}
label_map[0] = "Pipeline composite model"

# When the composite model is excluded, df_adjusted is used unchanged.
if include_composite_model:
    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])

# Dump Data backup

# Create Plot

In [None]:
# Reduce each interval centre to the text before its first "-" (the year, for date-like values).
interval_center_text = df_adjusted["interval_center"].astype(str)
df_adjusted["interval_center"] = interval_center_text.str.split("-").str[0]

In [None]:
# Per model: the calendar year in which its training data ended.
# `.assign` builds a new frame instead of writing into a column-subset slice of
# df_logs_models, which would trigger pandas' SettingWithCopyWarning.
df_train_end_years_per_model = df_logs_models[["model_idx", "real_train_end"]].assign(
    real_train_end=lambda frame: frame["real_train_end"].dt.year
)
df_train_end_years_per_model

In [None]:
# Attach each model's training-end year to its evaluation rows; the left join keeps every
# evaluation row even if no model entry matches.
df_merged = pd.merge(
    df_adjusted,
    df_train_end_years_per_model,
    how="left",
    on="model_idx",
)
df_merged

In [None]:
# build heatmap matrix dataframe:
# one row per `real_train_end` value, one column per `interval_center` value, cells hold `value`.
# NOTE: DataFrame.pivot raises if a (real_train_end, interval_center) pair occurs more than once.
heatmap_data = df_merged.pivot(index=["real_train_end"], columns="interval_center", values="value")

In [None]:
# Sanity check: smallest and largest row label (real_train_end) of the heatmap matrix.
heatmap_data.index.min(), heatmap_data.index.max()

In [None]:
# Create the heatmap
INIT_PLOT()
plt.rcParams["svg.fonttype"] = "none"

FONTSIZE = 20
DOUBLE_FIG_WIDTH = 10
DOUBLE_FIG_HEIGHT = 3.5
DOUBLE_FIG_SIZE = (DOUBLE_FIG_WIDTH, (1.5 if drift_pipeline else 2.2) * DOUBLE_FIG_HEIGHT)

# Year axis layout. Tick positions assume the first heatmap column (and, for the
# non-drift plot, the first row) corresponds to FIRST_YEAR with one cell per year.
FIRST_YEAR = 1930
LAST_YEAR = 2010
YEAR_TICK_STEP = 20
year_tick_labels = list(range(FIRST_YEAR, LAST_YEAR + 1, YEAR_TICK_STEP))
# +0.5 places every tick at the centre of its heatmap cell
year_tick_positions = [year - FIRST_YEAR + 0.5 for year in year_tick_labels]

fig = plt.figure(
    edgecolor="black",
    frameon=True,
    figsize=DOUBLE_FIG_SIZE,
    dpi=300,
)

ax = sns.heatmap(
    heatmap_data,
    cmap="RdBu_r",
    linewidths=0.0,
    linecolor="black",
    cbar=True,
    # color bar ticks in percent (0-100), matching the scaled accuracy values
    cbar_kws={
        "label": "Accuracy %",
        "ticks": [0, 25, 50, 75, 100],
        "orientation": "vertical",
    },
)
# Rasterize only the heatmap mesh; the remaining artists stay vector graphics.
ax.collections[0].set_rasterized(True)

# Adjust x-axis tick labels
plt.xlabel("Evaluation Year")
plt.xticks(ticks=year_tick_positions, labels=year_tick_labels, rotation=0)
ax.invert_yaxis()

# The non-drift heatmap gets the same year ticks on the y-axis;
# the drift heatmap keeps seaborn's default row labels.
if not drift_pipeline:
    plt.yticks(ticks=year_tick_positions, labels=year_tick_labels, rotation=0)
plt.ylabel("Trained up to")

# Draw training / usage boxes
if drift_pipeline:
    # "train" is drawn twice: solid first, then dotted again after the "usage" boxes,
    # so the dotted train outline ends up on top of the usage outline.
    for type_, dashed in [("train", False), ("usage", False), ("train", True)]:
        for _row_label, model_row in df_logs_models.iterrows():
            x_start = model_row[f"{type_}_start"].year - FIRST_YEAR
            x_end = model_row[f"{type_}_end"].year - FIRST_YEAR
            y = model_row["model_idx"]
            rect = plt.Rectangle(
                (x_start, y - 1),  # y: 0 based index, model_idx: 1 based index
                x_end - x_start,
                1,
                edgecolor="White" if type_ == "train" else "Black",
                facecolor="none",
                linewidth=3,
                linestyle="dotted" if dashed else "solid",
                hatch="/",
                joinstyle="bevel",
            )
            ax.add_patch(rect)

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# Debug aid: materialise the (index, row) pairs that the box-drawing loop iterates over.
[*df_logs_models.iterrows()]

In [None]:
# Write the figure once per format; the drift variant gets a "_trigger" suffix in its file name.
trigger_suffix = "_trigger" if drift_pipeline else ""
for img_type in ("png", "svg"):
    img_path = output_dir / f"yearbook_heatmap{trigger_suffix}.{img_type}"
    fig.savefig(img_path, bbox_inches="tight", transparent=True)