In [None]:
from pathlib import Path

import pandas as pd

from analytics.app.data.load import list_pipelines
from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time

%load_ext autoreload
%autoreload 2

In [None]:
# INPUTS
# `drift_pipeline` is meant to switch between two sets of pipeline logs.
# NOTE(review): both branches currently resolve to the same directory, so the
# flag has no effect — confirm the intended path for the non-drift case.
drift_pipeline = False
if drift_pipeline:
    pipelines_dir = Path(
        "/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static"
    )
else:
    pipelines_dir = Path(
        "/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static"
    )
# NOTE(review): absolute, user-specific paths — adjust them before running on another machine.
output_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.data/plots/triggering")

# Fail early and name the missing directory instead of raising a bare AssertionError.
assert pipelines_dir.exists(), f"pipelines_dir does not exist: {pipelines_dir}"
assert output_dir.exists(), f"output_dir does not exist: {output_dir}"

In [None]:
# Discover the pipelines stored under `pipelines_dir`; the result is indexed by pipeline id.
pipelines = list_pipelines(pipelines_dir)
# Largest pipeline id; used further down to zero-pad ids to a common width.
max_pipeline_id = max(pipelines)
pipelines

In [None]:
from analytics.app.data.load import load_pipeline_logs

# Load the logs of every discovered pipeline up front, keyed by pipeline id.
pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in pipelines}

In [None]:
# Parameters of this analysis run.

# Pipeline whose logs are analysed below.
pipeline_id = 107  # yb drift mmd 0.06 250 4d

# Filters selecting which evaluation results end up in the plot.
dataset_id = "yearbook_test"
eval_handler = "periodic-delta+-1y"
metric = "Accuracy"

# Apply the yearbook time patching to the timestamp columns.
patch_yearbook = True

# Composite model: `composite_model_variant` names the boolean column used to pick
# its rows; those rows are only added to the data when `include_composite_model` is True.
include_composite_model = False
composite_model_variant = "currently_active_model"

# Wrangle data

In [None]:
# Logs of the selected pipeline plus a reference label of the form
# "<zero-padded id> - <first entry of the pipeline record>" (presumably the pipeline name).
pipeline_log = pipeline_logs[pipeline_id]
pipeline_ref = f"{pipeline_id}".zfill(len(str(max_pipeline_id))) + f" - {pipelines[pipeline_id][0]}"

df_all = logs_dataframe(pipeline_log, pipeline_ref)

df_logs_models, _, df_eval_single = dfs_models_and_evals(
    # subtracting would interfere with yearbook patching
    pipeline_log,
    df_all["sample_time"].max(),
    pipeline_ref,
)

# Keep only the rows for the configured dataset, evaluation handler and metric.
# The explicit copy makes `df_adjusted` an independent frame, so the column
# assignment below does not write into a slice of `df_eval_single`
# (which would emit a SettingWithCopyWarning).
df_adjusted = df_eval_single[
    (df_eval_single["dataset_id"] == dataset_id)
    & (df_eval_single["eval_handler"] == eval_handler)
    & (df_eval_single["metric"] == metric)
].copy()

# in percent (0-100)
df_adjusted["value"] = df_adjusted["value"] * 100

In [None]:
# Inspect the filtered evaluation rows; `value` has been multiplied by 100 above.
df_adjusted

In [None]:
# Inspect the per-model frame returned by `dfs_models_and_evals`.
df_logs_models

In [None]:
if patch_yearbook:
    # Every (frame, time columns) pair that is passed through `patch_yearbook_time`.
    columns_to_patch = (
        (df_adjusted, ("interval_start", "interval_center", "interval_end")),
        (df_logs_models, ("train_start", "train_end", "real_train_end", "usage_start", "usage_end")),
    )
    for frame, time_columns in columns_to_patch:
        for time_column in time_columns:
            patch_yearbook_time(frame, time_column)

    # Correction for the -1 second in the timestamp format before patching:
    # move `usage_end` to the start of the following month
    # (december -> start of the next year).
    usage_end_month = df_logs_models["usage_end"].dt.to_period("M")
    df_logs_models["usage_end"] = (usage_end_month + 1).dt.to_timestamp()

df_logs_models

In [None]:
# Order the evaluation rows by the center of their evaluation interval.
df_adjusted = df_adjusted.sort_values("interval_center")
df_adjusted

In [None]:
# Add composite model

assert df_adjusted["pipeline_ref"].nunique() <= 1
# The pipeline composite series stitches together the performance of the different
# models, selecting the rows flagged by the boolean `composite_model_variant` column
# (i.e. depending on which model was active). It is tagged with model_idx / id_model 0.
# `assign` returns a new frame, so no values are written into a slice of `df_adjusted`.
pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]].assign(model_idx=0, id_model=0)

# Display label per model index; index 0 is reserved for the composite model.
label_map = {k: f"{k}" for k, _ in df_adjusted[["model_idx", "id_model"]].values}
label_map[0] = "Pipeline composite model"

# Without the composite model `df_adjusted` is used unchanged.
if include_composite_model:
    df_adjusted = pd.concat([pipeline_composite_model, df_adjusted])

# Create Plot

In [None]:
# Reduce each interval center to the text before its first "-"
# (presumably the year of a YYYY-MM-DD timestamp — TODO confirm).
interval_center_text = df_adjusted["interval_center"].astype(str)
df_adjusted["interval_center"] = interval_center_text.str.split("-").str[0]
df_adjusted

In [None]:
# Map every model index to the year of its `real_train_end` timestamp.
# `assign` builds a new frame from the column selection, avoiding an in-place write
# into a slice of `df_logs_models` (which would emit a SettingWithCopyWarning).
df_train_end_years_per_model = df_logs_models[["model_idx", "real_train_end"]].assign(
    real_train_end=lambda models: models["real_train_end"].dt.year
)
df_train_end_years_per_model

In [None]:
# Attach the training-end year of the evaluated model to every evaluation row.
# A left join keeps all evaluation rows, even when no model entry matches.
df_merged = pd.merge(
    df_adjusted,
    df_train_end_years_per_model,
    how="left",
    on="model_idx",
)
df_merged

In [None]:
# Heatmap matrix: one row per `real_train_end`, one column per `interval_center`,
# cells hold the metric `value`.
heatmap_data = df_merged.pivot(
    values="value",
    index=["real_train_end"],
    columns="interval_center",
)

In [None]:
# Smallest and largest `real_train_end` value among the heatmap rows.
heatmap_data.index.min(), heatmap_data.index.max()

In [None]:
# All row labels of the heatmap (the distinct `real_train_end` values).
heatmap_data.index

In [None]:
from analytics.plotting.common.heatmap import build_heatmap
from analytics.plotting.common.save import save_plot

# One y tick per heatmap row, positioned at row offset + 0.5 and labelled with the
# row's index value (the training-end year).
y_tick_labels = [(row_pos + 0.5, str(train_end)) for row_pos, train_end in enumerate(heatmap_data.index)]

# NOTE(review): the title states "4y" windows and threshold 0.07, while the comment next
# to `pipeline_id` reads "mmd 0.06 250 4d" — confirm which values are correct.
# Alternatives previously tried here: grid_alpha=0.4, disable_horizontal_grid=True, cbar=False.
fig = build_heatmap(
    heatmap_data,
    df_logs_models=df_logs_models,
    title_label="Yearbook 4y Drift Windows: Static MMD Threshold=0.07",
    x_label="Evaluation Year",
    y_label="Trained up to",
    color_label="Accuracy %",
    x_ticks=[1950, 1975, 2000],
    y_custom_ticks=y_tick_labels,
    reverse_col=True,
    width_factor=1,
    height_factor=0.55,
    grid_alpha=0.0,
)
# NOTE(review): `output_dir` from the inputs cell is not passed here — verify where
# `save_plot` writes the figure.
save_plot(fig, "yb_trigger_heatmap_drift_single_static")