In [None]:
from pathlib import Path

import pandas as pd

from analytics.app.data.load import list_pipelines, load_pipeline_logs
from analytics.app.data.transform import (
    dfs_models_and_evals,
    patch_yearbook_time,
    pipeline_leaf_times_df,
)
from analytics.plotting.common.heatmap import build_heatmap
from analytics.plotting.common.save import save_plot
from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs

%load_ext autoreload
%autoreload 2

In [None]:
# Root directory of the logged yearbook triggering experiments.
# NOTE(review): absolute, machine-specific path; this is the single place to change it.
yearbook_logs_root = Path("/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook")
pipelines_dirs = [
    yearbook_logs_root / "20_datadrift_static",
    yearbook_logs_root / "21_datadrift_dynamic",
]

# Both mappings are keyed by pipeline id and accumulate the content of every directory above.
pipeline_logs: dict[int, PipelineLogs] = {}
pipelines: dict[int, tuple[str, Path]] = {}

for pipelines_dir in pipelines_dirs:
    # Check the directory *before* anything is listed or loaded from it.
    assert pipelines_dir.exists(), f"Pipeline log directory not found: {pipelines_dir}"

    dir_pipelines = list_pipelines(pipelines_dir)
    pipelines.update(dir_pipelines)
    pipeline_logs.update({p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in dir_pipelines})

# Show once what was discovered across all directories.
print(pipelines)

In [None]:
# --- Which evaluation results to look at --------------------------------------
# The evaluation frame is filtered on exactly these three values further down.
dataset_id = "yearbook_test"
eval_handler = "periodic-delta+-1y"
metric = "Accuracy"

# When True, the time columns are passed through `patch_yearbook_time` further down.
patch_yearbook = True

# --- Composite model settings --------------------------------------------------
# Name of the column used further down as a row mask to keep only composite-model rows.
composite_model_variant = "currently_active_model"
# NOTE(review): this flag is not read anywhere else in the visible notebook.
include_composite_model = False

# Ids of all pipelines found while loading the logs.
pipeline_ids = [*pipelines]

# Wrangle data

In [None]:
# Per pipeline: build the model-log frame and the evaluation frame, tag both with the
# pipeline id, and collect them so they can be concatenated once after the loop.
list_df_eval_single: list[pd.DataFrame] = []
df_logs_models_list: list[pd.DataFrame] = []

for pipeline_id in pipeline_ids:
    logs = pipeline_logs[pipeline_id]
    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=False, pipeline_id=pipeline_id)
    max_sample_time = df_leaf_single["sample_time"].max()
    df_logs_models_single, _, df_eval_single = dfs_models_and_evals(logs, max_sample_time, pipelines[pipeline_id][0])

    for frame, collected in (
        (df_eval_single, list_df_eval_single),
        (df_logs_models_single, df_logs_models_list),
    ):
        frame["pipeline_id"] = pipeline_id
        collected.append(frame)

df_adjusted = pd.concat(list_df_eval_single)
df_logs_models = pd.concat(df_logs_models_list)

In [None]:
# Narrow the evaluation results down to the configured dataset, evaluation handler and metric.
matches_dataset = df_adjusted["dataset_id"].eq(dataset_id)
matches_handler = df_adjusted["eval_handler"].eq(eval_handler)
matches_metric = df_adjusted["metric"].eq(metric)
df_adjusted = df_adjusted[matches_dataset & matches_handler & matches_metric]

In [None]:
if patch_yearbook:
    # Time columns of the evaluation frame and of the model-log frame that get patched.
    eval_time_columns = ("interval_start", "interval_center", "interval_end")
    model_time_columns = ("train_start", "train_end", "real_train_end", "usage_start", "usage_end")

    for frame, time_columns in ((df_adjusted, eval_time_columns), (df_logs_models, model_time_columns)):
        for column in time_columns:
            patch_yearbook_time(frame, column)

    # Move `usage_end` to the start of the following month. Per the original note, the
    # -1 second in the timestamp format leaves it in December, so this lands on the
    # start of the next year.
    usage_end_month = df_logs_models["usage_end"].dt.to_period("M")
    df_logs_models["usage_end"] = (usage_end_month + 1).dt.to_timestamp()

df_logs_models

In [None]:
# Order the evaluation rows by interval center and show how many rows there are.
df_adjusted = df_adjusted.sort_values("interval_center")
df_adjusted.shape[0]

In [None]:
# Reduce to composite models: the column named by `composite_model_variant` serves as the
# row mask. The cell shows the number of rows that remain.
composite_rows = df_adjusted[composite_model_variant]
df_adjusted = df_adjusted[composite_rows]
df_adjusted.shape[0]

In [None]:
# Keep only the part of `interval_center` before the first "-" of its string form
# (for date-like values that is the year).
df_adjusted = df_adjusted.assign(
    interval_center=df_adjusted["interval_center"].astype(str).str.split("-").str[0]
)

# Per (pipeline, model): the year in which training really ended.
# `.assign` returns a new frame, so nothing is written into a column selection of
# `df_logs_models`; that avoids pandas' SettingWithCopyWarning and leaves
# `df_logs_models` untouched.
df_train_end_years_per_model = df_logs_models[["pipeline_id", "model_idx", "real_train_end"]].assign(
    real_train_end=lambda frame: frame["real_train_end"].dt.year
)

In [None]:
# Sanity check: number of remaining evaluation rows per pipeline.
df_adjusted.groupby("pipeline_id").size()

In [None]:
# Scale the metric by 100 (plotted below as "Accuracy %").
# NOTE: not idempotent; re-running this cell multiplies the values by 100 again.
df_adjusted["value"] = df_adjusted["value"].mul(100)

# Window Sizes

In [None]:
# Pipelines compared in this section, in the row order wanted for the heatmap.
_pids = [117, 107, 95]

# Attach the training-end year per model, make the pipeline id an int, keep the selected pipelines.
df_merged = (
    df_adjusted.merge(df_train_end_years_per_model, on=["pipeline_id", "model_idx"], how="left")
    .astype({"pipeline_id": int})
    .loc[lambda merged: merged["pipeline_id"].isin(_pids)]
)

# Heatmap matrix: one row per pipeline, one column per interval center, cells hold `value`.
# `reindex` puts the rows into the order of `_pids`.
heatmap_data = df_merged.pivot(index=["pipeline_id"], columns="interval_center", values="value").reindex(_pids)

In [None]:
# `build_heatmap` and `save_plot` come from the notebook's import cell.

# Row labels for the heatmap: pipeline id -> window size.
pipelines_refs = {
    117: "10y",
    107: "4y",
    95: "1y",
}

# Per heatmap row (keyed by row position): the training/usage interval columns of that pipeline's models.
trigger_columns = ["train_start", "train_end", "usage_start", "usage_end"]
trigger_intervals = {
    i: df_logs_models.loc[df_logs_models["pipeline_id"] == p_id, trigger_columns]
    for i, p_id in enumerate(heatmap_data.index)
}

fig = build_heatmap(
    heatmap_data,
    reverse_col=True,
    x_ticks=[1950, 1975, 2000],
    # Place each label at the vertical center of its row.
    y_custom_ticks=[(i + 0.5, pipelines_refs[y]) for i, y in enumerate(heatmap_data.index)],
    y_label="Pipeline with\nWindow Size",
    x_label="Evaluation Year",
    title_label="Yearbook Composite Models: Drift Window Sizes (MMD=0.07)",
    color_label="Accuracy %",
    width_factor=1,
    height_factor=0.38,
    # grid_alpha=0.4,
    grid_alpha=0.0,
    # disable_horizontal_grid=True,
    # cbar=False,
    triggers=trigger_intervals,
)
save_plot(fig, "yb_trigger_heatmap_drift_multi_static_window_size")

# Static Thresholds

In [None]:
# Pipelines compared in this section; the reversed list gives the row order of the heatmap.
_pids = [113, 112, 107, 109, 85][::-1]

# Attach the training-end year per model, make the pipeline id an int, keep the selected pipelines.
df_merged = (
    df_adjusted.merge(df_train_end_years_per_model, on=["pipeline_id", "model_idx"], how="left")
    .astype({"pipeline_id": int})
    .loc[lambda merged: merged["pipeline_id"].isin(_pids)]
)

# Heatmap matrix: one row per pipeline, one column per interval center, cells hold `value`.
# `reindex` puts the rows into the order of `_pids`.
heatmap_data = df_merged.pivot(index=["pipeline_id"], columns="interval_center", values="value").reindex(_pids)

In [None]:
# `build_heatmap` and `save_plot` come from the notebook's import cell.

# Row labels for the heatmap: pipeline id -> MMD threshold.
pipelines_refs = {
    113: "0.03",
    112: "0.05",
    107: "0.07",
    109: "0.09",
    85: "0.12",
}

# Per heatmap row (keyed by row position): the training/usage interval columns of that pipeline's models.
trigger_columns = ["train_start", "train_end", "usage_start", "usage_end"]
trigger_intervals = {
    i: df_logs_models.loc[df_logs_models["pipeline_id"] == p_id, trigger_columns]
    for i, p_id in enumerate(heatmap_data.index)
}

fig = build_heatmap(
    heatmap_data,
    reverse_col=True,
    x_ticks=[1950, 1975, 2000],
    # Place each label at the vertical center of its row.
    y_custom_ticks=[(i + 0.5, pipelines_refs[y]) for i, y in enumerate(heatmap_data.index)],
    y_label="MMD Threshold",
    x_label="Evaluation Year",
    title_label="Yearbook Composite Models: Static Drift Thresholds",
    color_label="Accuracy %",
    width_factor=1,
    height_factor=0.45,
    # grid_alpha=0.4,
    grid_alpha=0.0,
    # disable_horizontal_grid=True,
    # cbar=False,
    triggers=trigger_intervals,
)
save_plot(fig, "yb_trigger_heatmap_drift_multi_static_threshold")

# Dynamic Quantile / Rolling Average

In [None]:
# Pipelines compared in this section, in heatmap row order: the four "Δ" pipelines first,
# then the two "%" pipelines (345 is left out).
_pids = [372, 370, 369, 363, 353, 357]

# Attach the training-end year per model, make the pipeline id an int, keep the selected pipelines.
df_merged = (
    df_adjusted.merge(df_train_end_years_per_model, on=["pipeline_id", "model_idx"], how="left")
    .astype({"pipeline_id": int})
    .loc[lambda merged: merged["pipeline_id"].isin(_pids)]
)

# Heatmap matrix: one row per pipeline, one column per interval center, cells hold `value`.
# `reindex` puts the rows into the order of `_pids`.
heatmap_data = df_merged.pivot(index=["pipeline_id"], columns="interval_center", values="value").reindex(_pids)

In [None]:
# `build_heatmap` and `save_plot` come from the notebook's import cell.

# Row labels for the heatmap: pipeline id -> criterion label.
pipelines_refs = {
    # dyn quantile
    353: "% 0.05",
    # 345: "% 0.10",
    357: "% 0.15",
    # dyn roll. avg
    372: "Δ 2.0",
    370: "Δ 1.0",
    369: "Δ 0.5",
    363: "Δ 0.05",
}

# Per heatmap row (keyed by row position): the training/usage interval columns of that pipeline's models.
trigger_columns = ["train_start", "train_end", "usage_start", "usage_end"]
trigger_intervals = {
    i: df_logs_models.loc[df_logs_models["pipeline_id"] == p_id, trigger_columns]
    for i, p_id in enumerate(heatmap_data.index)
}

fig = build_heatmap(
    heatmap_data,
    reverse_col=True,
    x_ticks=[1950, 1975, 2000],
    # Place each label at the vertical center of its row.
    y_custom_ticks=[(i + 0.5, pipelines_refs[y]) for i, y in enumerate(heatmap_data.index)],
    y_label="Criterion",
    x_label="Evaluation Year",
    title_label="Yearbook Composite Models: Dynamic Drift Thresholds",
    color_label="Accuracy %",
    width_factor=1,
    height_factor=0.47,
    # grid_alpha=0.4,
    grid_alpha=0.0,
    # disable_horizontal_grid=True,
    # cbar=False,
    triggers=trigger_intervals,
)
save_plot(fig, "yb_trigger_heatmap_drift_multi_dynamic_thresholds")