In [None]:
from pathlib import Path

import matplotlib.dates as mdates
import pandas as pd
from matplotlib.ticker import FixedFormatter, FixedLocator

from analytics.app.data.load import list_pipelines, load_pipeline_logs
from analytics.app.data.transform import pipeline_leaf_times_df
from analytics.plotting.common.cost_matrix import plot_cost_matrix
from analytics.plotting.common.save import save_plot
from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs

%load_ext autoreload
%autoreload 2

In [None]:
# Directories that are scanned for pipeline logs.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base directory.
pipelines_dirs = [
    Path("/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/30_performance/static_dyn")
]

# Both mappings are keyed by pipeline id and accumulate entries over all directories above.
pipeline_logs: dict[int, PipelineLogs] = {}
pipelines: dict[int, tuple[str, Path]] = {}

for pipelines_dir in pipelines_dirs:
    # Check the directory *before* reading from it, so a wrong path fails with a readable message.
    assert pipelines_dir.exists(), f"Pipeline log directory does not exist: {pipelines_dir}"

    dir_pipelines = list_pipelines(pipelines_dir)
    pipelines.update(dir_pipelines)
    pipeline_logs.update({p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in dir_pipelines})

# Show once which pipelines were discovered.
print(pipelines)

In [None]:
# ---- pipeline selection ----
# mode: time + amount
pipeline_ids = [639]  # performancetrigger_static-0.5-int1500y

# ---- evaluation settings ----
dataset_id = "huffpost_kaggle_test"
eval_handler = "periodic-current"
metric = "Accuracy"
patch_yearbook = True

# ---- composite model ----
include_composite_model = True
# has no effect while include_composite_model is False
composite_model_variant = "currently_active_model"

# Wrangle data

In [None]:
# One frame per selected pipeline, as returned by `pipeline_leaf_times_df`,
# stacked row-wise into a single frame.
leaf_frames = [
    pipeline_leaf_times_df(
        pipeline_logs[p_id],
        use_traintime_patch_at_trainer=True,
        pipeline_id=p_id,
    )
    for p_id in pipeline_ids
]

df_leaf = pd.concat(leaf_frames)
df_leaf

In [None]:
# A stage is identified by (pipeline, stage id).
stage_keys = ["pipeline_ref", "id"]

# Mean duration per stage, largest first.
avg_duration_per_stage = df_leaf.groupby(stage_keys)["duration"].mean().sort_values(ascending=False)

# Attach the per-stage mean as `duration_avg` and order the rows by it
# (coloring in order of decreasing avg. duration).
df_adjusted = (
    df_leaf.copy()
    .merge(avg_duration_per_stage, on=stage_keys, suffixes=("", "_avg"))
    .sort_values("duration_avg", ascending=False)
)

In [None]:
# `sample_time_year` is a plain copy of `sample_time`; `sample_time_year_bin` holds the
# integer index (labels=False) of the bucket each value falls into when cut into 10 bins.
df_adjusted = df_adjusted.assign(
    sample_time_year=df_adjusted["sample_time"],
    sample_time_year_bin=lambda frame: pd.cut(frame["sample_time_year"], bins=10, labels=False),
)

In [None]:
# Only these stage ids are kept for the plot.
plotted_stage_ids = [
    "TRAIN",
    "STORE_TRAINED_MODEL",
    "INFORM_SELECTOR_REMAINING_DATA",
    "INFORM_SELECTOR_ABOUT_TRIGGER",
    "EVALUATE_TRIGGER_POLICY",
]
# Columns carried over into the plotting frame.
plot_columns = ["pipeline_ref", "id", "sample_time_year", "duration"]

is_plotted_stage = df_adjusted["id"].isin(plotted_stage_ids)
df_new = df_adjusted.loc[is_plotted_stage, plot_columns].copy().sort_values("sample_time_year")

In [None]:
# Hand-written short labels for the two selector stages.
state_rename = {
    "INFORM_SELECTOR_REMAINING_DATA": "inform remaining data",
    "INFORM_SELECTOR_ABOUT_TRIGGER": "inform trigger",
}

# Apply the explicit renames first, then lower-case every id and turn underscores into spaces.
stage_labels = df_new["id"].replace(state_rename)
stage_labels = stage_labels.str.lower()
stage_labels = stage_labels.str.replace("_", " ")
df_new["id"] = stage_labels

In [None]:
# Display the frame that is passed to `plot_cost_matrix` in the next cell.
df_new

In [None]:
# Hand-picked x-axis ticks: the dates where a tick is placed and the two-line label shown at each.
x_tick_dates = ["2014-05-01", "2018-06-01", "2021-01-01"]
x_tick_labels = ["May\n2014", "Jun\n2018", "Jan\n2021"]
# FixedLocator needs numeric positions, so the dates are converted to matplotlib date numbers.
x_tick_positions = [mdates.date2num(pd.Timestamp(day)) for day in x_tick_dates]

fig = plot_cost_matrix(
    df_new,
    [639],  # same pipeline id as in `pipeline_ids`
    grid_alpha=0.75,
    title_map={
        639: "HuffPost Static PerformanceTrigger",
    },
    height_factor=0.7,
    width_factor=1.0,
    duration_ylabel="Duration (min)",
    cumulative_ylabel="Cumulative Duration (min)",
    x_date_locator=FixedLocator(x_tick_positions),
    x_date_formatter=FixedFormatter(x_tick_labels),
    x_lim=(pd.Timestamp("2012-01-01"), pd.Timestamp("2022-09-01")),
    y_ticks_cumulative=list(range(0, 110, 25)),  # 0, 25, 50, 75, 100
    y_lim_cumulative=(0, 100),
    y_minutes=True,
    y_minutes_cumulative=True,
)

save_plot(fig, "huffpost_performance-trigger-cost-matrix")

In [None]:
# Observation: policy evaluation costs are lower here than for the drift-based triggers.