In [None]:
from pathlib import Path

import pandas as pd

from analytics.app.data.load import list_pipelines, load_pipeline_logs
from analytics.app.data.transform import (
    df_aggregate_eval_metric,
    dfs_models_and_evals,
    pipeline_leaf_times_df,
)
from analytics.plotting.common.save import save_plot
from analytics.plotting.common.tradeoff_scatterplot import plot_tradeoff_scatter
from modyn.supervisor.internal.grpc.enums import PipelineStage
from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs

%load_ext autoreload
%autoreload 2

In [None]:
# Base directory holding the arxiv triggering experiment logs.
# Adjust this single line when running the notebook on another machine.
ARXIV_TRIGGERING_DIR = Path("/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv")

# One sub-directory per experiment family; each is scanned for pipelines below.
pipelines_dirs = [
    ARXIV_TRIGGERING_DIR / "10_baselines_time",
    ARXIV_TRIGGERING_DIR / "11_baselines_amount",
    ARXIV_TRIGGERING_DIR / "21_datadrift_dynamic",  # TODO
    ARXIV_TRIGGERING_DIR / "30_performance",
]

# Accumulated over all directories in `pipelines_dirs`:
#   pipeline_logs: pipeline id -> logs loaded for that pipeline
#   pipelines:     pipeline id -> (pipeline name, path)
pipeline_logs: dict[int, PipelineLogs] = {}
pipelines: dict[int, tuple[str, Path]] = {}

for pipelines_dir in pipelines_dirs:
    # Fail fast with a readable message *before* trying to read from the directory.
    assert pipelines_dir.exists(), f"Pipeline directory does not exist: {pipelines_dir}"

    print("Reading", pipelines_dir)
    dir_pipelines = list_pipelines(pipelines_dir)
    print(dir_pipelines)

    pipelines.update(dir_pipelines)
    print(pipelines)

    # Load the logs of every pipeline found in this directory, keyed by pipeline id.
    pipeline_logs.update({p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in dir_pipelines})

In [None]:
# --- Evaluation slice analysed in this notebook ---
dataset_id = "arxiv_kaggle_test"
eval_handler = "periodic-current"
metric = "Accuracy"

# --- Composite model selection ---
# Name of the boolean column used further down to keep only composite-model rows.
# Original note: doesn't do anything unless include_composite_model = True
# NOTE(review): `include_composite_model` is not read by any cell visible in this notebook.
include_composite_model = True
composite_model_variant = "currently_active_model"

# NOTE(review): not read by any cell visible in this notebook.
patch_yearbook = True

In [None]:
# Hand-picked (pipeline id, pipeline name) pairs that enter the tradeoff plots.
# Only the ids are used; the names document what each id refers to.
# Commented-out entries are deliberately excluded.
selected_pipelines: list[tuple[int, str]] = [
    (263, "timetrigger_5y"),
    (265, "timetrigger_10y"),
    # (267, 'timetrigger_26w'),
    (269, "timetrigger_2y"),
    (272, "timetrigger_1y"),
    # (264, 'dataamount_1000000'),
    (266, "dataamount_50000"),
    # (268, 'dataamount_500000'),
    (270, "dataamount_25000"),
    (271, "dataamount_100000"),
    (782, "drifttrigger_mmd-quant-0.05-20_int20000_win1y"),
    (783, "drifttrigger_mmd-rollavg-0.5-20_int20000_win1y"),
    (784, "drifttrigger_mmd-rollavg-5.0-20_int20000_win1y"),
    (785, "drifttrigger_mmd-quant-0.15-20_int20000_win1y"),
    (786, "drifttrigger_mmd-rollavg-0.2-20_int20000_win1y"),
    (787, "drifttrigger_mmd-quant-0.1-20_int20000_win1y"),
    (788, "drifttrigger_mmd-rollavg-1.0-20_int20000_win1y"),
    (789, "drifttrigger_mmd-quant-0.3-20_int20000_win1y"),
    (790, "drifttrigger_mmd-rollavg-2.0-20_int20000_win1y"),
    (674, "performancetrigger_static-0.45-int20000"),
    (675, "performancetrigger_dynamic-quant-0.05-20-int20000"),
    (676, "performancetrigger_dynamic-rollavg-0.3-20-int20000"),
    (677, "performancetrigger_num_misclass-100000-exp-0.6-red-False--int20000"),
    (678, "performancetrigger_dynamic-rollavg-0.2-20-int20000"),
    (679, "performancetrigger_dynamic-rollavg-0.1-20-int20000"),
    (680, "performancetrigger_static-0.5-int20000"),
    (681, "performancetrigger_dynamic-quant-0.15-20-int20000"),
    (682, "performancetrigger_num_misclass-50000-exp-0.6-red-False--int20000"),
    (723, "performancetrigger_num_misclass-30000-exp-0.6-red-False--int20000"),
    (756, "performancetrigger_num_misclass-15000-exp-0.6-red-False--int20000"),
    (762, "performancetrigger_num_misclass-10000-exp-0.6-red-False--int20000"),
]
pipeline_ids = [p_id for p_id, _ in selected_pipelines]

# Later cells index `pipeline_logs` / `pipelines` by these ids; report missing ones here
# with a readable message instead of a bare KeyError further down.
missing_pipeline_ids = [p_id for p_id in pipeline_ids if p_id not in pipelines]
assert not missing_pipeline_ids, f"Selected pipelines were not found in the loaded directories: {missing_pipeline_ids}"

# Show the names under which the selected ids were actually loaded.
[(p_id, pname) for p_id, (pname, _) in pipelines.items() if p_id in pipeline_ids]

# Wrangle data

In [None]:
# Per-pipeline frames are collected in lists and concatenated once after the loop.
df_leaf_list: list[pd.DataFrame] = []
list_df_eval_single: list[pd.DataFrame] = []

for pipeline_id in pipeline_ids:
    logs = pipeline_logs[pipeline_id]

    # Leaf-stage timing rows of this pipeline, tagged with the pipeline id.
    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)
    df_leaf_single["pipeline_id"] = pipeline_id
    df_leaf_list.append(df_leaf_single)

    # Only the third frame returned by the helper is kept.
    latest_sample_time = df_leaf_single["sample_time"].max()
    pipeline_name = pipelines[pipeline_id][0]
    _, _, df_eval_single = dfs_models_and_evals(logs, latest_sample_time, pipeline_name)
    df_eval_single["pipeline_id"] = pipeline_id
    list_df_eval_single.append(df_eval_single)

df_adjusted = pd.concat(list_df_eval_single)
df_leaf = pd.concat(df_leaf_list)

df_leaf

In [None]:
# The leaf-time table must contain exactly these stage ids, no more and no fewer.
expected_stage_ids = {
    "INIT_CLUSTER_CONNECTION",
    "EVALUATE_TRIGGER_POLICY",
    "INFORM_SELECTOR_ABOUT_TRIGGER",
    "INFORM_SELECTOR_REMAINING_DATA",
    "TRAIN",
    "TRAINING_COMPLETED",
    "STORE_TRAINED_MODEL",
    "EVALUATE",
    "DONE",
}

observed_stage_ids = df_leaf["id"].unique()
print(observed_stage_ids)
assert set(observed_stage_ids) == expected_stage_ids

In [None]:
# Keep only the rows of the configured dataset / evaluation handler / metric.
matches_eval_slice = (
    (df_adjusted["dataset_id"] == dataset_id)
    & (df_adjusted["eval_handler"] == eval_handler)
    & (df_adjusted["metric"] == metric)
)

# in percent (0-100)
# `.assign` returns a new frame, so the scaling is not written into a filtered slice
# of the previous frame (which would raise pandas' chained-assignment warning).
# NOTE: this cell overwrites `df_adjusted`; re-running it alone scales the values again.
df_adjusted = df_adjusted[matches_eval_slice].assign(value=lambda frame: frame["value"] * 100)
df_adjusted

In [None]:
# Order the evaluation rows by the center of their evaluation interval.
df_adjusted = df_adjusted.sort_values("interval_center")

In [None]:
# Keep only the rows flagged in the boolean column named by `composite_model_variant`.
is_composite_row = df_adjusted[composite_model_variant]
df_adjusted = df_adjusted[is_composite_row]

# After filtering, the flag column should only contain truthy values.
df_adjusted[composite_model_variant].unique()

In [None]:
# Restrict the evaluation range to the part where every pipeline has evaluations:
# take each pipeline's earliest interval center and cut at the latest of those.
active_rows = df_adjusted[df_adjusted[composite_model_variant]]
min_active_eval_center_per_pipeline = active_rows.groupby("pipeline_ref")["interval_center"].min()

maximum_min = pd.to_datetime(min_active_eval_center_per_pipeline).max()
print(maximum_min, min_active_eval_center_per_pipeline)

# Sanity check: the common start must lie before 2006.
assert maximum_min < pd.to_datetime("2006-01-01")

interval_centers = pd.to_datetime(df_adjusted["interval_center"])
df_adjusted = df_adjusted[interval_centers >= maximum_min]
df_adjusted["interval_center"].unique()

In [None]:
# Replace each interval center by the text before its first "-"
# (presumably the year of an ISO-formatted date; confirm against the data).
# `.assign` builds a new frame instead of writing into the filtered slice produced by
# the preceding cells, which avoids pandas' chained-assignment warning.
df_adjusted = df_adjusted.assign(
    interval_center=lambda frame: frame["interval_center"].astype(str).str.split("-").str[0]
)

In [None]:
# Display the evaluation frame after the filtering and interval-center rewrite above.
df_adjusted

In [None]:
# Collapse the "value" column into one mean ("metric_value") per
# (pipeline_id, pipeline_ref, metric) group.
mean_accuracies = df_aggregate_eval_metric(
    df_adjusted,
    in_col="value",
    out_col="metric_value",
    aggregate_func="mean",
    group_by=["pipeline_id", "pipeline_ref", "metric"],
)
mean_accuracies

In [None]:
# Rows of the TRAIN stage only (the following cell recomputes this and applies the cutoff).
df_triggers = df_leaf.loc[df_leaf["id"].eq(PipelineStage.TRAIN.name)]

In [None]:
# TRAIN-stage rows whose sample time lies strictly after the common evaluation start.
train_rows = df_leaf[df_leaf["id"] == PipelineStage.TRAIN.name]
df_triggers = train_rows[train_rows["sample_time"] > maximum_min]
df_triggers

In [None]:
# Per pipeline: number of TRAIN rows after `maximum_min` and their summed duration.
# NOTE(review): a pipeline without any TRAIN row after the cutoff has no group here
# and is therefore absent from the result; confirm this is intended.
trigger_stats = df_triggers.groupby("pipeline_id").agg(
    count=("id", "count"),
    sum_duration=("duration", "sum"),
)

# before the cutoff there was one trigger (equivalent to start of our reduced dataset): +1
num_triggers = trigger_stats.assign(count=trigger_stats["count"] + 1)
num_triggers

In [None]:
# Display the per-pipeline trigger counts and summed durations again.
num_triggers

In [None]:
# Join trigger statistics with the mean metric per pipeline.
merged = pd.merge(num_triggers, mean_accuracies, how="inner", on="pipeline_id")

# Every pipeline with trigger statistics must have found a metric row.
assert len(merged) == len(num_triggers)
merged

In [None]:
def create_type(x: str) -> str:
    """Classify a pipeline reference name into a coarse trigger family.

    The pipeline names used in this notebook start with their trigger family
    (``timetrigger_…``, ``dataamount_…``, ``drifttrigger_…``,
    ``performancetrigger_…``); those prefixes are matched explicitly. The older
    ``"year"`` / ``"samples"`` keywords are still recognised.

    Args:
        x: Pipeline reference name.

    Returns:
        One of ``"time"``, ``"amount"``, ``"drift"``, ``"performance"``,
        ``"cost"`` or ``"unknown"``.
    """
    if "year" in x or "timetrigger" in x:
        return "time"
    if "samples" in x or "dataamount" in x:
        return "amount"
    if "drift" in x:
        return "drift"
    if "performancetrigger" in x:
        return "performance"
    if "costtrigger" in x:
        return "cost"
    # Legacy fallback kept for backward compatibility: any remaining name containing
    # the letter "d" was historically classified as a drift pipeline.
    if "d" in x:
        return "drift"
    return "unknown"


# Derive the coarse trigger family from each pipeline name.
merged["type"] = merged["pipeline_ref"].map(create_type)
merged

In [None]:
def _trigger_subtype(pipeline_ref: str) -> str:
    """Return the "Trigger SubType" label for a pipeline name ("unknown" if unmatched)."""
    if "dataamount" in pipeline_ref:
        return "DataAmount"
    if "time" in pipeline_ref:
        return "Time"
    if "drift" in pipeline_ref:
        if "_mmd-0" in pipeline_ref:
            return "Static"
        if "quant" in pipeline_ref:
            return "Quantile"
        if "roll" in pipeline_ref:
            return "Rolling Avg."
        return "unknown"
    if "performancetrigger" in pipeline_ref:
        if "static" in pipeline_ref:
            return "Static"
        if "quant" in pipeline_ref:
            return "Quantile"
        if "roll" in pipeline_ref:
            return "Rolling Avg."
        if "num_misclass" in pipeline_ref:
            return "AvoidableMisclass"
        return "unknown"
    if "data_inc" in pipeline_ref:
        return "DataIncorporationLatency"
    if "avoidable" in pipeline_ref:
        return "AvoidableMisclass"
    return "unknown"


def _trigger_type(pipeline_ref: str) -> str:
    """Return the "Trigger Type" label for a pipeline name ("unknown" if unmatched)."""
    if "dataamount" in pipeline_ref:
        return "Simple"
    if "time" in pipeline_ref:
        return "Simple"
    if "drift" in pipeline_ref:
        return "DataDrift"
    if "performancetrigger" in pipeline_ref:
        return "Performance"
    if "costtrigger" in pipeline_ref:
        return "Cost"
    return "unknown"


# Work on a copy so `merged` stays untouched.
# (To plot only a subset, filter `merged` by "pipeline_id" before copying.)
renamed = merged.copy()
renamed["Trigger SubType"] = renamed["pipeline_ref"].apply(_trigger_subtype)
renamed["Trigger Type"] = renamed["pipeline_ref"].apply(_trigger_type)

# Labels that must not occur: unmatched names, DataIncorporationLatency, and cost triggers.
for column, forbidden_label in [
    ("Trigger Type", "unknown"),
    ("Trigger SubType", "unknown"),
    ("Trigger SubType", "DataIncorporationLatency"),
    ("Trigger Type", "Cost"),
]:
    assert not renamed[column].str.contains(forbidden_label).any()

# Ordered categoricals fix both the sort order below and the category order of the labels.
category_orders = {
    "Trigger Type": ["Simple", "DataDrift", "Performance"],
    "Trigger SubType": ["DataAmount", "Time", "Static", "Quantile", "Rolling Avg.", "AvoidableMisclass"],
}
for column, categories in category_orders.items():
    renamed[column] = pd.Categorical(renamed[column], categories=categories, ordered=True)

renamed = renamed.sort_values(by=["Trigger Type", "Trigger SubType", "pipeline_id"])

In [None]:
# Tradeoff: number of triggers (x) against mean accuracy (y), one point per pipeline.
fig = plot_tradeoff_scatter(
    renamed,
    x="count",
    x_label="Number of Triggers",
    y="metric_value",
    y_label="Mean Accuracy %",
    hue="Trigger Type",
    style="Trigger SubType",
    width_factor=0.8,
    height_factor=0.8,
    legend_ncol=2,
    manual_legend_title=False,
)

save_plot(fig, "_all_tradeoff_arxiv_triggers_performance")

In [None]:
# Same frame with "sum_duration" divided by 60 for the minutes axis below.
in_minutes = renamed.assign(sum_duration=renamed["sum_duration"] / 60)

# Tradeoff: summed duration (x) against mean accuracy (y), one point per pipeline.
fig = plot_tradeoff_scatter(
    in_minutes,
    x="sum_duration",
    x_label="Total Cost (Minutes)",
    y="metric_value",
    y_label="Mean Accuracy %",
    hue="Trigger Type",
    style="Trigger SubType",
    width_factor=0.8,
    height_factor=0.7,
    legend_ncol=2,
    manual_legend_title=False,
)

# save_plot(fig, "tradeoff_drift_yearbook_cost_performance")

In [None]:
# Relation between number of triggers (x) and summed duration (y), one point per pipeline.
fig = plot_tradeoff_scatter(
    renamed,
    x="count",
    x_label="Number of Triggers",
    y="sum_duration",
    y_label="Total Cost (seconds)",
    hue="Trigger Type",
    style="Trigger SubType",
    width_factor=1.8,
    height_factor=1.5,
)

# save_plot(fig, "tradeoff_drift_yearbook_triggers_cost")