In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from analytics.app.data.load import list_pipelines
from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe, patch_yearbook_time

%load_ext autoreload
%autoreload 2

In [None]:
# INPUTS

# NOTE(review): both paths are machine-specific absolute paths; point them at
# your own log directory / plot directory before running the notebook.
# Directory that list_pipelines / load_pipeline_logs read from.
pipelines_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-sigmod-data/yearbook/data_selection_50%/logs_agg_patch")
# Directory the final figure files are written to.
output_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots")

# Fail early, and say which directory is missing, instead of a bare AssertionError.
assert pipelines_dir.exists(), f"pipelines_dir does not exist: {pipelines_dir}"
assert output_dir.exists(), f"output_dir does not exist: {output_dir}"

In [None]:
def map_pipeline_names(pipeline_ref: str) -> str:
    """Translate a raw pipeline reference into the short label used in plots.

    The ``yearbook_yearbooknet_`` prefix and the ``_nosched_epoch5_warm2``
    suffix are removed when present. The remainder is then looked up in a
    fixed table of display names; a remainder that is not in the table is
    returned unchanged.
    """
    display_names = {
        "full": "Full",
        "rs2wo": "RS2",
        "grad_bts": "DLIS",
        "margin_bts": "Margin",
    }
    core = pipeline_ref.removeprefix("yearbook_yearbooknet_")
    core = core.removesuffix("_nosched_epoch5_warm2")
    if core in display_names:
        return display_names[core]
    return core

In [None]:
# Discover the pipelines stored under `pipelines_dir`, re-key them by integer
# id, and replace the raw pipeline name (first tuple element) with its short
# display label. The second tuple element is carried over unchanged.
pipelines = {
    int(raw_id): (map_pipeline_names(entry[0]), entry[1])
    for raw_id, entry in list_pipelines(pipelines_dir).items()
}
max_pipeline_id = max(pipelines)

In [None]:
from analytics.app.data.load import load_pipeline_logs

# Load the logs of every known pipeline, keyed by pipeline id.
# Pipeline 21 (rho loss) is deliberately left out.
pipeline_logs = {
    p_id: load_pipeline_logs(p_id, pipelines_dir)
    for p_id in pipelines.keys()
    if p_id != 21  # exclude rho loss
}

In [None]:
# Plot configuration (mode: single pipeline).

# Row filters applied to the evaluation frame: dataset, evaluation handler
# and metric name.
dataset_id = "yearbook_test"
eval_handler = "slidingmatrix"
metric = "Accuracy"

# Ids of the pipelines to compare; used as keys into `pipelines` and
# `pipeline_logs`.
pipeline_ids = [5, 11, 18, 25]

# Name of the column used as a row mask when selecting the composite model.
composite_model_variant = "currently_trained_model"

# When True, patch_yearbook_time is run over the interval columns.
patch_yearbook = True

# Wrangle data

In [None]:
# Reference log frame built from pipeline 5 (labelled "100%_baseline").
# Only its latest `sample_time` is used below.
df_all = logs_dataframe(pipeline_logs[5], "100%_baseline")
max_sample_time = df_all["sample_time"].max()

list_df_eval_single: list[pd.DataFrame] = []

for pipeline_id in pipeline_ids:
    logs_for_pipeline = pipeline_logs[pipeline_id]
    pipeline_label = pipelines[pipeline_id][0]
    # dfs_models_and_evals returns three values; only the last one is kept.
    _, _, df_eval_single = dfs_models_and_evals(logs_for_pipeline, max_sample_time, pipeline_label)
    list_df_eval_single.append(df_eval_single)

# Stack the per-pipeline evaluation frames into one long frame.
df_adjusted = pd.concat(list_df_eval_single)
df_adjusted

In [None]:
# Inspection only: list the distinct pipeline labels present in the combined frame.
df_adjusted["pipeline_ref"].unique()

In [None]:
# Inspection only: which pipeline labels have rows for dataset "yearbook-test".
# Alternative check: df_adjusted["dataset_id"].unique()
# NOTE(review): this literal uses a hyphen ("yearbook-test") while the configured
# `dataset_id` uses an underscore ("yearbook_test") — confirm which is intended.
is_yearbook_test = df_adjusted["dataset_id"] == "yearbook-test"
df_adjusted.loc[is_yearbook_test, "pipeline_ref"].unique()

In [None]:
# Keep only the rows for the configured dataset, evaluation handler and metric.
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not write to a slice of the unfiltered frame
# (which raises pandas' SettingWithCopyWarning).
df_adjusted = df_adjusted[
    (df_adjusted["dataset_id"] == dataset_id)
    & (df_adjusted["eval_handler"] == eval_handler)
    & (df_adjusted["metric"] == metric)
].copy()

# in percent (0-100)
# NOTE: this cell rebinds `df_adjusted`; re-running it without re-running the
# cells above multiplies `value` by 100 again.
df_adjusted["value"] = df_adjusted["value"] * 100
df_adjusted

In [None]:
# Optionally run patch_yearbook_time over each interval column of df_adjusted.
# The return value is discarded, so the helper presumably patches the frame
# in place — TODO confirm against its definition.
if patch_yearbook:
    for column in ("interval_start", "interval_center", "interval_end"):
        patch_yearbook_time(df_adjusted, column)

In [None]:
# Order the rows by the centre of their evaluation interval.
df_adjusted = df_adjusted.sort_values("interval_center")

In [None]:
# Add composite model

# Use the column named by `composite_model_variant` as a row mask and keep
# only the rows where it is set (assumes the column holds booleans — TODO confirm).
is_composite_row = df_adjusted[composite_model_variant]
df_composite = df_adjusted[is_composite_row]
df_composite

# Dump Data backup

# Create Plot

In [None]:
# Reduce `interval_center` to the text before the first "-" of its string form
# (presumably the year of a date-like value — TODO confirm).
# .assign builds a new frame instead of writing a column into `df_composite`,
# which is a filtered slice of `df_adjusted`; this avoids pandas'
# SettingWithCopyWarning.
df_composite = df_composite.assign(
    interval_center=df_composite["interval_center"].astype(str).str.split("-").str[0]
)
df_composite

In [None]:
# Inspection only: show the composite rows belonging to model index 3.
df_composite[df_composite["model_idx"] == 3]

In [None]:
# Ten colours from seaborn's "RdBu" palette; entries 1 and -2 are used for the
# first two lines of the plot below. The bare expression displays the palette.
palette = sns.color_palette("RdBu", 10)
palette

In [None]:
# Ten colours from seaborn's "colorblind" palette; entries 1 and 2 are used for
# the last two lines of the plot below. The bare expression displays the palette.
palette2 = sns.color_palette("colorblind", 10)
palette2

In [None]:
from analytics.plotting.common.common import INIT_PLOT

# Project-wide plot setup, then notebook-specific overrides.
INIT_PLOT()
plt.rcParams["svg.fonttype"] = "none"  # emit SVG text as text elements, not paths
sns.set_style("whitegrid")

FONTSIZE = 20
DOUBLE_FIG_WIDTH = 10
DOUBLE_FIG_HEIGHT = 3.5
DOUBLE_FIG_SIZE = (DOUBLE_FIG_WIDTH, 1.2 * DOUBLE_FIG_HEIGHT)

fig = plt.figure(
    edgecolor="black",
    frameon=True,
    figsize=DOUBLE_FIG_SIZE,
    dpi=300,
)

# One line per pipeline label; colour and dash style both encode the pipeline.
# The palette lists exactly four colours, one per entry in `pipeline_ids`.
ax = sns.lineplot(
    df_composite,
    x="interval_center",
    y="value",
    hue="pipeline_ref",
    markersize=2,
    linewidth=2.5,
    palette=[palette[1], palette[-2], palette2[1], palette2[2]],
    style="pipeline_ref",
)
ax.set(xlim=(0, 85))
ax.set(ylim=(58, 100))

plt.xlabel("Evaluation Year")

# Put an x tick at every 20th position and label it with the matching year.
# `interval_center` holds strings, so positions are 0, 1, 2, ... in plotting
# order; the labels assume position 0 corresponds to 1930 — TODO confirm.
plt.xticks(
    ticks=list(range(0, 2010 - 1930 + 1, 20)),
    labels=list(range(1930, 2010 + 1, 20)),
    rotation=0,
)
plt.ylabel("Accuracy %")


# Map the hue labels to their display equivalents; labels without an entry
# are shown unchanged.
# NOTE(review): the "RS2 (w/o)" key never matches if the pipeline label is
# "RS2" (as produced by map_pipeline_names), so that line is shown as "RS2".
# Confirm which legend text is intended.
handles, labels = ax.get_legend_handles_labels()
label_map = {
    "Full": "Full",
    "RS2 (w/o)": "RS2 (w/o)",
    "DLIS": "DLIS",
    "Margin": "Margin ",
}
# extra spaces because latex fonts will be bigger
latex_labels = [f"{label_map.get(label, label)}  " for label in labels]
legend = ax.legend(
    loc="lower left",
    ncol=2,
    handles=handles,
    labels=latex_labels,
    labelspacing=0.2,
    columnspacing=0.9,
    handlelength=1.3,
    bbox_to_anchor=(0.05, 0.05),
)

# Display the plot
plt.tight_layout()
plt.show()

# Save Plot as png and svg

In [None]:
# Write the figure to `output_dir` once per image format.
for img_type in ("png", "svg"):
    img_path = (output_dir / "eval_over_time").with_suffix(f".{img_type}")
    fig.savefig(img_path, bbox_inches="tight", dpi=300)