In [None]:
from pathlib import Path
from analytics.app.data.load import list_pipelines
from analytics.app.data.transform import dfs_models_and_evals
from analytics.app.data.transform import patch_yearbook_time
from analytics.app.data.transform import logs_dataframe
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re


from analytics.plotting.common.common import SAVE_PLOT

%load_ext autoreload
%autoreload 2

In [None]:
# INPUTS
#
# `yearbook` selects the dataset; later cells branch on this flag.
# NOTE(review): both directories are absolute paths on a single developer
# machine — they need to be adjusted before running anywhere else.

yearbook = True

pipelines_dir = (
    Path(
        "/Users/robinholzinger/robin/dev/eth/modyn-sigmod-data/yearbook/data_selection_50%/logs_agg_patch"
    )
    if yearbook
    else Path(
        "/Users/robinholzinger/robin/dev/eth/modyn-sigmod-data/cglm-landmark/data_selection/logs_agg_patch_currently_trained"
    )
)
output_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots")

# Fail early when either directory is missing.
assert pipelines_dir.exists()
assert output_dir.exists()

In [None]:
def map_pipeline_names(pipeline_ref: str) -> str:
    """Translate a raw pipeline reference into the label used in the plots.

    The dataset prefixes ``yearbook_yearbooknet_`` and ``cglm_`` are removed
    (in that order), then everything from the first ``_nosched`` onwards is
    cut off. The remaining key is looked up in a table of display names; a key
    without an entry is used unchanged.

    Args:
        pipeline_ref: Raw pipeline name.

    Returns:
        The display label followed by two trailing spaces.
    """
    display_names = {
        "full": "Full",
        "rs2wo": "RS2 (w/o)",
        "grad_bts": "DLIS",
        "margin_bts": "Margin",
        "lc_bts": "Least conf.",
        "entropy_bts": "Entropy",
        "rs2w": "RS2",
        "classb": "Class-Bal.",
        "uniform": "Uniform",
        "loss_bts": "Loss",
    }

    key = pipeline_ref
    for dataset_prefix in ("yearbook_yearbooknet_", "cglm_"):
        key = key.removeprefix(dataset_prefix)
    key = re.sub("_nosched.*", "", key)

    label = display_names.get(key, key)
    return f"{label}  "

In [None]:
# Discover the pipelines in the log directory, then replace each raw name by its plot label.
pipelines = list_pipelines(pipelines_dir)

# Runs whose name ends in "_r125" / "_r250" are dropped here.
# Alternative filters (swap in for the condition below):
#   if (v[0].endswith("_r250") or "full" in v[0])  # for inspecting 25% selection
#   if (v[0].endswith("_r125") or "full" in v[0])  # for inspecting 12.5% selection
pipelines = {
    int(k): (map_pipeline_names(v[0]), v[1])
    for k, v in pipelines.items()
    if not v[0].endswith(("_r125", "_r250"))
}
max_pipeline_id = max(pipelines)
pipelines

In [None]:
from analytics.app.data.load import load_pipeline_logs

# Load the logs of every selected pipeline, keyed by pipeline id.
pipeline_logs = {
    p_id: load_pipeline_logs(p_id, pipelines_dir)
    for p_id in pipelines
    if p_id != 21  # exclude rho loss
}

In [None]:
# Name of the column later used as a row mask to pick the composite model.
composite_model_variant = "currently_trained_model"
patch_yearbook = yearbook

# Dataset-specific selectors for the evaluation results.
if yearbook:
    dataset_id, eval_handler, metric = "yearbook_test", "slidingmatrix", "Accuracy"
else:
    dataset_id, eval_handler, metric = "cglm_landmark_min25-test", "exactmatrix", "Top-5-Accuracy"

pipeline_ids = list(pipeline_logs)

# Wrangle data

In [None]:
# Reference run whose latest sample time is passed to every pipeline below.
# if you inspect 25/12.5%, update these numbers
baseline_pipeline_id = 5 if yearbook else 2
df_all = logs_dataframe(pipeline_logs[baseline_pipeline_id], "100%_baseline")
max_sample_time = df_all["sample_time"].max()

# Only the third frame returned by dfs_models_and_evals is kept per pipeline.
list_df_eval_single: list[pd.DataFrame] = [
    dfs_models_and_evals(pipeline_logs[pid], max_sample_time, pipelines[pid][0])[2]
    for pid in pipeline_ids
]

df_adjusted = pd.concat(list_df_eval_single)
df_adjusted

In [None]:
# CGLM only: keep evaluations with more than 300 samples.
# (Original intent: filter out the first two and the last evaluations because they are very small.)
if not yearbook:
    df_adjusted = df_adjusted.loc[df_adjusted["dataset_size"].gt(300)]

In [None]:
# Sanity check: which pipeline labels are present in the evaluation data?
df_adjusted.loc[:, "pipeline_ref"].unique()


In [None]:
# df_adjusted["dataset_id"].unique()
# NOTE(review): this literal uses a hyphen ("yearbook-test"), whereas the dataset id
# configured for yearbook runs is "yearbook_test" — confirm which spelling is intended.
df_adjusted.loc[df_adjusted["dataset_id"] == "yearbook-test", "pipeline_ref"].unique()


In [None]:
# Keep only the rows for the configured dataset, evaluation handler and metric,
# and express the metric in percent (0-100).
#
# `.assign` builds a new frame instead of writing a column into the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
# NOTE: this cell rebinds `df_adjusted`; re-running it multiplies "value" by 100 again.
df_adjusted = df_adjusted.loc[
    (df_adjusted["dataset_id"] == dataset_id)
    & (df_adjusted["eval_handler"] == eval_handler)
    & (df_adjusted["metric"] == metric)
].assign(value=lambda frame: frame["value"] * 100)
df_adjusted

In [None]:
# Yearbook only: run patch_yearbook_time over each interval column.
# The return value is discarded, so the helper presumably edits df_adjusted in place — TODO confirm.
if patch_yearbook:
    interval_columns = ("interval_start", "interval_center", "interval_end")
    for column in interval_columns:
        patch_yearbook_time(df_adjusted, column)

In [None]:
# Order the evaluations by the centre of their evaluation interval.
df_adjusted = df_adjusted.sort_values("interval_center")

In [None]:
# Add composite model: keep the rows flagged by the `composite_model_variant` column.
is_composite_row = df_adjusted[composite_model_variant]
df_composite = df_adjusted.loc[is_composite_row]
df_composite

# Dump Data backup

# Create Plot

In [None]:
# Reduce "interval_center" to the text before the first "-".
# `.assign` returns a new frame rather than writing into `df_composite`, which is a
# filtered slice of `df_adjusted`; this avoids pandas' SettingWithCopyWarning.
df_composite = df_composite.assign(
    interval_center=df_composite["interval_center"].astype(str).str.split("-").str[0]
)
df_composite

In [None]:
# Mean value per pipeline label, best first; this index order is reused as the box order in the plot.
mean_accuracies_per_pipeline = (
    df_composite.groupby("pipeline_ref")["value"]
    .mean()
    .sort_values(ascending=False)
)
mean_accuracies_per_pipeline

In [None]:
import matplotlib as mpl
from analytics.plotting.common.common import FIG_LEGEND, INIT_PLOT

# Box plot of the composite-model metric per pipeline, boxes ordered by mean value.
# (The stray `from turtle import title` was removed: it was unused and pulls in tkinter.)

INIT_PLOT()
plt.rcParams["svg.fonttype"] = "none"  # emit SVG text as text rather than as paths
sns.set_style("whitegrid")

FONTSIZE = 20
DOUBLE_FIG_WIDTH = 10
DOUBLE_FIG_HEIGHT = 3.5
# Both datasets use the same height scaling factor of 1.4.
DOUBLE_FIG_SIZE = (DOUBLE_FIG_WIDTH, 1.4 * DOUBLE_FIG_HEIGHT)

fig = plt.figure(
    edgecolor="black",
    frameon=True,
    figsize=DOUBLE_FIG_SIZE,
    dpi=300,
)

ax = sns.boxplot(
    df_composite,
    x="pipeline_ref",
    order=mean_accuracies_per_pipeline.index,
    y="value",
    hue="pipeline_ref",
    hue_order=mean_accuracies_per_pipeline.index,
    palette="RdBu",
    linewidth=2.5,
    flierprops={"markeredgewidth": 2},
)

# Fixed y-range per dataset (values are in percent).
if yearbook:
    ax.set(ylim=(55, 100))
else:
    ax.set(ylim=(15, 75))

plt.xlabel("")
plt.xticks(rotation=45)

plt.ylabel("Accuracy %" if yearbook else "Top-5 Accuracy %")

# Display the plot
plt.tight_layout()
plt.show()

# Save plot as PNG and SVG

In [None]:
# Write the figure once per image format into the output directory.
dataset_tag = "yb" if yearbook else "cglm"
for img_type in ("png", "svg"):
    img_path = output_dir / f"boxplot_{dataset_tag}.{img_type}"
    fig.savefig(img_path, bbox_inches="tight", dpi=300)