In [None]:
from pathlib import Path
from analytics.app.data.load import list_pipelines
from analytics.app.data.transform import dfs_models_and_evals
from analytics.app.data.transform import patch_yearbook_time
from analytics.app.data.transform import logs_dataframe
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from analytics.plotting.common.common import SAVE_PLOT

%load_ext autoreload
%autoreload 2

In [None]:
# INPUTS
# NOTE(review): both paths are absolute and machine-specific; adjust them before running elsewhere.

# Directory handed to `list_pipelines` / `load_pipeline_logs` in the following cells.
pipelines_dir = Path(
    "/Users/robinholzinger/robin/dev/eth/modyn-sigmod-data/cglm-landmark/data_selection/logs_agg_patch_currently_trained"
)
# Directory the final cell writes the `composite.*` figures into.
output_dir = Path(
    "/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots"
)
# Fail early if either directory is missing.
assert pipelines_dir.exists()
assert output_dir.exists()

In [None]:
# Discover all pipelines stored below `pipelines_dir`, keyed by pipeline id.
pipelines = list_pipelines(pipelines_dir)

# Largest pipeline id found in the log directory.
max_pipeline_id = max(pipelines)

# Show the entry of pipeline 24 as a quick sanity check.
pipelines[24]

In [None]:
from analytics.app.data.load import load_pipeline_logs

# Load the logs of every discovered pipeline, keyed by pipeline id.
# Each value of `pipelines` is unpacked as a pair; neither element is needed here.
pipeline_logs = {
    pid: load_pipeline_logs(pid, pipelines_dir)
    for pid, (_name, _path) in pipelines.items()
}

In [None]:
# Evaluation results are later restricted to this dataset / eval handler / metric combination.
dataset_id = "cglm_landmark_min25-test"
eval_handler = "exactmatrix"
metric = "Top-5-Accuracy"
# Pipeline whose logs are analysed in this notebook.
pipeline_id = 24
# `model_idx` values that are kept as individual (non-composite) lines in the plot.
non_composite_models = [7, 9, 10, 15]

# Wrangle data

In [None]:
# Logs of the pipeline under analysis.
selected_logs = pipeline_logs[pipeline_id]

# Flatten the pipeline logs into a dataframe; only its latest `sample_time` is needed below.
df_all = logs_dataframe(selected_logs, "100%_baseline")

# Only the third frame returned by `dfs_models_and_evals` is used here.
_, _, df_eval_single = dfs_models_and_evals(
    selected_logs,
    df_all["sample_time"].max(),
    pipelines[pipeline_id][0],
)

# Working frame that the following cells narrow down step by step.
df_adjusted = df_eval_single
df_adjusted

In [None]:
# Keep only evaluations with more than 300 samples: the first two and the last
# CGLM evaluation intervals are too small to be meaningful.
df_adjusted = df_adjusted.loc[df_adjusted["dataset_size"] > 300]

In [None]:
# Inspect which `pipeline_ref` values are present in the working frame.
df_adjusted["pipeline_ref"].unique()


In [None]:
# Inspect the `pipeline_ref` values of the rows belonging to the "yearbook-test" dataset.
# NOTE(review): the rest of this notebook selects `cglm_landmark_min25-test`;
# confirm that "yearbook-test" is still the intended dataset here.
df_adjusted.loc[df_adjusted["dataset_id"] == "yearbook-test", "pipeline_ref"].unique()


In [None]:
# Restrict the evaluations to the configured dataset, eval handler and metric.
selection_mask = (
    (df_adjusted["dataset_id"] == dataset_id)
    & (df_adjusted["eval_handler"] == eval_handler)
    & (df_adjusted["metric"] == metric)
)

# Express the metric in percent (0-100).
# `.assign` builds a new frame, so no column is written onto a slice of the
# previous frame (which would raise pandas' SettingWithCopyWarning).
# Note: re-running only this cell rescales `value` again; re-run from the
# wrangling cell instead.
df_adjusted = df_adjusted[selection_mask].assign(
    value=lambda frame: frame["value"] * 100
)
df_adjusted

In [None]:
# Order the evaluations chronologically by the centre of their evaluation interval.
df_adjusted = df_adjusted.sort_values("interval_center")

In [None]:
# Add composite models
#
# Rows flagged by the boolean columns `currently_trained_model` and
# `currently_active_model` are relabelled into two synthetic series
# ("Curr. Trained" / "Curr. Active"); the models listed in
# `non_composite_models` are kept under their own `model_idx`.
#
# `.assign` returns new frames, so no column is written onto a slice of
# `df_adjusted` (which would raise pandas' SettingWithCopyWarning).

df_composite_currently_trained = df_adjusted[
    df_adjusted["currently_trained_model"]
].assign(composite=True, model_idx="Curr. Trained")

df_composite_currently_active = df_adjusted[
    df_adjusted["currently_active_model"]
].assign(composite=True, model_idx="Curr. Active")

df_non_composite_selection = df_adjusted[
    df_adjusted["model_idx"].isin(non_composite_models)
].assign(composite=False)

# Stack all three selections; the `composite` column tells them apart when plotting.
df_composite = pd.concat(
    [
        df_composite_currently_trained,
        df_composite_currently_active,
        df_non_composite_selection,
    ]
)

# df_composite = df_composite[
#     df_composite["model_idx"].isin(
#         [str(x) for x in shown_non_composite_models] + ["00-pipeline-composite-model"]
#     )
# ]

# Dump Data backup

# Create Plot

In [None]:
# datetime to date
# Going through `pd.to_datetime` keeps this cell re-runnable: after the first
# run the column holds plain `date` objects, on which `.dt` is unavailable.
df_composite["interval_center"] = pd.to_datetime(df_composite["interval_center"]).dt.date

In [None]:
# Inspect the rows of model 3.
# NOTE(review): `df_composite` only holds the models in `non_composite_models`
# plus the two relabelled composite series, so this selection is expected to be empty.
df_composite.loc[df_composite["model_idx"] == 3]

In [None]:
# Ten colours of the "RdBu" palette; the plot below picks entries from both ends.
palette = sns.color_palette(palette="RdBu", n_colors=10)
palette

In [None]:
# The accidental `from turtle import title` import was removed: `title` is never
# used and importing `turtle` pulls in tkinter, which fails on Python builds
# without Tk support.
import matplotlib as mpl
from analytics.plotting.common.common import FIG_LEGEND, INIT_PLOT

# Shared plot initialisation first, then the settings specific to this figure.
INIT_PLOT()
plt.rcParams["svg.fonttype"] = "none"  # emit text as text in SVG files instead of paths
sns.set_style("whitegrid")

FONTSIZE = 20
DOUBLE_FIG_WIDTH = 10
DOUBLE_FIG_HEIGHT = 3.5
DOUBLE_FIG_SIZE = (DOUBLE_FIG_WIDTH, 1.2 * DOUBLE_FIG_HEIGHT)

fig = plt.figure(
    edgecolor="black",
    frameon=True,
    figsize=DOUBLE_FIG_SIZE,
    dpi=300,
)

# Background layer: the individual (non-composite) models as silver lines.
ax = sns.lineplot(
    df_composite[~df_composite["composite"]],
    x="interval_center",
    y="value",
    hue="model_idx",
    style="composite",
    style_order=[True, False],
    markers=False,
    palette=["silver"],
    linewidth=2,
)

# Foreground layer: the two composite series, drawn on the same axes with markers.
ax = sns.lineplot(
    df_composite[df_composite["composite"]],
    ax=ax,
    x="interval_center",
    y="value",
    hue="model_idx",
    style="model_idx",
    markers=True,
    markersize=12.5,
    palette=[palette[1], palette[-2]],
    linewidth=3,
)

ax.set(ylim=(-2, 80))

# Adjust x-axis tick labels: one tick for 2007, then every five years from `x_start`.
plt.xlabel("Evaluation Year", labelpad=10)
x_start = 2010
tick_years = [2007, *range(x_start, 2025, 5)]
x_ticks = [pd.to_datetime(f"{year}-01-01") for year in tick_years]
x_labels = [str(year) for year in tick_years]
plt.xticks(x_ticks, x_labels, rotation=0)

# y-axis label
plt.ylabel("Top-5 Accuracy %", labelpad=15)

# Optionally map the legend labels to LaTeX equivalents (the map is currently empty,
# so labels are passed through unchanged).
handles, labels = ax.get_legend_handles_labels()
label_map = {}
# extra spaces because latex fonts will be bigger
latex_labels = [f"{label_map.get(label, label)}         " for label in labels]
# NOTE(review): the `[8:11]` slice depends on the order and number of legend
# entries seaborn emits for the two lineplot calls above; presumably it selects
# the composite entries -- re-check it whenever the plotted models change.
legend = ax.legend(
    loc="upper left",
    handles=handles[8:11],
    labels=latex_labels[8:11],
    labelspacing=0.3,
    columnspacing=0.9,
)

# Display the plot
plt.tight_layout()
plt.show()

# Save plot as PNG and SVG

In [None]:
# Write the figure into `output_dir` once per image format.
for img_type in ("png", "svg"):
    fig.savefig(
        output_dir / f"composite.{img_type}",
        bbox_inches="tight",
        dpi=300,
    )