In [None]:
from pathlib import Path

import pandas as pd

from analytics.app.data.load import list_pipelines
from analytics.plotting.common.color import discrete_colors
from analytics.plotting.common.linear_regression_scatterplot import scatter_linear_regression
from modyn.supervisor.internal.grpc.enums import PipelineStage
from modyn.supervisor.internal.pipeline_executor.models import StageLog

%load_ext autoreload
%autoreload 2

In [None]:
# INPUTS

# Dataset whose "11_baselines_amount" runs are analyzed: "yearbook", "huffpost" or "arxiv".
# Switching datasets only requires changing this one value.
dataset_name = "yearbook"

# Root directory that holds one sub-directory per dataset.
data_root = Path("/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering")
pipelines_dir = data_root / dataset_name / "11_baselines_amount"

# Directory checked for existence before any plotting is attempted.
output_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots")

# Fail early, naming the offending path, instead of failing later during loading or plotting.
assert pipelines_dir.exists(), f"pipelines_dir does not exist: {pipelines_dir}"
assert output_dir.exists(), f"output_dir does not exist: {output_dir}"

In [None]:
# Discover every pipeline run stored below the configured directory (mapping keyed by pipeline id).
pipelines = list_pipelines(pipelines_dir)

# Largest pipeline id among the discovered runs.
max_pipeline_id = max(pipelines)

# Display the discovered pipelines as the cell output.
pipelines

In [None]:
from analytics.app.data.load import load_pipeline_logs

# Load the logs of every discovered pipeline, keyed by pipeline id.
# Only the id is needed for loading, so iterate the keys instead of destructuring unused values.
pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in pipelines}

In [None]:
# Extract the number of epochs per trigger. Every pipeline has to be configured with the same
# value, because it is used further down as one common divisor for the trained-sample counts.
num_epochs: int | None = None

for p_id, logs in pipeline_logs.items():
    epochs_per_trigger = logs.config.pipeline.training.epochs_per_trigger
    if num_epochs is None:
        # The first pipeline defines the reference value.
        num_epochs = epochs_per_trigger
    else:
        assert num_epochs == epochs_per_trigger, (
            f"pipeline {p_id} uses {epochs_per_trigger} epochs per trigger, expected {num_epochs}"
        )

# Truthiness check on purpose: rejects both "no pipeline found" (None) and 0, which would
# otherwise cause a division by zero when normalizing the sample counts.
assert num_epochs, "could not determine a non-zero number of epochs per trigger"

# Wrangle data

In [None]:
# Build one dataframe of TRAIN-stage log records per pipeline and stack them afterwards.
list_df_train: list[pd.DataFrame] = []

for pipeline_id in pipelines:
    logs = pipeline_logs[pipeline_id]

    # Keep only the supervisor stage runs that belong to the TRAIN stage.
    train_logs = list(
        filter(lambda record: record.id == PipelineStage.TRAIN.name, logs.supervisor_logs.stage_runs)
    )

    df_pipeline = StageLog.df(stage_logs=train_logs, extended=True)
    # NOTE(review): the column is called "pipeline_id" but is filled with the first element of the
    # pipelines entry, presumably the pipeline name (it is cleaned with a name regex later) - confirm.
    df_pipeline["pipeline_id"] = pipelines[pipeline_id][0]
    list_df_train.append(df_pipeline)

df_train = pd.concat(list_df_train)
df_train.head()

# Conversion

In [None]:
# Clean pipeline name

import re


def pipeline_name_cleaner(name: str) -> str:
    """Strip everything up to and including ``dataamount_`` from a pipeline name.

    The match (any prefix, ``dataamount_`` and the digits that follow) is replaced by
    the digits alone. Text after the digits is kept, and a name without a
    ``dataamount_<digits>`` part is returned unchanged.
    """
    amount_pattern = re.compile(r".*dataamount_(\d+)")
    return amount_pattern.sub(r"\1", name)


# Reduce every pipeline name to its cleaned form (element-wise over the column).
df_train["pipeline_id"] = df_train["pipeline_id"].map(pipeline_name_cleaner)
df_train.head()

In [None]:
# Express both timing columns in minutes.
df_train["duration"] = df_train["duration"].dt.total_seconds().div(60)  # timedelta -> minutes
df_train["train_time_at_trainer"] = df_train["train_time_at_trainer"].div(1_000).div(60)  # millis -> minutes

# Samples seen per epoch: the logged sample count divided by the number of epochs per trigger.
df_train["num_input_samples"] = df_train["num_samples"].div(num_epochs)

# The dataset name is the directory directly above the experiment directory.
dataset = pipelines_dir.parent.name

if dataset != "yearbook":
    # For every dataset except yearbook, report sample counts in thousands and
    # turn the cleaned pipeline label into a "<n>k" string accordingly.
    df_train["num_input_samples"] = df_train["num_input_samples"].div(1_000)
    df_train["pipeline_id"] = df_train["pipeline_id"].astype(int).floordiv(1_000).astype(str) + "k"

df_train

In [None]:
# Order the rows by ascending logged sample count before plotting.
df_train = df_train.sort_values("num_samples")

# Create Plot

In [None]:
from analytics.plotting.common.save import save_plot

# Pick the hue colors per dataset: the leading and the trailing entries of the list returned by
# discrete_colors. The selection uses `dataset`, like every other dataset-dependent
# setting in this cell, instead of a substring match on the full path.
if dataset == "yearbook":
    palette = discrete_colors(14)[0:4] + discrete_colors(14)[10:14]
elif dataset == "huffpost":
    palette = discrete_colors(12)[0:4] + discrete_colors(12)[9:12]
else:
    palette = discrete_colors(8)[0:3] + discrete_colors(8)[6:8]

# num_input_samples is only scaled to thousands for non-yearbook datasets,
# so the "(k)" unit must not appear on the yearbook axis.
x_label = "#Trained Samples / #Epochs" if dataset == "yearbook" else "#Trained Samples (k) / #Epochs"

# Scatter plot with regression line: samples per epoch vs. training time measured at the trainer.
fig = scatter_linear_regression(
    df_train,
    x="num_input_samples",
    y="train_time_at_trainer",  # duration is broken due to bug in grpc interface
    hue="pipeline_id",
    palette=palette,
    title_label="Training Size (Samples) vs. Cost (Time)",
    x_label=x_label,
    y_label="Training Time (min)",
    legend_label="Trigger every",
    height_factor=0.5 if dataset != "yearbook" else 0.55,
    width_factor=0.575 if dataset != "yearbook" else 0.7,
    small_legend_fonts=dataset != "yearbook",
    # x_ticks=[],
    # y_ticks=[],
)

save_plot(
    fig=fig,
    name=dataset + "_training_size_vs_cost",
)

In [None]:
# TODO: run more variants in less dense areas
# TODO: plot / add the number of datapoints to the thesis so that the significance of the regression line is clear
# State in thesis that there are no outliers to be expected!

# Plotting faulty time at supervisor

In [None]:
from analytics.plotting.common.save import save_plot

# Pick the hue colors per dataset: the leading and the trailing entries of the list returned by
# discrete_colors. The selection uses `dataset`, like every other dataset-dependent
# setting in this cell, instead of a substring match on the full path.
if dataset == "yearbook":
    palette = discrete_colors(14)[0:4] + discrete_colors(14)[10:14]
elif dataset == "huffpost":
    palette = discrete_colors(12)[0:4] + discrete_colors(12)[9:12]
else:
    palette = discrete_colors(8)[0:3] + discrete_colors(8)[6:8]

# num_input_samples is only scaled to thousands for non-yearbook datasets,
# so the "(k)" unit must not appear on the yearbook axis.
x_label = "#Trained Samples / #Epochs" if dataset == "yearbook" else "#Trained Samples (k) / #Epochs"

# Same plot as the trainer-side one, but using the supervisor-side TRAIN stage duration,
# which is known to be unreliable (see the inline note on `y`).
fig = scatter_linear_regression(
    df_train,
    x="num_input_samples",
    y="duration",  # broken due to bug in grpc interface
    hue="pipeline_id",
    palette=palette,
    title_label="Training Size (Samples) vs. Cost (Time)",
    x_label=x_label,
    y_label="Supervisor TRAIN" if dataset != "yearbook" else "Supervisor TRAIN Stage (min)",
    legend_label="Trigger every",
    height_factor=0.5 if dataset != "yearbook" else 0.7,
    width_factor=0.575 if dataset != "yearbook" else 0.7,
    small_legend_fonts=dataset != "yearbook",
    # x_ticks=[],
    # y_ticks=[],
)

save_plot(
    fig=fig,
    name=dataset + "_training_size_vs_cost_bug_supervisor_time",
)