In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from analytics.app.data.load import list_pipelines
from analytics.plotting.common.common import init_plot
from analytics.plotting.common.font import setup_font
from modyn.supervisor.internal.grpc.enums import PipelineStage
from modyn.supervisor.internal.pipeline_executor.models import StageLog

%load_ext autoreload
%autoreload 2

In [None]:
# INPUTS

# Directory with the logged pipeline runs to analyze. Exactly one dataset
# should be active at a time; the alternatives are kept for quick switching.
pipelines_dir = Path(
    "/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/arxiv/11_baselines_amount"
)
# pipelines_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/huffpost/11_baselines_amount")
# pipelines_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/11_baselines_amount")

# Presumably the target directory for exported plots; no visible cell writes to it.
output_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots")

# Stop right away when either directory is missing.
assert pipelines_dir.exists() and output_dir.exists()

In [None]:
# Discover the pipelines stored under `pipelines_dir`.
pipelines = list_pipelines(pipelines_dir)

# Iterating the mapping yields its keys, so this is the largest pipeline id.
max_pipeline_id = max(pipelines)

pipelines

In [None]:
from analytics.app.data.load import load_pipeline_logs

# Load the logs of every listed pipeline, keyed by pipeline id.
# Only the keys of `pipelines` are needed here, so iterate them directly instead
# of unpacking each value into names that are never used.
pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in pipelines}

# Wrangle data

In [None]:
# Build one dataframe of TRAIN stage runs per pipeline, then stack them.
list_df_train: list[pd.DataFrame] = []

for pipeline_id, pipeline_entry in pipelines.items():
    logs = pipeline_logs[pipeline_id]
    # Keep only the stage runs whose id is the TRAIN stage name.
    train_logs = [
        stage_run for stage_run in logs.supervisor_logs.stage_runs if stage_run.id == PipelineStage.TRAIN.name
    ]
    # NOTE: despite its name, the `pipeline_id` column stores the first element of
    # the pipeline entry, which the later cleaning cell treats as the pipeline name.
    list_df_train.append(
        StageLog.df(stage_logs=train_logs, extended=True).assign(pipeline_id=pipeline_entry[0])
    )

df_train = pd.concat(list_df_train)
df_train.head()

# Conversion

In [None]:
# Clean pipeline name

import re


def pipeline_name_cleaner(name: str):
    """Turn a raw pipeline name into a readable trigger label.

    The span from the start of the name through the last
    ``_dataamount_<digits>`` marker is replaced by
    ``trigger every <digits> samples``. Text following the digits is kept, and
    names without the marker are returned unchanged.
    """
    dataamount_pattern = re.compile(r".*_dataamount_(\d+)")
    readable_label = "trigger every \\1 samples"
    return dataamount_pattern.sub(readable_label, name)


# Swap the raw pipeline names in the `pipeline_id` column for readable labels.
df_train["pipeline_id"] = df_train["pipeline_id"].map(pipeline_name_cleaner)
df_train.head()

In [None]:
# Convert both timing columns to minutes.
#
# The conversion rewrites the columns of `df_train` in place, so it must run
# only once: a second run would fail on `.dt` (the column is already float) or
# rescale `train_time_at_trainer` again. `duration` still having a timedelta
# dtype marks the frame as unconverted, which makes this cell safe to re-run.
if pd.api.types.is_timedelta64_dtype(df_train["duration"]):
    # timedelta -> minutes
    df_train["duration"] = df_train["duration"].dt.total_seconds() / 60
    # milliseconds -> minutes
    df_train["train_time_at_trainer"] = df_train["train_time_at_trainer"] / 1_000 / 60
df_train

In [None]:
# Order the rows by ascending sample count.
df_train = df_train.sort_values("num_samples", ascending=True)

# Create Plot

In [None]:
from analytics.plotting.common.color import discrete_colors, main_color

sns.set_style("whitegrid")

init_plot()
setup_font(small_label=True, small_title=True)


FONTSIZE = 20
DOUBLE_FIG_WIDTH = 10
DOUBLE_FIG_HEIGHT = 3.5
DOUBLE_FIG_SIZE = (DOUBLE_FIG_WIDTH, 1.5 * DOUBLE_FIG_HEIGHT)

# Scale factors applied to the double-figure base dimensions.
width_factor = 0.5
height_factor = 0.5

fig = plt.figure(
    edgecolor="black",
    frameon=True,
    figsize=(
        DOUBLE_FIG_WIDTH * width_factor,
        2 * DOUBLE_FIG_HEIGHT * height_factor,
    ),
    dpi=300,
)
# Draw both layers on one explicit axes instead of relying on pyplot's
# implicit "current axes" state.
ax = fig.add_subplot()

# Hue palette: the yearbook runs use ten colors, every other dataset
# (huffpost and arxiv alike) uses five. The first and last entries of the
# discrete color scale are taken and the middle ones skipped.
if "yearbook" in str(pipelines_dir):
    color_scale = discrete_colors(14)
    hue_palette = color_scale[0:5] + color_scale[9:14]
else:
    color_scale = discrete_colors(8)
    hue_palette = color_scale[0:3] + color_scale[6:8]

# Layer 1: regression of trainer-side training time on the number of samples.
ax1 = sns.regplot(
    data=df_train,
    x="num_samples",
    y="train_time_at_trainer",  # duration
    color=main_color(0),
    ax=ax,
)

# Layer 2: the individual training runs, colored by pipeline.
ax2 = sns.scatterplot(
    data=df_train,
    x="num_samples",
    y="train_time_at_trainer",  # duration
    hue="pipeline_id",
    palette=hue_palette,
    s=200,
    legend=True,
    marker="X",
    ax=ax,
)

# Display the plot
fig.tight_layout()
plt.show()

In [None]:
# TODO: run more variants in less dense areas
# TODO: plot / add number of datapoints to thesis so that the significance of the regression line is clear
# State in thesis that there are no outliers to be expected!