# Motivation

This notebook serves a simple purpose: patching pipeline logfiles. Since there are different ways to define the
interval during which a model counts as the `most recent` model, we allow patching the logfile to the desired definition.

By default our pipeline assumes a model is most recent for the time AFTER the training interval.
However, sometimes we want to consider the model most recent for the time DURING the training interval.

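This notebook rewrites the most-recent-model flags of the evaluation requests in the logfile
(`currently_trained_model`, and in `patch_logfile` also `currently_active_model`) to the non-default
definition (during the training interval).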

In [None]:
from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs


from modyn.supervisor.internal.grpc.enums import PipelineStage
# fill missing times in cumulative plot


from analytics.app.data.transform import logs_dataframe
from pathlib import Path
from analytics.app.data.transform import dfs_models_and_evals


%load_ext autoreload
%autoreload 2

# Data loading

In [None]:
# VARIABLES
# Path of the pipeline logfile that the cells below load, inspect and finally overwrite in place.
# NOTE(review): machine-specific absolute path — adjust before running.

pipeline_logfile = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/pipeline_11/pipeline.log")

In [None]:
# Parse the JSON content of the logfile into a validated `PipelineLogs` model.
logs = PipelineLogs.model_validate_json(pipeline_logfile.read_text())


In [None]:

# Convert the logs into a dataframe and take its latest `sample_time`; that timestamp is handed to
# `dfs_models_and_evals` together with the logs to derive the model and evaluation dataframes.
df_logs = logs_dataframe(logs)
max_timestamp = df_logs["sample_time"].max()
df_models, eval_requests, evals_metrics = dfs_models_and_evals(logs, max_timestamp)

# Data exploration

In [None]:
# Display the models dataframe returned by `dfs_models_and_evals`.
df_models

In [None]:
# Display the evaluation requests dataframe returned by `dfs_models_and_evals`.
eval_requests

In [None]:
# Display only the evaluation requests whose `currently_active_model` column is truthy.
eval_requests[eval_requests["currently_active_model"]]

# Patch logfile

In [None]:
# Recompute `currently_trained_model` for every EVALUATE_MULTI stage run of the loaded `logs`.
#
# For a fixed interval the evaluation request of a certain model is the most recent, if the model training
# interval center lies within the evaluation interval.
# Note: this is not a generic solution, but works for the slicing case with fixed evaluation and trigger
# intervals in the same order of magnitude.
for eval_log in logs.supervisor_logs.stage_runs:
    # Only multi-evaluation stage runs carry an evaluation request to patch.
    if eval_log.id != PipelineStage.EVALUATE_MULTI.name:
        continue

    request = eval_log.info.eval_request

    # Exactly one row of the models dataframe must describe the evaluated model.
    model_row = df_models[df_models["id_model"] == request.id_model]
    assert len(model_row) == 1

    model = model_row.iloc[0]
    train_start_ts = model["train_start"].to_pydatetime().timestamp()
    train_end_ts = model["train_end"].to_pydatetime().timestamp()
    training_center = (train_start_ts + train_end_ts) / 2

    # The flag is set iff the training interval center lies inside the evaluation interval (bounds inclusive).
    request.currently_trained_model = request.interval_start <= training_center <= request.interval_end

In [None]:
# Write results back
# NOTE: this overwrites the original logfile in place with the patched logs (serialized with `by_alias=True`).
pipeline_logfile.write_text(logs.model_dump_json(by_alias=True))

In [None]:

def patch_logfile(path: Path, patched_name: str = "pipeline.patched") -> None:
    """Rebuild the most-recent-model flags of a pipeline logfile and store a patched copy.

    For every ``EVALUATE_MULTI`` stage run, both ``currently_active_model`` and
    ``currently_trained_model`` of its evaluation request are set to whether the center of the
    evaluated model's training interval lies within the evaluation interval (bounds inclusive).
    Previous values of both flags are discarded. The input file itself is not modified; the
    patched logs are written to a sibling file.

    Note: this is not a generic solution, but works for the slicing case with fixed evaluation
    and trigger intervals in the same order of magnitude.

    Args:
        path: Path of the pipeline logfile to patch.
        patched_name: File name of the patched copy, created in ``path.parent``.

    Raises:
        AssertionError: If the model id of an evaluation request does not match exactly one row
            of the models dataframe.
    """
    logs = PipelineLogs.model_validate_json(path.read_text(encoding="utf-8"))
    max_timestamp = logs_dataframe(logs)["sample_time"].max()
    # Only the models dataframe is needed here; the evaluation dataframes are ignored.
    df_models, _, _ = dfs_models_and_evals(logs, max_timestamp)

    for eval_log in logs.supervisor_logs.stage_runs:
        if eval_log.id != PipelineStage.EVALUATE_MULTI.name:
            continue

        request = eval_log.info.eval_request

        model_row = df_models[df_models["id_model"] == request.id_model]
        assert len(model_row) == 1, (
            f"expected exactly one model with id {request.id_model}, found {len(model_row)}"
        )

        model = model_row.iloc[0]
        # NOTE(review): `datetime.timestamp()` interprets naive datetimes as local time — confirm that
        # the dataframe timestamps and the interval bounds of the request share the same reference.
        train_start_ts = model["train_start"].to_pydatetime().timestamp()
        train_end_ts = model["train_end"].to_pydatetime().timestamp()
        training_center = (train_start_ts + train_end_ts) / 2

        # Evaluate the interval check once and rebuild both flags from it.
        center_in_interval = request.interval_start <= training_center <= request.interval_end
        request.currently_active_model = center_in_interval
        request.currently_trained_model = center_in_interval

    patched_path = path.parent / patched_name
    patched_path.write_text(logs.model_dump_json(by_alias=True), encoding="utf-8")

In [None]:
# Root directory that is searched recursively for `pipeline.log` files.
# NOTE(review): machine-specific absolute path — adjust before running.
log_dir = Path("/Users/mboether/phd/dynamic-data/sigmod-data/cglm-landmark/data_selection_50%/logs")
# Keep only the logfiles whose directory also contains an entry named `snapshot`.
logfiles = [logfile for logfile in log_dir.glob("**/pipeline.log") if (logfile.parent / "snapshot").exists()]

In [None]:
from tqdm import tqdm
# Patch every collected logfile while showing a progress bar; each call writes a patched copy
# next to the respective logfile.
for logfile in tqdm(logfiles):
    patch_logfile(logfile)

In [None]:
# Reduced view of the models dataframe: trigger id, model id and the training interval bounds.
# Note: `df_models` here is the global derived from `pipeline_logfile`, not from the patched logfiles.
models_red = df_models[["trigger_id", "id_model", "train_start", "train_end"]]
models_red

In [None]:
# Reduced view of the evaluation requests: identifiers, evaluation interval bounds, handler and dataset.
eval_red = eval_requests[["trigger_id", "training_idx", "model_idx", "interval_start", "interval_end", "eval_handler", "dataset_id"]]
eval_red

In [None]:
# Join models and evaluation requests on `trigger_id`; the training interval bounds are renamed to
# `first_timestamp` / `last_timestamp`.
df_cross = models_red.merge(eval_red, on="trigger_id").rename(columns={"train_start": "first_timestamp", "train_end": "last_timestamp"})
# Sanity check: the join must yield exactly as many rows as there are evaluation requests.
assert df_cross.shape[0] == eval_red.shape[0]
df_cross

# Adapted logic from handler.py

In [None]:
# df_cross["active_candidate"] = df_cross["last_timestamp"] < df_cross["active_model_trained_before"]

# # find the maximum model for every EvalCandidate that doesn't violate that constraint
# max_model_id = (
#     df_cross[df_cross["active_candidate"]]
#     .groupby("active_model_trained_before")["id_model"]
#     .aggregate(max_model_id="max")
# )

# # combine: a model in the cross product is most recent for a certain interval iff
# #  it has maximum model id for its active_model_trained_before
# df_active_models = df_cross.merge(max_model_id, on="active_model_trained_before", how="left")
# df_active_models["active_model"] = df_active_models["id_model"] == df_active_models["max_model_id"]

# # for a given interval, the currently trained model is the model with the smallest id
# # from all models that have a strictly bigger id than the most recent model. Hence it is the model after the
# # most recent model.
# # For that we first build a model -> successor model mapping:
# model_successor_relation = df_active_models[["id_model"]].drop_duplicates().sort_values(by="id_model")
# model_successor_relation["next_id_model"] = model_successor_relation["id_model"].shift(-1, fill_value=-1)

# # if there's no active model for the first interval(s), we still need to define the next model as the
# # trained model
# model_successor_relation = pd.concat(
#     [
#         model_successor_relation,
#         pd.DataFrame([{"id_model": None, "next_id_model": df_active_models["id_model"].min()}]),
#     ]
# )

# df_trained_models = df_active_models.merge(
#     model_successor_relation, how="left", left_on="max_model_id", right_on="id_model", suffixes=("", "__")
# )
# df_trained_models["trained_model"] = df_trained_models["id_model"] == df_trained_models["next_id_model"]
