In [None]:
%load_ext autoreload 
%autoreload 2

In [None]:
import json
from privacypacking.utils.utils import load_logs, global_metrics
import pandas as pd
from experiments.ray.analysis import load_tasks, load_ray_experiment, load_latest_ray_experiment, load_latest_scheduling_results, load_latest_scheduling_results, load_latest_ray_experiment, load_scheduling_queue
import plotly.express as px
from privacypacking.budget.curves import  LaplaceCurve, GaussianCurve, SubsampledGaussianCurve
from privacypacking.budget import Budget, Task, Block
from privacypacking.schedulers.metrics import OverflowRelevance, FlatRelevance
from privacypacking.budget.block_selection import RandomBlocks
from privacypacking.utils.plot import plot_budgets
import yaml
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from collections import defaultdict

In [None]:
import plotly.io as pio
# Use the "iframe" renderer as the default for every Plotly figure shown below.
pio.renderers.default = "iframe"

In [None]:
# Load the scheduling results (with alphas=True) for the demo_workload tasks.
# NOTE(review): tasks_dir is an absolute path on one machine; the notebook will not run elsewhere
# unless this is made configurable.
df = load_latest_scheduling_results(alphas=True, tasks_dir="/home/pierre/privacypacking/data/demo_workload/tasks")

In [None]:
# Display the loaded results frame.
df

In [None]:
# Count allocated vs. rejected tasks for every (task type, scheduler metric) pair.
# A task id can appear on several rows of `df` for the same metric (presumably one row per
# block/alpha -- TODO confirm), so keep a single row per (id, metric) before counting.
grouped = (
    df[["id", "task", "allocated", "metric"]]
    .drop_duplicates(subset=["id", "metric"])
    .groupby(["task", "metric"], as_index=False)
    # Named aggregation keeps the column index flat (no ("allocated", "sum") MultiIndex),
    # aggregates only the column that is needed, and avoids passing np.sum as a callable,
    # which recent pandas versions deprecate in favour of the string "sum".
    .agg(n_allocated=("allocated", "sum"), total=("allocated", "count"))
    .assign(n_rejected=lambda counts: counts["total"] - counts["n_allocated"])
)

In [None]:
# One bar per scheduler metric: n_allocated summed over all task types.
px.bar(
    grouped[["metric", "n_allocated"]].groupby("metric").sum().reset_index(),
    x="metric",
    y="n_allocated",
    title="Total number of tasks allocated per scheduler",
    width=1000,
    # Other options tried: facet_col="metric", facet_col_wrap=2, height=600
)

In [None]:
# Allocated and rejected counts per task type, one facet per scheduler metric.
px.bar(
    grouped,
    x="task",
    y=["n_allocated", "n_rejected"],
    title="Type of task allocated per scheduler",
    facet_col="metric",
    facet_col_wrap=2,
    height=600,
)

In [None]:
# normalized_epsilon per blockid_alpha for every row of one scheduler metric (allocated or not).
# NOTE(review): this filters on 'VectorizedBatchOverflowRelevance' while later cells filter on
# 'BatchOverflowRelevance' -- confirm that this metric name exists in the loaded results.
px.bar(
    df.query("metric == 'VectorizedBatchOverflowRelevance'"),  # alternative: df.query("allocated")
    x="blockid_alpha",
    y="normalized_epsilon",
    range_y=[0, 20],
    color="task",  # alternative: color="log_id"
    facet_col_wrap=1,
    height=500,
    title="All demands per block and alpha (workload)",
    # Other options tried: barmode="group", pattern_shape="allocated",
    # facet_col="metric", animation_frame="id"
)

In [None]:
# Display only the rows of df where `allocated` is truthy.
df.query("allocated")

In [None]:
# normalized_epsilon of allocated rows per blockid_alpha, one facet row per scheduler metric.
px.bar(
    df.query("allocated").sort_values("blockid_alpha"),
    x="blockid_alpha",
    y="normalized_epsilon",
    range_y=[0, 3],
    range_x=[0, 5 * 5],
    color="task",  # alternative: color="log_id"
    facet_col="metric",
    facet_col_wrap=1,
    height=500,
    title="Allocated tasks per block and alpha",
    # Other options tried: barmode="group", pattern_shape="allocated", animation_frame="id"
)

In [None]:
# Same view restricted to BatchOverflowRelevance, animated over allocation_index.
px.bar(
    df.query("allocated and metric == 'BatchOverflowRelevance'").sort_values(
        ["allocation_index", "blockid_alpha"]
    ),
    x="blockid_alpha",
    y="normalized_epsilon",
    range_y=[0, 3],
    range_x=[0, 5 * 5],
    color="task",  # alternative: color="log_id"
    height=500,
    title="Allocated tasks per block and alpha",
    animation_frame="allocation_index",
    # Other options tried: barmode="group", pattern_shape="allocated",
    # facet_col="metric", facet_col_wrap=1
)

Open question: why does overflow relevance sometimes allocate short flat tasks instead of bumpy tasks?

In [None]:
# Inspect the allocated BatchOverflowRelevance rows whose blockid_alpha is '002-08'.
df.query("allocated and metric == 'BatchOverflowRelevance' and blockid_alpha == '002-08'")

In [None]:
# Inefficient approach first: duplicate the rows for each scheduling time
# Start from the allocated rows of the BatchOverflowRelevance metric; the cell displays their count.
sdf = df.query("allocated and metric == 'BatchOverflowRelevance'")
len(sdf)

In [None]:
# Peek at the first scheduling_time values.
sdf.scheduling_time.head()

In [None]:
# Distribution of scheduling_time over the selected rows (up to 1000 bins).
px.histogram(sdf, x="scheduling_time", nbins=1000)

In [None]:
# Distinct scheduling_time values found in sdf, in order of first appearance; displayed for inspection.
scheduling_steps = sdf.scheduling_time.unique()
scheduling_steps

In [None]:
# For each row, list every scheduling step that is >= the row's own scheduling_time, then
# explode so the row is repeated once per such step (cumulative view at each step).
# assign() builds a new frame instead of writing a column into `sdf`, which is the result of
# df.query(...); writing into that slice triggers pandas' SettingWithCopyWarning.
# NOTE: `sdf` is overwritten, so re-running this cell without re-creating `sdf` first
# multiplies the rows again.
sdf = sdf.assign(
    scheduled_at_or_before=sdf.scheduling_time.apply(
        lambda scheduling_time: [step for step in scheduling_steps if step >= scheduling_time]
    )
).explode("scheduled_at_or_before")

In [None]:
# normalized_epsilon per blockid_alpha, one facet row per scheduled_at_or_before value.
px.bar(
    sdf.sort_values(["blockid_alpha", "scheduled_at_or_before", "T"]),
    x="blockid_alpha",
    y="normalized_epsilon",
    range_y=[0, 3],
    range_x=[0, 20 * 5],
    color="task",
    facet_col="scheduled_at_or_before",
    facet_col_wrap=1,
    height=600,
    width=1200,
    title="BatchOverflowRelevance allocation mix for different scheduling step sizes<br><sup>Only showing 4 alphas per block</sup>",
    # alternative: animation_frame="scheduled_at_or_before"
)

Why on earth are the blocks not ordered properly?

In [None]:
# Load the scheduling queue and the tasks of the demo_workload directory.
# NOTE(review): tasks_dir is an absolute path on one machine; consider making it configurable.
queue = load_scheduling_queue()
tasks = load_tasks(tasks_dir="/home/pierre/privacypacking/data/demo_workload/tasks")
# top_queue = defaultdict()
# for row in queue.itertuples():
#     for task_id, metric_val in row.ids_and_metrics:
#         print(task_id, metric_val)
def zip_positions(l):
    """Append to each pair its position in the input sequence.

    Args:
        l: iterable of 2-item entries ``(a, b)``.

    Returns:
        A list of ``(a, b, index)`` tuples, where ``index`` counts from 0 in
        iteration order. An empty input gives an empty list.
    """
    return [(a, b, position) for position, (a, b) in enumerate(l)]

# Tag every (task id, metric value) entry with its rank in the queue, then explode so that
# each queued task gets its own row.
queue["ids_and_metrics"] = queue.ids_and_metrics.apply(zip_positions)
queue = queue.explode("ids_and_metrics")

# Unpack the (id, efficiency, position) triples into separate columns.
queue["id"] = queue.ids_and_metrics.apply(lambda entry: entry[0])
queue["efficiency"] = queue.ids_and_metrics.apply(lambda entry: entry[1])
queue["position_in_queue"] = queue.ids_and_metrics.apply(lambda entry: entry[2])

# Attach the task attributes to every queue entry.
queue = queue.merge(tasks, on="id")

# Cap infinite profits (anything above 1000 becomes 1000) so they can be plotted.
queue["efficiency"] = queue["efficiency"].clip(upper=1000)
# px.bar(
#     queue,
#     x="position_in_queue",
#     y="efficiency",
#     range_y=[1e-3, 1e3],
#     log_y=True,
#     # range_x = [0, 20 * 5],
#     color="task",
#     facet_col="T",
#     facet_col_wrap=1,
#     height=600,
#     width=1200,
#     animation_frame="scheduling_time",
# )

Warning: the animation slider doesn't seem reliable (some colors are missing from the plot).

In [None]:
# List the columns available on the merged queue frame.
queue.columns

In [None]:
# Distinct first_block_id values present in the queue.
queue.first_block_id.unique()

In [None]:
# Task id at each queue position for BatchOverflowRelevance, faceted by scheduling_time and
# animated over iteration_counter.
px.bar(
    queue.query("metric == 'BatchOverflowRelevance'"),
    x="position_in_queue",
    y="id",
    hover_name="first_block_id",
    color="task",
    facet_col="scheduling_time",
    facet_col_wrap=1,
    height=600,
    width=1200,
    animation_frame="iteration_counter",
    # Other options tried: range_y=[1e-3, 1e3], log_y=True, range_x=[0, 20 * 5]
)

In [None]:
# Same allocation-mix figure as the earlier cell, repeated here next to the queue plot.
px.bar(
    sdf.sort_values(["blockid_alpha", "scheduled_at_or_before", "T"]),
    x="blockid_alpha",
    y="normalized_epsilon",
    range_y=[0, 3],
    range_x=[0, 20 * 5],
    color="task",
    facet_col="scheduled_at_or_before",
    facet_col_wrap=1,
    height=600,
    width=1200,
    title="BatchOverflowRelevance allocation mix for different scheduling step sizes<br><sup>Only showing 4 alphas per block</sup>",
    # alternative: animation_frame="scheduled_at_or_before"
)

In [None]:
# TODO: show the allocation iteration by iteration. Then, understand where infinite metrics come from, and why tasks with an infinite metric are not scheduled!

Now, let's compare the queue for different schedulers:

In [None]:
# Load the scheduling queue and the tasks of the mixed_curves directory.
# NOTE(review): this rebinds `df`, which earlier cells use for the scheduling results, to the
# queue frame; re-running those earlier cells after this one will pick up the wrong data.
# NOTE(review): tasks_dir is an absolute path on one machine; consider making it configurable.
df = load_scheduling_queue()
tasks = load_tasks(tasks_dir="/home/pierre/privacypacking/data/mixed_curves/tasks")

In [None]:
# Keep only the queue rows with iteration_counter == 1, T == 1.0 and scheduling_time == 2.
queue = df.query("iteration_counter == 1 and T == 1.0 and scheduling_time == 2 ")
# top_queue = defaultdict()
# for row in queue.itertuples():
#     for task_id, metric_val in row.ids_and_metrics:
#         print(task_id, metric_val)
def zip_positions(l):
    """Append to each pair its position in the input sequence.

    NOTE: this repeats the definition from an earlier cell of this notebook;
    both should stay identical.

    Args:
        l: iterable of 2-item entries ``(a, b)``.

    Returns:
        A list of ``(a, b, index)`` tuples, where ``index`` counts from 0 in
        iteration order. An empty input gives an empty list.
    """
    return [(a, b, position) for position, (a, b) in enumerate(l)]

# Tag every (task id, metric value) entry with its rank in the queue, then explode so that
# each queued task gets its own row.
# assign() builds a new frame instead of writing a column into `queue`, which at this point is
# the result of df.query(...); writing into that slice triggers pandas' SettingWithCopyWarning.
queue = queue.assign(
    ids_and_metrics=queue.ids_and_metrics.apply(zip_positions)
).explode("ids_and_metrics")

# Unpack the (id, efficiency, position) triples into separate columns.
queue["id"] = queue.ids_and_metrics.apply(lambda entry: entry[0])
queue["efficiency"] = queue.ids_and_metrics.apply(lambda entry: entry[1])
queue["position_in_queue"] = queue.ids_and_metrics.apply(lambda entry: entry[2])

# Attach the task attributes to every queue entry.
queue = queue.merge(tasks, on="id")

# Efficiency by queue position on a log scale, one facet row per scheduler metric.
px.bar(
    queue,
    x="position_in_queue",
    y="efficiency",
    range_y=[1e-3, 1e3],
    log_y=True,
    color="task",
    facet_col="metric",
    facet_col_wrap=1,
    height=600,
    width=1200,
    orientation="v",
    # Other options tried: range_x=[0, 20 * 5], animation_frame="scheduling_time"
)

In [None]:
# Load one specific Ray run and plot n_allocated_tasks against T (log x-axis), one line per
# scheduler_metric.
# NOTE(review): the run directory is an absolute path on one machine; consider making it configurable.
ddf = load_ray_experiment(Path("/home/pierre/privacypacking/logs/ray/run_and_report_2021-11-29_10-42-15"))
px.line(
    ddf.sort_values("T"),
    x="T",
    y="n_allocated_tasks",
    color="scheduler_metric",
    log_x=True,
    width=1_000,
    title="Allocated tasks depending on the scheduling step size<br><sup>Online mixed curves, 20 blocks, no initial blocks, 100 tasks per block on average, lifetime = 5 blocks, N = 10_000 (i.e. almost continuous unlocking)</sup>"
)

In [None]:
# Load the scheduling results (with alphas=True) for the mixed_curves tasks.
# NOTE(review): this rebinds `sdf`, which earlier cells use for the exploded demo-workload rows.
# NOTE(review): tasks_dir is an absolute path on one machine; consider making it configurable.
sdf = load_latest_scheduling_results(alphas=True, tasks_dir="/home/pierre/privacypacking/data/mixed_curves/tasks")

In [None]:
# List the columns available on the reloaded results frame.
sdf.columns

In [None]:
# Number of distinct task ids once a single row is kept per (id, T, metric).
sdf.drop_duplicates(subset=["id", "T", "metric"]).id.nunique()

In [None]:
# Keep a single row per (id, T, metric) so each task is counted once per configuration
# in the delay plot; show the first rows.
delay_df = sdf.drop_duplicates(subset=["id", "T", "metric"])
delay_df.head()

In [None]:
# Mean scheduling_delay per (T, metric), plotted against T on a log x-axis.
# Only `scheduling_delay` is averaged: calling .mean() on the whole grouped frame raises a
# TypeError on the non-numeric columns in pandas >= 2.0 (earlier versions silently dropped them).
px.line(
    delay_df.groupby(["T", "metric"], as_index=False)["scheduling_delay"].mean().sort_values("T"),
    x="T",
    y="scheduling_delay",
    color="metric",
    log_x=True,
    width=1_000,
    title="Average delay of allocated tasks, depending on the scheduling step size<br><sup>Online mixed curves, 20 blocks, no initial blocks, 100 tasks per block on average, lifetime = 5 blocks, N = 10_000 (i.e. almost continuous unlocking)<br> FCFS is 'batched FCFS'</sup>"
)

In [None]:
# Same plot as for the earlier Ray run, for the run of 2021-11-30 18:08:51:
# n_allocated_tasks against T (log x-axis), one line per scheduler_metric.
# NOTE(review): the run directory is an absolute path on one machine; consider making it configurable.
ddf = load_ray_experiment(Path("/home/pierre/privacypacking/logs/ray/run_and_report_2021-11-30_18-08-51"))
px.line(
    ddf.sort_values("T"),
    x="T",
    y="n_allocated_tasks",
    color="scheduler_metric",
    log_x=True,
    width=1_000,
    title="Allocated tasks depending on the scheduling step size<br><sup>Online mixed curves, 20 blocks, no initial blocks, 100 tasks per block on average, lifetime = 5 blocks, N = 10_000 (i.e. almost continuous unlocking)</sup>"
)

In [None]:
# Same plot for the run of 2021-11-29 22:26:18 (per the title: 60 blocks, lifetime = 20 blocks):
# n_allocated_tasks against T (log x-axis), one line per scheduler_metric.
# NOTE(review): the run directory is an absolute path on one machine; consider making it configurable.
ddf = load_ray_experiment(Path("/home/pierre/privacypacking/logs/ray/run_and_report_2021-11-29_22-26-18"))
px.line(
    ddf.sort_values("T"),
    x="T",
    y="n_allocated_tasks",
    color="scheduler_metric",
    log_x=True,
    width=1_000,
    title="Allocated tasks depending on the scheduling step size<br><sup>Online mixed curves, 60 blocks, no initial blocks, 100 tasks per block on average, lifetime = 20 blocks, N = 10_000 (i.e. almost continuous unlocking)</sup>"
)