In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
import json
from privacypacking.utils.utils import load_logs, global_metrics
import pandas as pd
from experiments.ray.analysis import load_tasks, load_ray_experiment, load_latest_ray_experiment, load_latest_scheduling_results, load_latest_scheduling_results, load_latest_ray_experiment, load_scheduling_queue
import plotly.express as px
from privacypacking.budget.curves import  LaplaceCurve, GaussianCurve, SubsampledGaussianCurve
from privacypacking.budget import Budget, Task, Block
from privacypacking.schedulers.metrics import OverflowRelevance, FlatRelevance
from privacypacking.budget.block_selection import RandomBlocks
from privacypacking.utils.plot import plot_budgets
import yaml
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from collections import defaultdict

In [3]:
import plotly.io as pio
pio.renderers.default = "iframe"

In [4]:
df = load_latest_scheduling_results(alphas=True, tasks_dir="/home/pierre/privacypacking/data/demo_workload/tasks")

/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_DominantShares/1201-140719_aa1bc2.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_BatchOverflowRelevance/1201-140719_410b92.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_DynamicFlatRelevance/1201-140719_d5c13b.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_FlatRelevance/1201-140719_0a63c5.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_Fcfs/1201-140719_1dd3d4.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_VectorizedBatchOverflowRelevance/1201-140718_d98bfe.json


In [5]:
df

Unnamed: 0,alpha,blockid_alpha,epsilon,normalized_epsilon,id,hashed_id,allocated,scheduler,profit,realized_profit,...,scheduling_delay,block,block_selection,totalblocks_scheduler_selection,metric,nblocks_maxeps,T,N,data_lifetime,task
2105,0,000-00,0.000000,0.0,0,69,True,time_based_budget_unlocking,1.0,1.0,...,2.0,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,Fcfs,1-14.726,2,10000,0.01,1-short_bumpy
0,0,000-00,0.000000,0.0,0,69,False,time_based_budget_unlocking,1.0,0.0,...,,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,DominantShares,1-14.726,2,10000,0.01,1-short_bumpy
440,0,000-00,0.000000,0.0,0,69,False,time_based_budget_unlocking,1.0,0.0,...,,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,1-14.726,2,10000,0.01,1-short_bumpy
880,0,000-00,0.000000,0.0,0,69,False,time_based_budget_unlocking,1.0,0.0,...,,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,DynamicFlatRelevance,1-14.726,2,10000,0.01,1-short_bumpy
1320,0,000-00,0.000000,0.0,0,69,False,time_based_budget_unlocking,1.0,0.0,...,,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,FlatRelevance,1-14.726,2,10000,0.01,1-short_bumpy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,64,004-64,14.725883,1.5,61,38,False,time_based_budget_unlocking,1.0,0.0,...,,4,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,1-14.726,2,10000,0.01,1-short_bumpy
1264,64,004-64,14.725883,1.5,61,38,False,time_based_budget_unlocking,1.0,0.0,...,,4,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,DynamicFlatRelevance,1-14.726,2,10000,0.01,1-short_bumpy
1694,64,004-64,14.725883,1.5,61,38,False,time_based_budget_unlocking,1.0,0.0,...,,4,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,FlatRelevance,1-14.726,2,10000,0.01,1-short_bumpy
2104,64,004-64,14.725883,1.5,61,38,False,time_based_budget_unlocking,1.0,0.0,...,,4,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,Fcfs,1-14.726,2,10000,0.01,1-short_bumpy


In [6]:
grouped = df[["id","task","allocated","metric"]].drop_duplicates(subset=['id', 'metric']).groupby(["task","metric"]).agg([np.sum, "count"])
grouped = grouped.reset_index()
grouped["n_allocated"] = grouped["allocated"]["sum"]
grouped["total"] = grouped["allocated"]["count"]
grouped = grouped.drop(["id", "allocated"], axis=1)
grouped["n_rejected"] = grouped["total"] - grouped["n_allocated"]


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



In [7]:
px.bar(grouped[["metric", "n_allocated"]].groupby("metric").sum().reset_index(), 
         x = "metric",
         y = "n_allocated",
         title = 'Total number of tasks allocated per scheduler', 
        # facet_col="metric",
        #     facet_col_wrap=2,
            # height=600,
            width=1000
             )

In [8]:
px.bar(grouped, 
         x = "task",
         y = ["n_allocated", "n_rejected"],
         title = 'Type of task allocated per scheduler', 
        facet_col="metric",
            facet_col_wrap=2,
            height=600,
             )

In [9]:
px.bar(
    # df.query("allocated"),
    # df.query("metric == 'BatchOverflowRelevance'"),
    df, x="blockid_alpha",
    y="normalized_epsilon",
    range_y=[0,20],
#     color="log_id",
    color="task",
    # barmode="group",
    # pattern_shape="allocated",
    facet_col="metric",
    facet_col_wrap=1,
    height=500,
    title="All demands per block and alpha (workload)"
#     animation_frame="id"
)

In [10]:
df.query("allocated")

Unnamed: 0,alpha,blockid_alpha,epsilon,normalized_epsilon,id,hashed_id,allocated,scheduler,profit,realized_profit,...,scheduling_delay,block,block_selection,totalblocks_scheduler_selection,metric,nblocks_maxeps,T,N,data_lifetime,task
2105,0,000-00,0.000000,0.0,0,69,True,time_based_budget_unlocking,1.0,1.0,...,2.000000,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,Fcfs,1-14.726,2,10000,0.01,1-short_bumpy
2110,0,000-00,0.000000,0.0,1,24,True,time_based_budget_unlocking,1.0,1.0,...,1.813939,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,Fcfs,1-14.726,2,10000,0.01,1-short_bumpy
2115,0,000-00,0.000000,0.0,2,12,True,time_based_budget_unlocking,1.0,1.0,...,1.672076,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,Fcfs,1-14.726,2,10000,0.01,1-short_bumpy
385,0,000-00,0.000000,0.0,3,46,True,time_based_budget_unlocking,1.0,1.0,...,1.617505,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,DominantShares,1-4.909,2,10000,0.01,1-short_flat
810,0,000-00,0.000000,0.0,3,46,True,time_based_budget_unlocking,1.0,1.0,...,1.617505,0,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,1-4.909,2,10000,0.01,1-short_flat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749,64,004-64,4.908628,0.5,44,51,True,time_based_budget_unlocking,1.0,1.0,...,0.845748,4,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,FlatRelevance,1-4.909,2,10000,0.01,1-short_flat
874,64,004-64,14.725883,1.5,45,93,True,time_based_budget_unlocking,1.0,1.0,...,0.806413,4,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,1-14.726,2,10000,0.01,1-short_bumpy
2634,64,004-64,14.725883,1.5,45,93,True,time_based_budget_unlocking,1.0,1.0,...,0.806413,4,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,VectorizedBatchOverflowRelevance,1-14.726,2,10000,0.01,1-short_bumpy
879,64,004-64,14.725883,1.5,46,63,True,time_based_budget_unlocking,1.0,1.0,...,0.602028,4,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,1-14.726,2,10000,0.01,1-short_bumpy


In [11]:
px.bar(
    df.query("allocated").sort_values("blockid_alpha"),
    x="blockid_alpha",
    y="normalized_epsilon",
    range_y=[0,3],
    range_x=[0, 5 * 5],
#     color="log_id",
    color="task",
    # barmode="group",
    # pattern_shape="allocated",
    facet_col="metric",
    facet_col_wrap=1,
    height=500,
    title="Allocated tasks per block and alpha"
#     animation_frame="id"
)

In [25]:
df.query("allocated and metric == 'BatchOverflowRelevance' and blockid_alpha == '002-08'")

Unnamed: 0,alpha,blockid_alpha,epsilon,normalized_epsilon,id,hashed_id,allocated,scheduler,profit,realized_profit,...,scheduling_delay,block,block_selection,totalblocks_scheduler_selection,metric,nblocks_maxeps,T,N,data_lifetime,task
838,8,002-08,6.266472,0.75,18,37,True,time_based_budget_unlocking,1.0,1.0,...,1.867502,2,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,1-14.726,2,10000,0.01,1-short_bumpy
843,8,002-08,6.266472,0.75,19,6,True,time_based_budget_unlocking,1.0,1.0,...,1.701315,2,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,1-14.726,2,10000,0.01,1-short_bumpy
853,8,002-08,5.013178,0.6,21,82,True,time_based_budget_unlocking,1.0,1.0,...,1.431739,2,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,2-11.781,2,10000,0.01,2-long_bumpy
848,8,002-08,6.266472,0.75,22,46,True,time_based_budget_unlocking,1.0,1.0,...,1.300868,2,LatestBlocksFirst,5-time_based_budget_unlocking-LatestBlocksFirst,BatchOverflowRelevance,1-14.726,2,10000,0.01,1-short_bumpy


In [13]:
# Inefficient approach first: duplicate the rows for each scheduling time
sdf = df.query("allocated and metric == 'BatchOverflowRelevance' and T == 2")
len(sdf)

70

In [14]:
sdf.scheduling_time.head()

810    2.0
815    2.0
811    2.0
816    2.0
812    2.0
Name: scheduling_time, dtype: float64

In [15]:
px.histogram(sdf, x="scheduling_time", nbins=1000)

In [16]:
scheduling_steps = sdf.scheduling_time.unique()
scheduling_steps

array([2., 4., 6.])

In [17]:
sdf["scheduled_at_or_before"] = sdf.scheduling_time.apply(lambda scheduling_time: [i for i in scheduling_steps if i >= scheduling_time])
sdf = sdf.explode("scheduled_at_or_before")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
px.bar(
    sdf.sort_values(["blockid_alpha", "scheduled_at_or_before","T"]),
    x="blockid_alpha",
    y="normalized_epsilon",
    range_y=[0,3],
    range_x = [0, 20 * 5],
    color="task",
    facet_col="T",
    facet_col_wrap=1,
    height=600,
    width=1200,
    animation_frame="scheduled_at_or_before",
    title="BatchOverflowRelevance allocation mix for different scheduling step sizes<br><sup>Only showing 4 alphas per block</sup>",
)

In [36]:
queue = load_scheduling_queue()
tasks = load_tasks(tasks_dir="/home/pierre/privacypacking/data/demo_workload/tasks")
# top_queue = defaultdict()
# for row in queue.itertuples():
#     for task_id, metric_val in row.ids_and_metrics:
#         print(task_id, metric_val)
def zip_positions(l):
    o = []
    i = 0
    for a,b in l:
        o.append((a,b,i))
        i+=1
    return o

queue["ids_and_metrics"] = queue.ids_and_metrics.apply(zip_positions)
queue.head()
queue = queue.explode("ids_and_metrics")
queue["id"] = queue.ids_and_metrics.apply(lambda x: x[0])
queue["efficiency"] = queue.ids_and_metrics.apply(lambda x: x[1])
queue["position_in_queue"] = queue.ids_and_metrics.apply(lambda x: x[2])

queue = queue.merge(tasks, on="id")

# Cap infinite profits
queue["efficiency"] = queue["efficiency"].apply(lambda x: 1000 if x > 1000 else x)
# px.bar(
#     queue,
#     x="position_in_queue",
#     y="efficiency",
#     range_y=[1e-3, 1e3],
#     log_y=True,
#     # range_x = [0, 20 * 5],
#     color="task",
#     facet_col="T",
#     facet_col_wrap=1,
#     height=600,
#     width=1200,
#     animation_frame="scheduling_time",
# )

/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_DominantShares/1201-140719_aa1bc2.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_BatchOverflowRelevance/1201-140719_410b92.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_DynamicFlatRelevance/1201-140719_d5c13b.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_FlatRelevance/1201-140719_0a63c5.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_Fcfs/1201-140719_1dd3d4.json
/home/pierre/privacypacking/logs/exp_1201-140714/time_based_budget_unlocking_VectorizedBatchOverflowRelevance/1201-140718_d98bfe.json


Warning: the slider doesn't seem reliable (misses some colors)

In [30]:
queue.columns

Index(['scheduling_time', 'iteration_counter', 'ids_and_metrics', 'metric',
       'T', 'N', 'data_lifetime', 'id', 'efficiency', 'position_in_queue',
       'first_block_id', 'n_blocks', 'profit', 'creation_time',
       'nblocks_maxeps', 'task'],
      dtype='object')

In [41]:
queue.first_block_id.unique()

array(['4'], dtype=object)

In [39]:
px.bar(
    queue.query("iteration_counter == 1 and T == 2.0 and metric == 'BatchOverflowRelevance' ").query("iteration_counter == 1 and scheduling_time in [2.0, 4.0, 6.0, 8.0]"),
    x="position_in_queue",
    y="efficiency",
    range_y=[1e-3, 1e3],
    log_y=True,
    hover_name="first_block_id",
    # range_x = [0, 20 * 5],
    color="task",
    facet_col="scheduling_time",
    facet_col_wrap=1,
    height=600,
    width=1200,
    # animation_frame="scheduling_time",
)

Now, let's compare the queue for different schedulers:

In [None]:
df = load_scheduling_queue()
tasks = load_tasks(tasks_dir="/home/pierre/privacypacking/data/mixed_curves/tasks")

In [None]:
queue = df.query("iteration_counter == 1 and T == 1.0 and scheduling_time == 2 ")
# top_queue = defaultdict()
# for row in queue.itertuples():
#     for task_id, metric_val in row.ids_and_metrics:
#         print(task_id, metric_val)
def zip_positions(l):
    o = []
    i = 0
    for a,b in l:
        o.append((a,b,i))
        i+=1
    return o

queue["ids_and_metrics"] = queue.ids_and_metrics.apply(zip_positions)
queue.head()
queue = queue.explode("ids_and_metrics")
queue["id"] = queue.ids_and_metrics.apply(lambda x: x[0])
queue["efficiency"] = queue.ids_and_metrics.apply(lambda x: x[1])
queue["position_in_queue"] = queue.ids_and_metrics.apply(lambda x: x[2])

queue = queue.merge(tasks, on="id")
px.bar(
    queue,
    x="position_in_queue",
    y="efficiency",
    range_y=[1e-3, 1e3],
    log_y=True,
    # range_x = [0, 20 * 5],
    color="task",
    facet_col="metric",
    facet_col_wrap=1,
    height=600,
    width=1200,
    # animation_frame="scheduling_time",
    orientation='v'
)

In [None]:
ddf = load_ray_experiment(Path("/home/pierre/privacypacking/logs/ray/run_and_report_2021-11-29_10-42-15"))
px.line(
    ddf.sort_values("T"),
    x="T",
    y="n_allocated_tasks",
    color="scheduler_metric",
    log_x=True,
    width=1_000,
    title="Allocated tasks depending on the scheduling step size<br><sup>Online mixed curves, 20 blocks, no initial blocks, 100 tasks per block on average, lifetime = 5 blocks, N = 10_000 (i.e. almost continuous unlocking)</sup>"
)

In [None]:
sdf = load_latest_scheduling_results(alphas=True, tasks_dir="/home/pierre/privacypacking/data/mixed_curves/tasks")

In [None]:
sdf.columns

In [None]:
sdf.drop_duplicates(subset=["id", "T", "metric"]).id.nunique()

In [None]:
delay_df = sdf.drop_duplicates(subset=["id", "T", "metric"])
delay_df.head()

In [None]:
px.line(
    delay_df.groupby(["T", "metric"]).mean().reset_index().sort_values("T"),
    x="T",
    y="scheduling_delay",
    color="metric",
    log_x=True,
    width=1_000,
    title="Average delay of allocated tasks, depending on the scheduling step size<br><sup>Online mixed curves, 20 blocks, no initial blocks, 100 tasks per block on average, lifetime = 5 blocks, N = 10_000 (i.e. almost continuous unlocking)<br> FCFS is 'batched FCFS'</sup>"
)

In [None]:
ddf = load_ray_experiment(Path("/home/pierre/privacypacking/logs/ray/run_and_report_2021-11-30_18-08-51"))
px.line(
    ddf.sort_values("T"),
    x="T",
    y="n_allocated_tasks",
    color="scheduler_metric",
    log_x=True,
    width=1_000,
    title="Allocated tasks depending on the scheduling step size<br><sup>Online mixed curves, 20 blocks, no initial blocks, 100 tasks per block on average, lifetime = 5 blocks, N = 10_000 (i.e. almost continuous unlocking)</sup>"
)

In [None]:
ddf = load_ray_experiment(Path("/home/pierre/privacypacking/logs/ray/run_and_report_2021-11-29_22-26-18"))
px.line(
    ddf.sort_values("T"),
    x="T",
    y="n_allocated_tasks",
    color="scheduler_metric",
    log_x=True,
    width=1_000,
    title="Allocated tasks depending on the scheduling step size<br><sup>Online mixed curves, 60 blocks, no initial blocks, 100 tasks per block on average, lifetime = 20 blocks, N = 10_000 (i.e. almost continuous unlocking)</sup>"
)