In [None]:
%load_ext autoreload 
%autoreload 2

In [None]:
import json
from privacypacking.utils.utils import load_logs
import pandas as pd
from experiments.ray.analysis import load_ray_experiment, load_latest_ray_experiment, load_latest_scheduling_results, load_latest_scheduling_results, load_latest_ray_experiment
import plotly.express as px
from privacypacking.budget.curves import  LaplaceCurve, GaussianCurve, SubsampledGaussianCurve
from privacypacking.budget import Budget, Task, Block
from privacypacking.schedulers.metrics import OverflowRelevance, FlatRelevance
from privacypacking.budget.block_selection import RandomBlocks
from privacypacking.utils.plot import plot_budgets
import yaml
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from experiments.ray.analysis import get_percentiles

In [None]:
# Experiment whose scheduling results are analysed in the first cells below.
# Earlier run, kept for reference: "ray/run_and_report_2022-04-06_17-27-02"
DF_EXPNAME = "ray/run_and_report_2022-04-06_22-35-08"
df = load_latest_scheduling_results(expname=DF_EXPNAME)


In [None]:
# Keep a copy of the frame as loaded, then rebind `df` to the rows of a single
# configuration (data_lifetime, max_blocks and T pinned to fixed values).
# NOTE(review): run this once after loading `df`; a second run would copy the
# already-filtered frame into `original_df`.
single_config_filter = "data_lifetime == 100 and max_blocks == 100 and T == 10"
original_df = df.copy()
df = original_df.query(single_config_filter)

In [None]:
# Empirical CDF of n_blocks over the rows of `df` that are flagged `allocated`
# and whose scheduler_metric is 'ArgmaxKnapsack'.
knapsack_allocated = df.query("scheduler_metric == 'ArgmaxKnapsack' and allocated")
px.ecdf(data_frame=knapsack_allocated, x="n_blocks")

In [None]:
# Empirical CDF of n_blocks over the rows of `df` that are flagged `allocated`
# and whose scheduler_metric is 'DominantShares'.
dominant_shares_allocated = df.query("scheduler_metric == 'DominantShares' and allocated")
px.ecdf(data_frame=dominant_shares_allocated, x="n_blocks")

Most allocated tasks have fewer than 10 blocks. Is that because we have T = 10? The data lifetime is way too small: there is basically no unlocking, so after T = 10 all the previous budget is unlocked and eaten up, leaving not much space for anything else. Result: DPF does not fall into multi-block traps.

Conclusion: you should scale up everything.

In [None]:
# Empirical CDF of n_blocks over every row of `df`, whatever the scheduler
# metric and whether or not the row is flagged `allocated`.
px.ecdf(data_frame=df, x="n_blocks")

In [None]:
# CSV of privacy tasks read into a DataFrame.
# NOTE(review): absolute path tied to one machine; consider deriving it from a
# configurable data directory.
privacy_tasks_csv = "/home/pierre/privacypacking/data/alibaba-privacy-workload/outputs/privacy_tasks.csv"
tasks_df = pd.read_csv(privacy_tasks_csv)

In [None]:
# Peek at the first two rows to check the columns that were loaded.
tasks_df.head(n=2)

In [None]:
def listify(row):
    """Parse the string form of a flat list of numbers into a list of floats.

    Args:
        row: Text such as ``"[1.0, 2.5, inf]"``. Surrounding whitespace and
            any leading/trailing square brackets are ignored; items are
            separated by commas, with or without spaces around them.

    Returns:
        A list of floats, in the order they appear. ``"[]"`` (or any input
        that is empty once brackets and whitespace are removed) gives ``[]``.

    Raises:
        ValueError: If an item cannot be converted by ``float`` (this includes
            an empty item, as in ``"[1,,2]"``).
    """
    inner = row.strip().strip("][").strip()
    if not inner:
        # An empty list has no items; splitting "" would yield [""] and make
        # float("") raise.
        return []
    # float() tolerates whitespace around each item, so splitting on the bare
    # comma accepts both "1, 2" and "1,2".
    return [float(item) for item in inner.split(",")]
# Turn the three list-valued columns from their string form into Python lists
# of floats via listify. Only string cells are parsed: values that are already
# lists pass through unchanged, so re-running this cell is a no-op instead of
# failing with AttributeError ('list' object has no attribute 'strip').
# Non-string cells (e.g. missing values) are likewise left as they are.
for _list_column in ("alphas", "rdp_epsilons", "normalized_rdp_epsilons"):
    tasks_df[_list_column] = tasks_df[_list_column].apply(
        lambda value: listify(value) if isinstance(value, str) else value
    )

In [None]:
# Smallest and largest entry of each row's normalized_rdp_epsilons list,
# stored as two new scalar columns.
normalized_eps = tasks_df["normalized_rdp_epsilons"]
tasks_df["epsilon_min"] = normalized_eps.apply(min)
tasks_df["epsilon_max"] = normalized_eps.apply(max)

In [None]:
# Summary statistics of tasks_df (pandas' default describe() output).
tasks_df.describe()

In [None]:
# n_blocks against epsilon_max (the largest normalized RDP epsilon of each
# row), one point per row of tasks_df.
px.scatter(data_frame=tasks_df, y="n_blocks", x="epsilon_max")

In [None]:
# n_blocks against epsilon_min (the smallest normalized RDP epsilon of each
# row), one point per row of tasks_df.
px.scatter(data_frame=tasks_df, y="n_blocks", x="epsilon_min")

In [None]:
# Log directory of the experiment to load into `rdf`.
# Previous run, kept for reference:
#   /home/pierre/privacypacking/logs/ray/run_and_report_2022-04-06_22-35-08
# NOTE(review): absolute path tied to one machine; consider deriving it from a
# configurable logs directory.
experiment_dir = Path("/home/pierre/privacypacking/logs/ray/run_and_report_2022-04-07_08-24-11")
rdf = load_ray_experiment(experiment_dir)


In [None]:
# n_allocated_tasks as a function of T (log-scaled x axis), one line per
# scheduler_metric, faceted by data_lifetime (columns) and max_blocks (rows).
#
# Earlier variants, kept for reference:
#   rdf = load_ray_experiment(Path("/home/pierre/privacypacking/logs/ray/run_and_report_2022-04-06_17-27-02"))
#   rdf["scheduler_metric"] = rdf.apply(lambda row: row.scheduler_metric if row.scheduler == "basic_scheduler" else "Simplex", axis=1)
#   rdf.query("data_lifetime == 50 and max_blocks == 400").sort_values("T")
fig = px.line(
    data_frame=rdf.sort_values(by=["data_lifetime", "max_blocks", "T"]),
    x="T",
    y="n_allocated_tasks",
    color="scheduler_metric",
    facet_row="max_blocks",
    facet_col="data_lifetime",
    markers=True,
    log_x=True,
    height=600,
    width=800,
).update_yaxes(rangemode="tozero")  # update_yaxes hands back the same figure
fig

In [None]:
# Scheduling results for the same experiment whose logs are loaded into `rdf`.
SDF_EXPNAME = "ray/run_and_report_2022-04-07_08-24-11"
sdf = load_latest_scheduling_results(expname=SDF_EXPNAME)


In [None]:
# Mean and standard deviation of n_blocks over the rows flagged `allocated`,
# per (data_lifetime, max_blocks, scheduler_metric, T) group.
# A single groupby with named aggregation produces both statistics at once,
# instead of running the same groupby twice and merging the two results on
# the group keys. The resulting columns are the four keys, avg_n_blocks and
# std_n_blocks; std_n_blocks is NaN for groups that contain a single row.
m = (
    sdf.query("allocated")
    .groupby(["data_lifetime", "max_blocks", "scheduler_metric", "T"])
    .agg(
        avg_n_blocks=("n_blocks", "mean"),
        std_n_blocks=("n_blocks", "std"),
    )
    .reset_index()
)

In [None]:
# avg_n_blocks as a function of T (log-scaled x axis) with std_n_blocks as
# vertical error bars, one line per scheduler_metric, faceted by data_lifetime
# (columns) and max_blocks (rows). Plots whatever summary `m` currently holds.
#
# Earlier variant, kept for reference:
#   rdf.query("data_lifetime == 50 and max_blocks == 400").sort_values("T")
fig = px.line(
    data_frame=m.sort_values(by=["data_lifetime", "max_blocks", "T"]),
    x="T",
    y="avg_n_blocks",
    error_y="std_n_blocks",
    color="scheduler_metric",
    facet_row="max_blocks",
    facet_col="data_lifetime",
    markers=True,
    log_x=True,
    height=600,
    width=800,
)
# Anchor every y axis at zero so the facets are comparable.
fig.update_yaxes(rangemode="tozero")
fig

Surprisingly, FCFS allocates small tasks (in number of blocks). Probably because it is wasting so much space with poor RDP packing that it has no choice but to use small tasks?

In [None]:
# Mean and standard deviation of n_blocks over ALL rows of sdf (allocated or
# not), per (data_lifetime, max_blocks, scheduler_metric, T) group.
# A single groupby with named aggregation produces both statistics at once,
# instead of running the same groupby twice and merging the two results on
# the group keys. The resulting columns are the four keys, avg_n_blocks and
# std_n_blocks; std_n_blocks is NaN for groups that contain a single row.
# Note: this rebinds `m`, replacing any summary computed earlier under that name.
m = (
    sdf.groupby(["data_lifetime", "max_blocks", "scheduler_metric", "T"])
    .agg(
        avg_n_blocks=("n_blocks", "mean"),
        std_n_blocks=("n_blocks", "std"),
    )
    .reset_index()
)

The task distribution changes when we change the max number of blocks: when we have only 50 blocks, we drop tasks that ask for more than the number of blocks at generation time, because they have no chance of being allocated.

In [None]:
# Same figure layout as the earlier avg_n_blocks plot, drawn from whatever
# summary `m` currently holds: avg_n_blocks against T (log-scaled x axis),
# std_n_blocks as error bars, one line per scheduler_metric, faceted by
# data_lifetime (columns) and max_blocks (rows).
#
# Earlier variant, kept for reference:
#   rdf.query("data_lifetime == 50 and max_blocks == 400").sort_values("T")
summary_sorted_by_T = m.sort_values(by=["data_lifetime", "max_blocks", "T"])
fig = px.line(
    summary_sorted_by_T,
    x="T",
    y="avg_n_blocks",
    error_y="std_n_blocks",
    color="scheduler_metric",
    facet_col="data_lifetime",
    facet_row="max_blocks",
    log_x=True,
    markers=True,
    width=800,
    height=600,
)
# Anchor every y axis at zero so the facets are comparable.
fig.update_yaxes(rangemode="tozero")
fig