In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [None]:
import json
from privacypacking.utils.utils import load_logs
import pandas as pd
from experiments.ray.analysis import load_tasks, load_ray_experiment, load_latest_ray_experiment, load_latest_scheduling_results, load_latest_scheduling_results, load_latest_ray_experiment, load_scheduling_queue
import plotly.express as px
from privacypacking.budget.curves import  LaplaceCurve, GaussianCurve, SubsampledGaussianCurve,SubsampledLaplaceCurve, SyntheticPolynomialCurve
from privacypacking.budget import Budget, Task, Block
from privacypacking.schedulers.metrics import OverflowRelevance, FlatRelevance
from privacypacking.budget.block_selection import RandomBlocks
from privacypacking.utils.plot import plot_budgets
import yaml
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

from omegaconf import OmegaConf
from pathlib import Path
from collections import defaultdict
from privacypacking.budget import ALPHAS
from privacypacking.utils.zoo import zoo_df, build_zoo, build_synthetic_zoo


In [None]:
# Build a Budget from an (epsilon=10, delta=1e-7) pair and show what `dump()`
# returns as the cell output.
block = Budget.from_epsilon_delta(epsilon=10, delta=1e-7)
block.dump()

In [None]:
from itertools import product

In [None]:
# Values combined pairwise as constructor arguments for the composed curves.
r = [0.05, 0.1, 0.5, 1, 2, 3, 4, 5]
scale_pairs = list(product(r, r))

# Laplace + Gaussian compositions first, then Laplace + Laplace compositions.
names_and_curves = [
    (f"l{i}g{j}", LaplaceCurve(i) + GaussianCurve(j)) for i, j in scale_pairs
]
names_and_curves += [
    (f"l{i}l{j}", LaplaceCurve(i) + LaplaceCurve(j)) for i, j in scale_pairs
]

# Subsampled Gaussian curves: every (q, s) combination for each k, with k / q
# passed as the third constructor argument.
subsampled_grid = list(
    product([0.001, 0.01, 0.05, 0.1, 0.2, 0.5], [0.1, 0.5, 1, 2])
)
for k in [1, 10, 100, 200]:
    names_and_curves += [
        (f"sg-q{q}s{s}-k{k}", SubsampledGaussianCurve(q, s, k / q))
        for q, s in subsampled_grid
    ]

alphas_df, tasks_df = zoo_df(names_and_curves, delta=1e-7)

# Every curve on log-log axes, colored by task name.
px.line(
    data_frame=alphas_df.sort_values(by="alphas"),
    x="alphas",
    y="normalized_epsilons",
    color="task_name",
    log_x=True,
    log_y=True,
)

In [None]:
# Keep only the rows where a task's alpha equals its best alpha, then plot the
# normalized epsilon of those rows on log-log axes.
px.scatter(
    data_frame=alphas_df.loc[lambda frame: frame["alphas"] == frame["best_alpha"]],
    x="alphas",
    y="normalized_epsilons",
    color="task_id",
    title="Epsilon for the best alpha of each task",
    log_x=True,
    log_y=True,
)

In [None]:
# Same best-alpha rows as above in spirit: one filter, then the `epsilon_range`
# column plotted against alpha on log-log axes.
px.scatter(
    data_frame=alphas_df.loc[lambda frame: frame["alphas"] == frame["best_alpha"]],
    x="alphas",
    y="epsilon_range",
    color="task_id",
    title="Range by best alpha",
    log_x=True,
    log_y=True,
)

In [None]:
# Per best-alpha bin: mean and standard deviation of the normalized epsilon
# taken at each task's best alpha. Both statistics come from one named
# aggregation instead of two identical groupbys merged back together.
offset = (
    alphas_df.query("alphas == best_alpha")
    .groupby("alphas")["normalized_epsilons"]
    .agg(epsilon_min_avg="mean", epsilon_min_std="std")
    .reset_index()
    .rename(columns={"alphas": "best_alpha"})
)
offset

In [None]:
# Attach the per-bin statistics to every row; the join uses whatever columns
# the two frames have in common.
alphas_df = pd.merge(alphas_df, offset)
alphas_df.head()

In [None]:
# Spot-check the rows of tasks 90 and 91.
alphas_df[alphas_df["task_id"].isin([90, 91])]

In [None]:
# Target mean / standard deviation for the minimum epsilon of the curves.
epsilon_min_avg = 0.5
epsilon_min_std = 0.1

# Spread term: the task's z-score inside its best-alpha bin, scaled to the
# target std. The per-bin std is NaN for a bin holding a single task and 0 for
# a bin of identical tasks; dividing by it would turn the whole curve into
# NaN, so such tasks get a zero spread term (they land on the target mean).
bin_std = alphas_df["epsilon_min_std"]
spread_term = (
    epsilon_min_std
    * (alphas_df["epsilon_min"] - alphas_df["epsilon_min_avg"])
    / bin_std
).where(bin_std > 0, 0.0)

rescaled = alphas_df.copy()
# Shift each whole curve vertically so that its minimum moves to
# `epsilon_min_avg` plus the spread term.
rescaled["normalized_epsilons"] = (
    alphas_df["normalized_epsilons"]
    + (epsilon_min_avg - alphas_df["epsilon_min"])
    + spread_term
)

In [None]:
# Spot-check the shifted curves of tasks 1 and 2.
rescaled[rescaled["task_id"].isin([1, 2])]

In [None]:
# Shifted epsilon at each task's best alpha (linear y axis, log x axis).
px.scatter(
    data_frame=rescaled.loc[lambda frame: frame["alphas"] == frame["best_alpha"]],
    x="alphas",
    y="normalized_epsilons",
    color="task_id",
    title="Normalized eps by best alpha",
    log_x=True,
)

In [None]:
# Row count of the shifted frame.
rescaled.shape[0]

In [None]:
# Shifted curves on log-log axes, colored by task name.
px.line(
    data_frame=rescaled.sort_values(by="alphas"),
    x="alphas",
    y="normalized_epsilons",
    color="task_name",
    log_x=True,
    log_y=True,
)

In [None]:
# Spot-check the unshifted rows of task 1.
alphas_df[alphas_df["task_id"] == 1]

In [None]:
# Per-task statistics of the shifted curves:
#   - min / max of the normalized epsilon restricted to alphas 4, 5, 6 and 8,
#   - min of the normalized epsilon over all alphas,
#   - `epsilon_range`, the max - min spread inside that alpha window.
# Aggregations are named by string ("min" / "max"): passing the Python
# builtins to `agg` is deprecated in pandas.
window_stats = (
    rescaled[rescaled["alphas"].isin([4, 5, 6, 8])]
    .groupby("task_id")["normalized_epsilons"]
    .agg(epsilon_range_min="min", epsilon_range_max="max")
    .reset_index()
)
overall_min = (
    rescaled.groupby("task_id")["normalized_epsilons"]
    .agg("min")
    .reset_index()
    .rename(columns={"normalized_epsilons": "epsilon_min"})
)
# Inner join, so tasks without any row in the alpha window are left out.
ranges = window_stats.merge(overall_min, on="task_id")

ranges["epsilon_range"] = ranges["epsilon_range_max"] - ranges["epsilon_range_min"]

ranges

In [None]:
# Summary statistics of the per-task range table.
ranges.describe()

In [None]:
# rescaled["epsilon_range"] = (
#     rescaled.query("alphas in [4,5,6,8]")
#     .groupby("task_id")["normalized_epsilons"]
#     .agg(max)
# ) - (
#     rescaled.query("alphas in [4,5,6,8]")
#     .groupby("task_id")["normalized_epsilons"]
#     .agg(min)
# )

In [None]:
# Replace the per-task columns that became obsolete after rescaling with the
# freshly computed ones from `ranges`. Every non-key column of `ranges` is
# dropped first (when present), so re-running this cell does not create
# `_x` / `_y` suffixed duplicates.
stale_columns = [column for column in ranges.columns if column != "task_id"]
rescaled = rescaled.drop(columns=stale_columns, errors="ignore").merge(
    ranges, on="task_id"
)

In [None]:
# Inspect the frame after the recomputed per-task ranges were merged in.
rescaled

In [None]:
# Per best-alpha bin: mean and standard deviation of `epsilon_range`. Rows are
# restricted to alphas == 3 before grouping.
# NOTE(review): presumably this keeps exactly one row per task - confirm that
# every task has a row at alpha 3.
# The old rename key "alphas" was dropped: no such column exists after
# grouping by "best_alpha".
offset_range = (
    rescaled.query("alphas == 3")
    .groupby("best_alpha")["epsilon_range"]
    .agg(epsilon_range_avg="mean", epsilon_range_std="std")
    .reset_index()
)
offset_range

In [None]:
# Attach the per-bin range statistics to every row (join on shared columns).
rescaled = pd.merge(rescaled, offset_range)

In [None]:
# rescaled_with_range = rescaled.copy()
# rescaled_with_range["new_range"] = -1 + 

In [None]:
# Target mean / standard deviation for the per-task epsilon range.
range_avg = 0.05
range_std = 0.03

rescaled_with_range = rescaled.copy()

# Spread term: z-score of the task's range inside its best-alpha bin, scaled
# to the target std. The per-bin std is NaN for single-task bins and 0 for
# bins of identical ranges; those tasks get a zero spread term instead of NaN.
bin_range_std = rescaled_with_range["epsilon_range_std"]
range_spread = (
    range_std
    * (rescaled_with_range["epsilon_range"] - rescaled_with_range["epsilon_range_avg"])
    / bin_range_std
).where(bin_range_std > 0, 0.0)
rescaled_with_range["new_range"] = range_avg + range_spread

# Stretch each curve around its minimum so that its range becomes `new_range`.
# NOTE(review): `new_range` is not clipped, so it goes non-positive for tasks
# far below their bin average, and a task whose `epsilon_range` is 0 still
# divides by zero here - decide how such tasks should be handled.
rescaled_with_range["normalized_epsilons"] = rescaled_with_range["epsilon_min"] + (
    rescaled_with_range["new_range"] / rescaled_with_range["epsilon_range"]
) * (rescaled_with_range["normalized_epsilons"] - rescaled_with_range["epsilon_min"])

In [None]:
# Target range assigned to each task, shown at the task's best alpha.
px.scatter(
    data_frame=rescaled_with_range.loc[
        lambda frame: frame["alphas"] == frame["best_alpha"]
    ],
    x="alphas",
    y="new_range",
    color="task_id",
    title="Range by best alpha (in theory)",
    log_x=True,
)

In [None]:
# Curves after both rescaling steps, on log-log axes. The y axis is limited to
# [target mean - 3 * target std, 2].
px.line(
    data_frame=rescaled_with_range.sort_values(by="alphas"),
    x="alphas",
    y="normalized_epsilons",
    color="task_name",
    log_x=True,
    log_y=True,
    range_y=[epsilon_min_avg - 3 * epsilon_min_std, 2],
)

In [None]:
# Epsilon at each task's best alpha after both rescaling steps.
px.scatter(
    data_frame=rescaled_with_range.loc[
        lambda frame: frame["alphas"] == frame["best_alpha"]
    ],
    x="alphas",
    y="normalized_epsilons",
    color="task_id",
    title="Normalized eps by best alpha",
    log_x=True,
)

In [None]:
# Distribution of the best-alpha epsilon inside each best-alpha bin.
(
    rescaled_with_range
    .loc[lambda frame: frame["alphas"] == frame["best_alpha"]]
    .groupby("best_alpha")["normalized_epsilons"]
    .describe()
)

In [None]:
# Distribution of the target range inside each best-alpha bin.
(
    rescaled_with_range
    .loc[lambda frame: frame["alphas"] == frame["best_alpha"]]
    .groupby("best_alpha")["new_range"]
    .describe()
)

In [None]:
# Recompute the per-task statistics on the range-rescaled curves:
#   - min / max of the normalized epsilon restricted to alphas 4, 5, 6 and 8,
#   - min of the normalized epsilon over all alphas,
#   - `epsilon_range`, the max - min spread inside that alpha window.
# Aggregations are named by string ("min" / "max"): passing the Python
# builtins to `agg` is deprecated in pandas.
# NOTE: this overwrites the `ranges` table computed from `rescaled` earlier.
window_stats = (
    rescaled_with_range[rescaled_with_range["alphas"].isin([4, 5, 6, 8])]
    .groupby("task_id")["normalized_epsilons"]
    .agg(epsilon_range_min="min", epsilon_range_max="max")
    .reset_index()
)
overall_min = (
    rescaled_with_range.groupby("task_id")["normalized_epsilons"]
    .agg("min")
    .reset_index()
    .rename(columns={"normalized_epsilons": "epsilon_min"})
)
# Inner join, so tasks without any row in the alpha window are left out.
ranges = window_stats.merge(overall_min, on="task_id")

ranges["epsilon_range"] = ranges["epsilon_range_max"] - ranges["epsilon_range_min"]

ranges

In [None]:
# Replace the per-task columns that became obsolete after the range rescaling
# with the recomputed ones from `ranges`. Every non-key column of `ranges` is
# dropped first (when present), so re-running this cell does not create
# `_x` / `_y` suffixed duplicates.
stale_columns = [column for column in ranges.columns if column != "task_id"]
rescaled_with_range = rescaled_with_range.drop(
    columns=stale_columns, errors="ignore"
).merge(ranges, on="task_id")

In [None]:
# Range actually measured on the rescaled curves, shown at each task's best
# alpha on log-log axes.
px.scatter(
    data_frame=rescaled_with_range.loc[
        lambda frame: frame["alphas"] == frame["best_alpha"]
    ],
    x="alphas",
    y="epsilon_range",
    color="task_id",
    title="Range by best alpha (real)",
    log_x=True,
    log_y=True,
)

In [None]:
# Distribution of the measured range inside each best-alpha bin.
(
    rescaled_with_range
    .loc[lambda frame: frame["alphas"] == frame["best_alpha"]]
    .groupby("best_alpha")["epsilon_range"]
    .describe()
)

In [None]:
# Per best-alpha bin: mean and standard deviation of the measured
# `epsilon_range` after rescaling. Rows are restricted to alphas == 3 before
# grouping.
# NOTE(review): presumably this keeps exactly one row per task - confirm that
# every task has a row at alpha 3.
# The old rename key "alphas" was dropped: no such column exists after
# grouping by "best_alpha".
offset_range = (
    rescaled_with_range.query("alphas == 3")
    .groupby("best_alpha")["epsilon_range"]
    .agg(epsilon_range_avg="mean", epsilon_range_std="std")
    .reset_index()
)
offset_range

In [None]:
# names_and_curves = build_zoo()
# alphas_df, tasks_df = zoo_df(names_and_curves)
# px.line(
#     # alphas_df.sort_values("alphas").query("best_alpha_x == 6 and epsilon_min == 0.01"),
#     # alphas_df.sort_values("alphas").query("best_alpha_x == 6"),
#     alphas_df.sort_values("alphas"),
#     x="alphas",
#     y="normalized_epsilons",
#     color="task_name",
#     log_y=True,
#     log_x=True,
# )

In [None]:
# Print the (alpha, normalized epsilon) pairs of the first task only.
# The task is selected with a boolean mask rather than an f-string spliced
# into `query`, so names containing quotes cannot break the expression, and
# nothing is printed when the frame is empty.
task_names = rescaled_with_range["task_name"].unique()
if len(task_names) > 0:
    first_task_rows = rescaled_with_range[
        rescaled_with_range["task_name"] == task_names[0]
    ]
    for alpha, epsilon in zip(
        first_task_rows["alphas"], first_task_rows["normalized_epsilons"]
    ):
        print(alpha, epsilon)


Okay, let's try our new normalization function!

In [None]:
from privacypacking.utils.zoo import normalize_zoo, build_zoo

In [None]:
# Load the zoo returned by `build_zoo()`; this replaces the hand-built
# `names_and_curves` list from earlier in the notebook.
names_and_curves = build_zoo()

In [None]:
# Per-alpha frame for the zoo before normalization; the second return value is
# discarded.
# NOTE(review): no `delta` is passed here, unlike the earlier
# `zoo_df(..., delta=1e-7)` call - confirm the default is the intended one.
# NOTE(review): assigning to `_` shadows IPython's last-output variable.
original_alphas_df, _ = zoo_df(names_and_curves)

In [None]:
# Curves of the zoo before normalization, on log-log axes.
px.line(
    data_frame=original_alphas_df.sort_values(by="alphas"),
    x="alphas",
    y="normalized_epsilons",
    color="task_name",
    log_x=True,
    log_y=True,
)

In [None]:
# Before normalization: epsilon at each task's best alpha.
px.scatter(
    data_frame=original_alphas_df.loc[
        lambda frame: frame["alphas"] == frame["best_alpha"]
    ],
    x="alphas",
    y="normalized_epsilons",
    color="task_id",
    title="Normalized eps by best alpha",
    log_x=True,
    log_y=True,
)

In [None]:
# Before normalization: `epsilon_max` at each task's best alpha.
px.scatter(
    data_frame=original_alphas_df.loc[
        lambda frame: frame["alphas"] == frame["best_alpha"]
    ],
    x="alphas",
    y="epsilon_max",
    color="task_id",
    title="Dominant share by best alpha",
    log_x=True,
    log_y=True,
)

In [None]:
# Before normalization: `epsilon_range` at each task's best alpha.
px.scatter(
    data_frame=original_alphas_df.loc[
        lambda frame: frame["alphas"] == frame["best_alpha"]
    ],
    x="alphas",
    y="epsilon_range",
    color="task_id",
    title="Range by best alpha",
    log_x=True,
    log_y=True,
)

Now, let's normalize this and see what it looks like!

In [None]:
# Target statistics handed to `normalize_zoo` as keyword arguments.
normalization_targets = {
    "epsilon_min_avg": 0.1,
    "epsilon_min_std": 0.01,
    "range_avg": 0.5,
    "range_std": 0.03,
}
new_names_and_curves = normalize_zoo(names_and_curves, **normalization_targets)

In [None]:
# Per-alpha frame for the normalized zoo; the second return value is discarded.
# NOTE: this overwrites the `alphas_df` built earlier in the notebook, so the
# cells below see the normalized zoo.
alphas_df, _ = zoo_df(new_names_and_curves)

In [None]:
# Curves of the normalized zoo on log-log axes.
px.line(
    data_frame=alphas_df.sort_values(by="alphas"),
    x="alphas",
    y="normalized_epsilons",
    color="task_name",
    log_x=True,
    log_y=True,
)

In [None]:
# Same curves, split into one facet row per best alpha; the legend is hidden.
fig = px.line(
    data_frame=alphas_df.sort_values(by="alphas"),
    x="alphas",
    y="normalized_epsilons",
    color="task_name",
    facet_row="best_alpha",
    height=1200,
    log_x=True,
    log_y=True,
)
fig.update_layout(showlegend=False)
fig

In [None]:
# After normalization: `epsilon_min` at each task's best alpha, colored by
# task type.
px.scatter(
    data_frame=alphas_df.loc[lambda frame: frame["alphas"] == frame["best_alpha"]],
    x="alphas",
    y="epsilon_min",
    color="task_type",
    title="Best eps by best alpha",
    log_x=True,
    log_y=True,
)

In [None]:
# After normalization: `epsilon_max` at each task's best alpha, colored by
# task type.
px.scatter(
    data_frame=alphas_df.loc[lambda frame: frame["alphas"] == frame["best_alpha"]],
    x="alphas",
    y="epsilon_max",
    color="task_type",
    title="Dominant share by best alpha",
    log_x=True,
    log_y=True,
)

In [None]:
# After normalization: `epsilon_range` at each task's best alpha.
px.scatter(
    data_frame=alphas_df.loc[lambda frame: frame["alphas"] == frame["best_alpha"]],
    x="alphas",
    y="epsilon_range",
    color="task_id",
    title="Range by best alpha",
    log_x=True,
    log_y=True,
)

In [None]:
def map_range_to_bin(alpha):
    """Return the position of `alpha` inside `ALPHAS` (the result of `ALPHAS.index`)."""
    bin_index = ALPHAS.index(alpha)
    return bin_index

In [None]:
# Keep only the rows at alpha 5.
# NOTE(review): presumably one row per task - confirm every task has alpha 5.
tasks_df = alphas_df[alphas_df["alphas"] == 5]

In [None]:
# Tag every row with the index of its best alpha in ALPHAS, then count the
# non-null `epsilon_range` entries per bin.
df = tasks_df.assign(bin_id=tasks_df["best_alpha"].apply(map_range_to_bin))

count_by_bin = df.groupby("bin_id").count()["epsilon_range"].tolist()

In [None]:
# Non-null `alphas` entries per bin as a plain {bin_id: count} dict.
# `to_dict()` yields built-in ints; `dict(series)` would keep numpy scalars as
# the values.
df.groupby("bin_id")["alphas"].count().to_dict()