In [None]:
%load_ext autoreload 
%autoreload 2

In [None]:
import json
from privacypacking.utils.utils import load_logs, global_metrics
import pandas as pd
from experiments.ray.analysis import load_tasks, load_ray_experiment, load_latest_ray_experiment, load_latest_scheduling_results, load_latest_scheduling_results, load_latest_ray_experiment, load_scheduling_queue
import plotly.express as px
from privacypacking.budget.curves import  LaplaceCurve, GaussianCurve, SubsampledGaussianCurve
from privacypacking.budget import Budget, Task, Block
from privacypacking.schedulers.metrics import OverflowRelevance, FlatRelevance
from privacypacking.budget.block_selection import RandomBlocks
from privacypacking.utils.plot import plot_budgets
import yaml
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

In [None]:
from omegaconf import OmegaConf
from pathlib import Path
from collections import defaultdict

In [None]:
# Reference block budget built from (epsilon=10, delta=1e-8). The curve helpers in this
# notebook divide each task's RDP epsilon by this block's epsilon at the same alpha.
block = Budget.from_epsilon_delta(epsilon=10, delta=1e-8)

In [None]:
# Display the reference block budget.
block

In [None]:
def load_task_dir(path: str) -> pd.DataFrame:
    """Gather the RDP curves stored as YAML files in `path` into one long-format frame.

    Each `*.yaml` file directly under `path` is loaded with OmegaConf and read
    through its `alphas` and `rdp_epsilons` entries. A row is emitted for every
    (file, alpha) pair at which the module-level `block` has a strictly
    positive epsilon.

    Args:
        path: Directory holding the task curve files.

    Returns:
        DataFrame with columns `alphas`, `rdp_epsilons`, `normalized_epsilons`
        (task epsilon divided by the block epsilon at the same alpha) and
        `task` (the file name).
    """
    columns = defaultdict(list)
    for curve_file in Path(path).glob("*.yaml"):
        curve = OmegaConf.load(curve_file)
        for alpha, epsilon in zip(curve["alphas"], curve["rdp_epsilons"]):
            # Block epsilon at this alpha, looked up once per row.
            capacity = block.epsilon(alpha)
            if not capacity > 0:
                continue
            columns["alphas"].append(alpha)
            columns["rdp_epsilons"].append(epsilon)
            columns["normalized_epsilons"].append(epsilon / capacity)
            columns["task"].append(curve_file.name)
    return pd.DataFrame(columns)

In [None]:
def sigma_range(min=0.01, max=100) -> pd.DataFrame:
    """Tabulate normalized RDP curves for 30 log-spaced noise levels.

    Despite the name, the active code builds a `LaplaceCurve` for each noise
    level in `np.geomspace(min, max, 30)`. Rows are kept only for alphas at
    which the module-level `block` has a strictly positive epsilon.

    Args:
        min: Smallest noise level.
        max: Largest noise level.

    Returns:
        DataFrame with columns `alphas`, `rdp_epsilons`, `normalized_epsilons`
        and `task` (the noise level that produced the row).
    """
    columns = defaultdict(list)
    for noise in np.geomspace(min, max, 30):
        curve = LaplaceCurve(laplace_noise=noise)
        for alpha, epsilon in zip(curve.alphas, curve.epsilons):
            # Block epsilon at this alpha, looked up once per row.
            capacity = block.epsilon(alpha)
            if not capacity > 0:
                continue
            columns["alphas"].append(alpha)
            columns["rdp_epsilons"].append(epsilon)
            columns["normalized_epsilons"].append(epsilon / capacity)
            columns["task"].append(noise)
    return pd.DataFrame(columns)

In [None]:
from autodp.mechanism_zoo import ExponentialMechanism, RandresponseMechanism, GaussianSVT_Mechanism
from privacypacking.budget.utils import ALPHAS

In [None]:
# Scratch instance of autodp's ExponentialMechanism; `m` is not referenced by the other cells shown here.
m = ExponentialMechanism(eps=1.0)

In [None]:
def autodp_range(min=0.1, max=100, mechanism=GaussianSVT_Mechanism) -> pd.DataFrame:
    """Tabulate normalized RDP curves of an autodp mechanism over a parameter grid.

    The grid is 10 log-spaced values in [min, max] crossed with
    k in {1, 11, ..., 91}. Each grid value is passed to the mechanism as its
    `sigma` parameter (with `c` fixed to 100 and `rdp_c_1=False`). The RDP
    epsilon is queried at every alpha in `ALPHAS`, and rows are kept only where
    the module-level `block` has a strictly positive epsilon.

    Args:
        min: Smallest value of the swept parameter.
        max: Largest value of the swept parameter.
        mechanism: Callable accepting `params=` and `rdp_c_1=` and returning an
            object with a `get_RDP(alpha)` method.

    Returns:
        DataFrame with columns `alphas`, `rdp_epsilons`, `normalized_epsilons`
        and `task` (the `(eps, k)` pair that produced the row).
    """
    columns = defaultdict(list)
    for eps in np.geomspace(min, max, 10):
        for k in np.arange(1, 100, step=10):
            mechanism_params = {"sigma": eps, "k": k, "c": 100}
            curve = mechanism(params=mechanism_params, rdp_c_1=False)
            for alpha in ALPHAS:
                epsilon = curve.get_RDP(alpha)
                # Block epsilon at this alpha, looked up once per row.
                capacity = block.epsilon(alpha)
                if not capacity > 0:
                    continue
                columns["alphas"].append(alpha)
                columns["rdp_epsilons"].append(epsilon)
                columns["normalized_epsilons"].append(epsilon / capacity)
                columns["task"].append((eps, k))
    return pd.DataFrame(columns)

In [None]:
def subsampled_range(min=0.01, max=100) -> pd.DataFrame:
    """Tabulate normalized RDP curves of single-step subsampled Gaussians.

    Sweeps 10 log-spaced sigmas in [min, max] against 10 log-spaced sampling
    probabilities in [1e-8, 1], always with `steps=1`. Rows are kept only for
    alphas at which the module-level `block` has a strictly positive epsilon.

    Args:
        min: Smallest sigma.
        max: Largest sigma.

    Returns:
        DataFrame with columns `alphas`, `rdp_epsilons`, `normalized_epsilons`
        and `task` (the `(sigma, sampling)` pair that produced the row).
    """
    columns = defaultdict(list)
    for sigma in np.geomspace(min, max, 10):
        for sampling in np.geomspace(1e-8, 1, 10):
            curve = SubsampledGaussianCurve(
                sigma=sigma,
                sampling_probability=sampling,
                steps=1,
            )
            for alpha, epsilon in zip(curve.alphas, curve.epsilons):
                # Block epsilon at this alpha, looked up once per row.
                capacity = block.epsilon(alpha)
                if not capacity > 0:
                    continue
                columns["alphas"].append(alpha)
                columns["rdp_epsilons"].append(epsilon)
                columns["normalized_epsilons"].append(epsilon / capacity)
                columns["task"].append((sigma, sampling))
    return pd.DataFrame(columns)

In [None]:
# Choose which family of curves the following cells analyze: exactly one assignment
# is active, the others are kept as commented-out alternatives.
# df = load_task_dir("/home/pierre/privacypacking/data/mixed_curves/tasks")
# df = load_task_dir("/home/pierre/privacypacking/data/privatekube_event_g0.0_l0.5_p=grid/tasks")
# df = sigma_range()
# df = subsampled_range()
df = autodp_range()

In [None]:
# For every task, locate the row where its normalized epsilon is smallest
# (its "best alpha") and keep only those rows.
indx = (
    df.groupby("task")["normalized_epsilons"]
    .idxmin()
)
best_alpha = df.loc[indx]

In [None]:
# One line per task: normalized epsilon as a function of alpha, on log-log axes.
px.line(
    data_frame=df,
    x="alphas", y="normalized_epsilons",
    color="task",
    log_x=True, log_y=True,
)

In [None]:
# One point per task, placed at the alpha that minimizes its normalized epsilon.
px.scatter(
    data_frame=best_alpha,
    x="alphas", y="normalized_epsilons",
    color="task",
    log_x=True, log_y=True,
    title="Epsilon for the best alpha of each task",
)

In [None]:
# Zoo of candidate RDP curves drawn from three mechanism families.
curve_zoo = []

# Gaussian curves over 100 log-spaced noise scales.
# The previous loop built each GaussianCurve but never appended it, so this
# family was silently absent from the zoo.
for sigma in np.geomspace(0.01, 10, 100):
    gaussian = GaussianCurve(sigma=sigma)
    curve_zoo.append(gaussian)

# Laplace curves over the same noise scales.
for sigma in np.geomspace(0.01, 10, 100):
    curve_zoo.append(LaplaceCurve(laplace_noise=sigma))

# Subsampled Gaussian curves: 10 sigmas x 10 sampling probabilities x steps in {1, 51}.
for sigma in np.geomspace(0.01, 10, 10):
    for sampling in np.geomspace(1e-5, 1, 10):
        for steps in np.arange(1, 100, step=50):
            curve_zoo.append(
                SubsampledGaussianCurve(sigma=sigma, sampling_probability=sampling, steps=steps)
            )


In [None]:
def zoo_df(zoo: list, clipped=True) -> pd.DataFrame:
    """Flatten a list of curves into a long-format frame of normalized epsilons.

    Each curve must expose `alphas` and `epsilons`. Rows are kept only for
    alphas at which the module-level `block` has a strictly positive epsilon.

    Args:
        zoo: Curves to tabulate; a curve's position in the list becomes its
            `task` id.
        clipped: When truthy, cap every normalized epsilon at 1.

    Returns:
        DataFrame with columns `alphas`, `rdp_epsilons`, `normalized_epsilons`
        and `task`.
    """
    columns = defaultdict(list)
    for task_id, curve in enumerate(zoo):
        for alpha, epsilon in zip(curve.alphas, curve.epsilons):
            # Block epsilon at this alpha, looked up once per row.
            capacity = block.epsilon(alpha)
            if not capacity > 0:
                continue
            normalized = epsilon / capacity
            if clipped:
                normalized = min(normalized, 1)
            columns["alphas"].append(alpha)
            columns["rdp_epsilons"].append(epsilon)
            columns["normalized_epsilons"].append(normalized)
            columns["task"].append(task_id)
    return pd.DataFrame(columns)

In [None]:
df = zoo_df(curve_zoo)

# Per-task summary of the normalized curve: smallest and largest value and their gap.
# Both statistics come from one named aggregation, so each is keyed by `task`.
# Previously `epsilon_max` was attached by index alignment between a reset-indexed
# frame and a task-indexed Series, which only works while task ids equal row positions.
tasks = (
    df.groupby("task")["normalized_epsilons"]
    .agg(epsilon_min="min", epsilon_max="max")
    .reset_index()
)
tasks["epsilon_range"] = tasks["epsilon_max"] - tasks["epsilon_min"]

# Attach the summary to every (task, alpha) row, then drop tasks whose best normalized
# epsilon reaches 1 or is at most 1e-6.
df = df.merge(tasks, on="task")
df = df.query("epsilon_min < 1 and epsilon_min > 1e-6")

# TODO: should we also filter out the tiny tiny tasks? Some subsampled Gaussians are ridiculously small.

In [None]:
# Largest per-task epsilon_max among the rows that survived the filter.
df.epsilon_max.max()

In [None]:
# Normalized curves of the filtered zoo, one line per task, on log-log axes.
px.line(
    data_frame=df,
    x="alphas", y="normalized_epsilons",
    color="task",
    log_x=True, log_y=True,
)

In [None]:
# Number of (task, alpha) rows left after filtering.
len(df)

In [None]:
# Recompute each task's best alpha on the filtered zoo and plot one point per task.
indx = (
    df.groupby("task")["normalized_epsilons"]
    .idxmin()
)
best_alpha = df.loc[indx]
px.scatter(
    data_frame=best_alpha,
    x="alphas", y="normalized_epsilons",
    color="task",
    log_x=True, log_y=True,
    title="Epsilon for the best alpha of each task",
)

In [None]:
# Histogram of epsilon_min. `df` has one row per (task, alpha), so each task is
# counted once per alpha row.
px.histogram(data_frame=df, x="epsilon_min")

In [None]:
# Histogram of epsilon_max over the (task, alpha) rows, in 20 bins.
px.histogram(data_frame=df, x="epsilon_max", nbins=20)

In [None]:
# Histogram of epsilon_range (epsilon_max - epsilon_min) over the (task, alpha) rows.
px.histogram(data_frame=df, x="epsilon_range")

In [None]:
# Geometric distribution
# Weights over the epsilon_range bins that decay geometrically with the bin index.
n_bins = 20
p = 0.01
# One weight per bin. Sizing by n_bins rather than a literal 20 keeps len(f) equal
# to the number of bins expected by np.random.choice(n_bins, ..., p=f).
f = np.array([(1 - p) ** (k - 1) * p for k in range(n_bins)])
# Renormalize, because only the first n_bins terms are kept.
f = f / sum(f)
# NOTE: the mean and variance in the title are those of the untruncated geometric
# distribution, not of the truncated and renormalized weights plotted here.
px.line(f, title=f"Geometric distribution with p={p} and variance {(1-p)/p**2:.2f}, mean {1/p:.2f}")

In [None]:
n_samples = 500
# Dedicated seeded generator so that Restart & Run All reproduces the same workload.
rng = np.random.default_rng(0)


def _tasks_in_bin(bin_id):
    """Return the unique task ids whose epsilon_range lies in (a, b] for bin `bin_id`."""
    a = bin_id * (1 / n_bins)
    b = a + (1 / n_bins)
    in_bin = (df["epsilon_range"] > a) & (df["epsilon_range"] <= b)
    return df.loc[in_bin, "task"].unique()


# Scan the frame once per bin instead of once per drawn sample.
tasks_by_bin = [_tasks_in_bin(bin_id) for bin_id in range(n_bins)]


def sample_from_bin(bin_id):
    """Draw one task uniformly at random among the tasks of the given bin.

    Args:
        bin_id: Index of the epsilon_range bin, in [0, n_bins).

    Returns:
        The id of the sampled task.

    Raises:
        ValueError: If no task falls in that bin.
    """
    bin_tasks = tasks_by_bin[bin_id]
    if len(bin_tasks) == 0:
        raise ValueError(f"No task has an epsilon_range in bin {bin_id}")
    return rng.choice(bin_tasks)


# A bin without any task cannot be sampled from (choosing from an empty array raises),
# so give empty bins zero weight and renormalize the target distribution `f`.
bin_weights = np.array(
    [weight if len(bin_tasks) > 0 else 0.0 for weight, bin_tasks in zip(f, tasks_by_bin)]
)
if bin_weights.sum() == 0:
    raise ValueError("No bin contains any task; cannot sample a workload")
bin_ids = rng.choice(n_bins, n_samples, p=bin_weights / bin_weights.sum())

workload = [sample_from_bin(i) for i in bin_ids]

In [None]:
# Normalized curves of the tasks that were sampled into the workload.
# Filter with `isin` rather than interpolating the list into a query string: the repr
# of NumPy integers (e.g. `np.int64(3)` under NumPy 2) is not a valid query expression.
px.line(
    df[df["task"].isin(workload)],
    x="alphas",
    y="normalized_epsilons",
    color="task",
    log_y=True,
    log_x=True,
)

In [None]:
import scipy

In [None]:
# Negative binomial distribution
# Probably pointless since the epsilon_min are not uniform anyway
n_bins = 20
mu = 2

p = 0.01
# Chosen so that the untruncated distribution has mean p * r / (1 - p) = mu.
r = mu * (1 - p) / p

# One weight per bin. Sizing by n_bins rather than a literal 20 keeps `f` the same
# length as the bin indices used for the moments below.
bin_indices = np.arange(n_bins)
f = np.array(
    [scipy.special.binom(k + r - 1, k) * (1 - p) ** r * p**k for k in range(n_bins)]
)
# Renormalize, because only the first n_bins terms are kept.
f = f / sum(f)

# Moments of the truncated, renormalized weights.
empirical_mean = np.average(bin_indices, weights=f)
empirical_var = np.average(bin_indices**2, weights=f) - empirical_mean**2
px.line(f, title=f"NBN distribution with p={p}. Empirical var {empirical_var:.2f}, mean {empirical_mean:.2f}")

In [None]:
# Just a boring Gaussian?
# Weights over the bins proportional to a normal density centred on bin `mu`.
n_bins = 20
mu = 10
sigma = 1

# One weight per bin. Sizing by n_bins rather than a literal 20 keeps `f` the same
# length as the bin indices used for the moments below.
bin_indices = np.arange(n_bins)
f = np.array([scipy.stats.norm.pdf(k, mu, sigma) for k in range(n_bins)])
# Renormalize, because the density is only evaluated at the n_bins integer points.
f = f / sum(f)

# Moments of the discretized, renormalized weights.
empirical_mean = np.average(bin_indices, weights=f)
empirical_var = np.average(bin_indices**2, weights=f) - empirical_mean**2
# The title previously read "NBN distribution with p={p}", copied from the negative
# binomial cell, and depended on `p` being left over from another cell.
px.line(f, title=f"Gaussian distribution with mu={mu}, sigma={sigma}. Empirical var {empirical_var:.2f}, mean {empirical_mean:.2f}")

In [None]:
# Distribution of epsilon_range among rows whose task has epsilon_min > 0.3.
px.histogram(
    data_frame=df.query("epsilon_min > 0.3"),
    x="epsilon_range",
    nbins=20, range_x=[0, 1],
)

In [None]:
# Distribution of epsilon_min among rows whose task has epsilon_range in (0.1, 0.2).
px.histogram(
    data_frame=df.query("epsilon_range > 0.1 and epsilon_range < 0.2"),
    x="epsilon_min",
    nbins=20, range_x=[0, 1],
)

In [None]:
# Distribution of epsilon_min among rows whose task has epsilon_range below 0.3.
px.histogram(
    data_frame=df.query("epsilon_range < 0.3"),
    x="epsilon_min",
    nbins=20, range_x=[0, 1],
)

Should we condition on `epsilon_min` (or `epsilon_max`)? Otherwise, increasing the variance of the range will also increase the average size of the tasks. That is probably inevitable. Let's start with a rough first draft; we don't need the ultimate, perfect metric yet.

In [None]:
def map_range_to_bin(r):
    """Map a range `r` in [0, 1] to the index of the bin of width 1/n_bins containing it.

    `r == 1` is assigned to the last bin (`n_bins - 1`); a plain `int(r * n_bins)`
    would return `n_bins`, which is not a valid bin index.
    """
    return min(int(r * n_bins), n_bins - 1)

tasks["bin_id"] = tasks["epsilon_range"].apply(map_range_to_bin)


In [None]:
# Sorted ids of the bins that contain at least one task.
tasks.groupby("bin_id").size().index.tolist()

In [None]:
from experiments.ray.analysis import load_latest_ray_experiment
import plotly.express as px

In [None]:
# Results of the latest Ray experiment, as returned by the project helper.
# NOTE(review): the cells below read the columns `tasks_path`, `n_allocated_tasks`
# and `scheduler_metric` from it - confirm against the helper's output.
rdf = load_latest_ray_experiment()

In [None]:
def get_block_std(path):
    if "sigma" not in path:
        return 0
    return float(path.split("sigma")[1])
# Derive each run's block standard deviation from its tasks path (0 when the path has no "sigma" marker).
rdf["block_std"] = rdf["tasks_path"].apply(get_block_std)

In [None]:
# Allocated tasks as a function of the block standard deviation, one line per
# scheduler metric. Rows are sorted by block_std so each line is drawn left to right.
fig = px.line(
    data_frame=rdf.sort_values(by="block_std"),
    x="block_std", y="n_allocated_tasks",
    color="scheduler_metric",
    width=800, height=600,
    log_x=True,
    range_y=[0, 4000],
    title="Heterogenous blocks curves offline",
)
fig