In [134]:
import json
from privacypacking.utils.utils import load_logs, global_metrics, LOGS_PATH
import pandas as pd
from experiments.ray.analysis import load_tasks, load_ray_experiment, load_latest_ray_experiment, load_latest_scheduling_results, load_latest_scheduling_results, load_latest_ray_experiment, load_scheduling_queue
import plotly.express as px
from privacypacking.budget.curves import  LaplaceCurve, GaussianCurve, SubsampledGaussianCurve
from privacypacking.budget import Budget, Task, Block
from privacypacking.schedulers.metrics import OverflowRelevance, FlatRelevance
from privacypacking.budget.block_selection import RandomBlocks
from privacypacking.utils.plot import plot_budgets
from plotly.offline import download_plotlyjs, plot,iplot
import yaml
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np


def get_df(path):
    logs = LOGS_PATH.joinpath("ray/run_and_report/" + path)
    df = load_ray_experiment(logs)
    return df

def plot_profit(df):
    return px.scatter(
        df,
        x="k",
        y="realized_profit",
        title="Total profit",
        range_y=[0, df['realized_profit'].max()+1000]
    )

def plot_tasks(df):
    return px.scatter(
        df,
        x="k",
        y="n_allocated_tasks",
        title="Num allocated tasks",
        range_y=[0, df['n_allocated_tasks'].max()+1000]
    )


def plot_realized_budget(df):
    return px.scatter(
        df,
        x="k",
        y="realized_budget",
        title="Total budget",
        range_y=[0, df['realized_budget'].max()+1000]
    )

def plot_bu(df, k):
    print(f"For k = {k}: \n  Total Profit: {df['realized_profit'].values}")
    print(f"             \n  Num-allocated: {df['n_allocated_tasks'].values}")
    print(f"    - cached subs: {df['cached_subs'].values}")
    print(f"    - subs: {df['subs'].values}")
    print(f"    - original: {df['original'].values}")
    print(f"    - cached original: {df['cached_original'].values}")
    blocks = []
#     for i in range(380):
    for b in df['blocks'].values[0]: 
#         b = df['blocks'].values[0][i]
        blocks.append(pd.DataFrame([{'id': b['id'], 
                                    'initial_budget': b['initial_budget']['orders']['0.0'],
                                    'budget': b['budget']['orders']['0.0']
                                   }]))
    blocks_df = pd.concat(blocks)
    blocks_df['budget'] = 10 - blocks_df['budget']
    blocks_df
    p = px.bar(
        blocks_df,
        x="id",
        y="budget",
        range_y=[0,10],
        title=f"Budget Utilization for k={k}",
    )
    return p

    
def plot_bu_k(df):
    dfs = []
    for (blocks,k) in df[['blocks', 'k']].values:
        blockslen = len(blocks)
#         for i in range(380):
        for block in blocks:
#             block = blocks[i]
            dfs.append(pd.DataFrame([{"budget": block['budget']['orders']['0.0'], "k": k,}]))
        
        
    df = pd.concat(dfs)
    df['budget'] = 10 - df['budget']
    df = df.groupby('k')['budget'].mean().reset_index()
    print(df)
    p = px.line(
        df,
        x="k",
        y="budget",
        range_y=[0, 10],
        title=f"Budget Utilization",
    )
    return p

def get_errors_k(df):
    dfs = []
    for (tasks,k) in df[['tasks', 'k']].values:
        for task in tasks:
            if task['substitute_result'] is not None:
                substitutes_result = task['original_result'].replace("\n", "").replace(" ", "").split('result0')[1]
                originals_result = task['substitute_result'].replace("\n", "").replace(" ", "").split('result0')[1]

                dfs.append(pd.DataFrame([{"id": task['id'], 
                                          "original_result": originals_result, 
                                          "substitute_result": substitutes_result,
                                          "k": k,
                                        }]))
    if dfs:
        df = pd.concat(dfs)
        df['original_result'] = df['original_result'].astype(float)
        df['substitute_result'] = df['substitute_result'].astype(float)
        df = df.sort_values('original_result')
        df.insert(0, 'newId', range(1, 1+len(df)))
        return df
    return None

def plot_results(df, k):
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(
        go.Scatter(x=df['newId'], y=df['substitute_result'], mode='markers', name="Substitute Results"),
        secondary_y=False,
    )
    fig.add_trace(
        go.Scatter(x=df['newId'], y=df['original_result'], mode='markers', name="Original Results"),
        secondary_y=True,
    )
    fig.update_layout(
        title_text=f"Original vs Substitute Results for k={k}"
    )
    fig.update_xaxes(title_text="task id")
    return fig

def plot_errors_cdf(df):
    df = get_errors_k(df)
    df = df.sort_values('k')
    denom = df[["original_result", "substitute_result"]].max(axis=1)
    df['denom'] = denom
    df['error'] = np.abs(df['original_result']-df['substitute_result']) / denom 
    p = px.ecdf(
        df,
        x="error",
        log_x=False,
        color='k',
        title=f"CDF of errors",
    )
    return p

def plot_errors(df, k):
    denom = df[["original_result", "substitute_result"]].max(axis=1)
    df['denom'] = denom
    df['error'] = np.abs(df['original_result']-df['substitute_result']) / denom 
    p = px.ecdf(
        df,
        x="error",
        log_x=False,
        range_x=[0,k],
        title=f"Substitution Errors for k={k}",
    )
    return p

def plot_results_errors(df, k):
    df = get_errors_k(df)
    if df is not None:
        return plot_results(df, k), plot_errors(df, k)
    return None, None


In [130]:
tasks = pd.read_csv("/home/kelly/privacypacking/data/covid19/privacy_tasks.csv")
tasks.reset_index()
tasks['id'] = tasks.index
# print(tasks)
px.histogram(
    tasks,
    x="submit_time",
    nbins=400,
    title="arrival time",
)

In [123]:
px.histogram(
    tasks,
    x="n_blocks",
    nbins=50,
    title="Requested numbers of Blocks",
)

In [111]:
tasks

Unnamed: 0,query_id,query_type,epsilon,delta,n_blocks,profit,block_selection_policy,task_name,alphas,rdp_epsilons,submit_time,relative_submit_time,id
0,13,average,0.5,0.00001,7,7,LatestBlocksFirst,task-13-7-0,[0.0],[0.5],0,0.0,0
1,17,average,0.5,0.00001,1,1,LatestBlocksFirst,task-17-1-0,[0.0],[0.5],0,0.0,1
2,23,average,0.5,0.00001,7,7,LatestBlocksFirst,task-23-7-0,[0.0],[0.5],0,0.0,2
3,3,average,0.5,0.00001,7,7,LatestBlocksFirst,task-3-7-0,[0.0],[0.5],0,0.0,3
4,29,average,0.5,0.00001,1,1,LatestBlocksFirst,task-29-1-0,[0.0],[0.5],0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10899,28,average,0.5,0.00001,7,7,LatestBlocksFirst,task-28-7-399,[0.0],[0.5],399,0.0,10899
10900,19,average,0.5,0.00001,7,7,LatestBlocksFirst,task-19-7-399,[0.0],[0.5],399,0.0,10900
10901,25,average,0.5,0.00001,1,1,LatestBlocksFirst,task-25-1-399,[0.0],[0.5],399,0.0,10901
10902,15,average,0.5,0.00001,7,7,LatestBlocksFirst,task-15-7-399,[0.0],[0.5],399,0.0,10902


In [114]:
px.histogram(
    tasks,
    x="epsilon",
    nbins=10,
    title="epsilon",
)

In [66]:
path = "30queries"
df = get_df(path)
# iplot(plot_profit(df))
iplot(plot_realized_budget(df))
iplot(plot_tasks(df))
iplot(plot_bu_k(df))
iplot(plot_errors_cdf(df))
for k in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]:
    dff = df.query(f'k=={k}')
    iplot(plot_bu(dff, k))
    results, errors = plot_results_errors(dff, k)
    iplot(results)
#     iplot(errors)

      k   budget
0  0.00  7.26750
1  0.05  9.13750
2  0.10  9.42125
3  0.15  9.52000
4  0.20  9.60750
5  0.25  9.66875
6  0.30  9.73375


For k = 0: 
  Total Profit: [7842.]
             
  Num-allocated: [5592]
    - cached subs: [0]
    - subs: [1]
    - original: [4236]
    - cached original: [1355]


For k = 0.05: 
  Total Profit: [9771.]
             
  Num-allocated: [5835]
    - cached subs: [63]
    - subs: [428]
    - original: [4056]
    - cached original: [1288]


For k = 0.1: 
  Total Profit: [10737.]
             
  Num-allocated: [5960]
    - cached subs: [108]
    - subs: [540]
    - original: [4027]
    - cached original: [1285]


For k = 0.15: 
  Total Profit: [10886.]
             
  Num-allocated: [5979]
    - cached subs: [101]
    - subs: [600]
    - original: [4006]
    - cached original: [1272]


For k = 0.2: 
  Total Profit: [11204.]
             
  Num-allocated: [6027]
    - cached subs: [110]
    - subs: [636]
    - original: [4009]
    - cached original: [1272]


For k = 0.25: 
  Total Profit: [11297.]
             
  Num-allocated: [6041]
    - cached subs: [99]
    - subs: [673]
    - original: [3995]
    - cached original: [1274]


For k = 0.3: 
  Total Profit: [11462.]
             
  Num-allocated: [6072]
    - cached subs: [108]
    - subs: [692]
    - original: [4004]
    - cached original: [1268]


In [77]:
path = "1-30queries"
df = get_df(path)
# iplot(plot_profit(df))
iplot(plot_realized_budget(df))
iplot(plot_tasks(df))
iplot(plot_bu_k(df))
iplot(plot_errors_cdf(df))
for k in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]:
    dff = df.query(f'k=={k}')
    iplot(plot_bu(dff, k))
    results, errors = plot_results_errors(dff, k)
    iplot(results)
#     iplot(errors)

      k   budget
0  0.00  7.03250
1  0.05  8.95500
2  0.10  9.33500
3  0.15  9.55375
4  0.20  9.55250
5  0.25  9.62875
6  0.30  9.66250


For k = 0: 
  Total Profit: [7425.]
             
  Num-allocated: [4689]
    - cached subs: [0]
    - subs: [1]
    - original: [3568]
    - cached original: [1120]


For k = 0.05: 
  Total Profit: [9565.]
             
  Num-allocated: [4960]
    - cached subs: [69]
    - subs: [414]
    - original: [3406]
    - cached original: [1071]


For k = 0.1: 
  Total Profit: [10163.]
             
  Num-allocated: [5042]
    - cached subs: [74]
    - subs: [518]
    - original: [3383]
    - cached original: [1067]


For k = 0.15: 
  Total Profit: [10336.]
             
  Num-allocated: [5064]
    - cached subs: [59]
    - subs: [587]
    - original: [3364]
    - cached original: [1054]


For k = 0.2: 
  Total Profit: [10576.]
             
  Num-allocated: [5103]
    - cached subs: [80]
    - subs: [612]
    - original: [3358]
    - cached original: [1053]


For k = 0.25: 
  Total Profit: [10741.]
             
  Num-allocated: [5124]
    - cached subs: [81]
    - subs: [631]
    - original: [3357]
    - cached original: [1055]


For k = 0.3: 
  Total Profit: [10812.]
             
  Num-allocated: [5132]
    - cached subs: [83]
    - subs: [646]
    - original: [3352]
    - cached original: [1051]


In [82]:
path = "50queries"
df = get_df(path)
# iplot(plot_profit(df))
iplot(plot_realized_budget(df))
iplot(plot_tasks(df))
iplot(plot_bu_k(df))
iplot(plot_errors_cdf(df))
for k in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]:
    dff = df.query(f'k=={k}')
    iplot(plot_bu(dff, k))
    results, errors = plot_results_errors(dff, k)
    iplot(results)
#     iplot(errors)

      k   budget
0  0.00  7.04250
1  0.05  8.06250
2  0.10  8.43375
3  0.15  8.67500
4  0.20  8.77500
5  0.25  8.86000
6  0.30  8.89750


For k = 0: 
  Total Profit: [6690.]
             
  Num-allocated: [4764]
    - cached subs: [0]
    - subs: [0]
    - original: [3996]
    - cached original: [768]


ValueError: No objects to concatenate

In [90]:
path = "40queries"
df = get_df(path)
# iplot(plot_profit(df))
iplot(plot_realized_budget(df))
iplot(plot_tasks(df))
iplot(plot_bu_k(df))
iplot(plot_errors_cdf(df))
for k in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]:
    dff = df.query(f'k=={k}')
    iplot(plot_bu(dff, k))
    results, errors = plot_results_errors(dff, k)
    iplot(results)
#     iplot(errors)

      k   budget
0  0.00  6.82125
1  0.05  8.34875
2  0.10  8.69000
3  0.15  8.90875
4  0.20  9.06125
5  0.25  9.13750
6  0.30  9.23375


For k = 0: 
  Total Profit: [6762.]
             
  Num-allocated: [4740]
    - cached subs: [0]
    - subs: [1]
    - original: [3843]
    - cached original: [896]


For k = 0.05: 
  Total Profit: [8323.]
             
  Num-allocated: [4921]
    - cached subs: [31]
    - subs: [304]
    - original: [3709]
    - cached original: [877]


For k = 0.1: 
  Total Profit: [8826.]
             
  Num-allocated: [4997]
    - cached subs: [47]
    - subs: [405]
    - original: [3673]
    - cached original: [872]


For k = 0.15: 
  Total Profit: [9148.]
             
  Num-allocated: [5024]
    - cached subs: [51]
    - subs: [441]
    - original: [3668]
    - cached original: [864]


For k = 0.2: 
  Total Profit: [9330.]
             
  Num-allocated: [5041]
    - cached subs: [51]
    - subs: [477]
    - original: [3648]
    - cached original: [865]


For k = 0.25: 
  Total Profit: [9459.]
             
  Num-allocated: [5063]
    - cached subs: [54]
    - subs: [495]
    - original: [3650]
    - cached original: [864]


For k = 0.3: 
  Total Profit: [9620.]
             
  Num-allocated: [5078]
    - cached subs: [60]
    - subs: [506]
    - original: [3645]
    - cached original: [867]


In [136]:
path = "2-30queries"
df = get_df(path)
# iplot(plot_profit(df))
iplot(plot_realized_budget(df))
iplot(plot_tasks(df))
iplot(plot_bu_k(df))
iplot(plot_errors_cdf(df))
for k in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]:
    dff = df.query(f'k=={k}')
    iplot(plot_bu(dff, k))
    results, errors = plot_results_errors(dff, k)
    if results is not None:
        iplot(results)
#     iplot(errors)

      k   budget
0  0.00  7.42750
1  0.05  8.97625
2  0.10  9.32750
3  0.15  9.51500
4  0.20  9.55750
5  0.25  9.56375
6  0.30  9.60625


For k = 0: 
  Total Profit: [7809.]
             
  Num-allocated: [4461]
    - cached subs: [0]
    - subs: [0]
    - original: [3374]
    - cached original: [1087]


For k = 0.05: 
  Total Profit: [9706.]
             
  Num-allocated: [4715]
    - cached subs: [72]
    - subs: [410]
    - original: [3194]
    - cached original: [1039]


For k = 0.1: 
  Total Profit: [10471.]
             
  Num-allocated: [4805]
    - cached subs: [93]
    - subs: [523]
    - original: [3152]
    - cached original: [1037]


For k = 0.15: 
  Total Profit: [10672.]
             
  Num-allocated: [4830]
    - cached subs: [83]
    - subs: [584]
    - original: [3140]
    - cached original: [1023]


For k = 0.2: 
  Total Profit: [10838.]
             
  Num-allocated: [4847]
    - cached subs: [87]
    - subs: [611]
    - original: [3133]
    - cached original: [1016]


For k = 0.25: 
  Total Profit: [11047.]
             
  Num-allocated: [4874]
    - cached subs: [99]
    - subs: [632]
    - original: [3127]
    - cached original: [1016]


For k = 0.3: 
  Total Profit: [11065.]
             
  Num-allocated: [4881]
    - cached subs: [98]
    - subs: [642]
    - original: [3131]
    - cached original: [1010]


In [9]:
path = "baseline18"
df = get_df(path)
plot_bu_k(df) 
iplot(plot_bu(df, 0))

   k  budget
0  0  6.6075
For k = 0: 
  Total Profit: [5286.]
             
  Num-allocated: [5076]
    - cached subs: [0]
    - subs: [0]
    - original: [5076]
    - cached original: [0]


In [12]:
path = "baseline21"
df = get_df(path)
plot_bu_k(df) 
iplot(plot_bu(df, 0))

   k  budget
0  0   6.575
For k = 0: 
  Total Profit: [5260.]
             
  Num-allocated: [4774]
    - cached subs: [0]
    - subs: [0]
    - original: [4774]
    - cached original: [0]


In [63]:
path = "baseline30queries"
df = get_df(path)
plot_bu_k(df) 
iplot(plot_bu(df, 0))

   k   budget
0  0  7.26125
For k = 0: 
  Total Profit: [7836.]
             
  Num-allocated: [5592]
    - cached subs: [0]
    - subs: [0]
    - original: [4237]
    - cached original: [1355]


In [76]:
path = "baseline30"
df = get_df(path)
plot_bu_k(df) 
iplot(plot_bu(df, 0))

   k   budget
0  0  7.03375
For k = 0: 
  Total Profit: [7425.]
             
  Num-allocated: [4689]
    - cached subs: [0]
    - subs: [0]
    - original: [3569]
    - cached original: [1120]
