In [2]:
%load_ext autoreload
%autoreload 2

In [23]:
import pandas as pd
import plotly.io as pio
pio.renderers.default = "notebook"
import plotly.offline as pyo
import plotly.express as px
pyo.init_notebook_mode(connected=True)
from utils import analyze_results, plot_null_reports_analysis

In [None]:
# Load the conversion query pool and count, per advertiser, how many distinct
# query keys it owns. Duplicate (partner_id, key) pairs are collapsed first so
# each key is counted once per advertiser; count() ignores missing keys.
conversions = pd.read_csv("../data/criteo/criteo_query_pool_large_conversions.csv")
advertiser_query_counts = (
    conversions[["partner_id", "key"]]
    .drop_duplicates()
    .groupby(["partner_id"])["key"]
    .count()
    # Expose the advertiser id as a "destination" column so later cells can
    # join on it; the per-advertiser count stays in the "key" column.
    .rename_axis("destination")
    .reset_index()
)

In [72]:
# Load the experiment results and attach each advertiser's query count
# (the "key" column of advertiser_query_counts) via an inner join on "destination".
# NOTE(review): t=0.90 is presumably the target relative accuracy described in the
# markdown cell below — confirm against utils.analyze_results.
path = "ray/criteo/large/bias_varying_workload_size"
results = (
    advertiser_query_counts
    .merge(
        analyze_results(path, "bias", parallelize=False, t=0.90),
        how="inner",
        on="destination",
    )
    # Keep only rows whose requested workload size exceeds the advertiser's
    # query count by less than 5, i.e. the distance between buckets.
    .loc[lambda merged: merged["requested_workload_size"] - merged["key"] < 5]
)
results

Unnamed: 0,destination,key,workload_size,requested_workload_size,fraction_queries_without_null_reports,average_accuracy,fraction_relatively_accurate,baseline,num_days_per_epoch,initial_budget
0,319A2412BDB0EF669733053640B80112,19,10,10,1.000000,1.000000,1.000000,user_epoch_ara,7,1.0
1,319A2412BDB0EF669733053640B80112,19,19,20,0.526316,0.998798,1.000000,user_epoch_ara,7,1.0
2,319A2412BDB0EF669733053640B80112,19,1,1,1.000000,1.000000,1.000000,ipa,7,1.0
3,319A2412BDB0EF669733053640B80112,19,15,15,0.733333,0.733333,0.733333,ipa,7,1.0
4,319A2412BDB0EF669733053640B80112,19,19,20,0.894737,0.999876,1.000000,cookiemonster,7,1.0
...,...,...,...,...,...,...,...,...,...,...
175,F122B91F6D102E4630817566839A4F1F,43,43,45,0.302326,0.302326,0.302326,ipa,7,1.0
176,F122B91F6D102E4630817566839A4F1F,43,43,45,0.116279,0.998594,1.000000,cookiemonster,7,1.0
177,F122B91F6D102E4630817566839A4F1F,43,30,30,0.200000,0.998998,1.000000,user_epoch_ara,7,1.0
178,F122B91F6D102E4630817566839A4F1F,43,35,35,0.114286,0.998523,1.000000,user_epoch_ara,7,1.0


Can you also plot the following graph: x axis: advertisers (sorted in descending order
of the number of queries in their workload); y axis: fraction of queries that the
advertiser x is able to execute by the end of his experiment with a target relative
accuracy t. This t could be (say) 90% if the workload generation had provisioned the
epsilons for absolute accuracy 95% with 99% probability. You can take a sample of
advertisers, you dont have to run experiments for absolutely all of them!

In [74]:
# Order the rows by the "key" column (the per-advertiser query count merged into
# `results`) and then by baseline, both descending.
# NOTE(review): the name `sorted` shadows Python's built-in sorted() for the rest
# of the kernel session. It is kept here because the plotting cell reads it;
# consider renaming it in both cells together.
sorted = results.sort_values(["key", "baseline"], ascending=False)

In [77]:
# Draw one grouped bar chart per requested workload size: advertisers on the
# x-axis, fraction of relatively accurate queries on the y-axis, one bar per
# baseline. Rows come from the pre-sorted `sorted` frame.
workload_sizes = [1, *range(5, 50, 5)]  # 1, 5, 10, ..., 45
for workload_size in workload_sizes:
    # Restrict to the rows that were run with this requested workload size.
    content = sorted[sorted["requested_workload_size"].eq(workload_size)]
    fig = px.bar(
        data_frame=content,
        x="destination",
        y="fraction_relatively_accurate",
        color="baseline",
        barmode="group",
        title=f"Fraction of relative accuracy by destination for workload size {workload_size}",
    )
    pyo.iplot(fig)

In [79]:
# Produce the null-report analysis plots for each advertiser separately.
# Iterating the groupby directly yields (destination, rows-for-that-destination)
# pairs in sorted destination order, in a single pass over `results`. This
# replaces building a Series of one-element arrays and rescanning the whole
# frame with isin() for every advertiser.
for destination, advertiser in results.groupby("destination"):
    plot_null_reports_analysis(advertiser, save_dir="large")

In [81]:
# List the distinct advertiser ids that remain in `results` after filtering.
results["destination"].unique()

array(['319A2412BDB0EF669733053640B80112',
       '51EBAAC05E372CBCDF0F207517A225AB',
       '9D9E93D1D461D7BAE47FB67EC0E01B62',
       '9FF550C0B17A3C493378CB6E2DEEE6E4',
       'E3DDEB04F8AFF944B11943BB57D2F620',
       'F122B91F6D102E4630817566839A4F1F'], dtype=object)

In [82]:
# Inspect every result row for a single advertiser.
results[results["destination"] == '51EBAAC05E372CBCDF0F207517A225AB']

Unnamed: 0,destination,key,workload_size,requested_workload_size,fraction_queries_without_null_reports,average_accuracy,fraction_relatively_accurate,baseline,num_days_per_epoch,initial_budget
30,51EBAAC05E372CBCDF0F207517A225AB,19,10,10,1.0,1.0,1.0,user_epoch_ara,7,1.0
31,51EBAAC05E372CBCDF0F207517A225AB,19,19,20,0.631579,0.998182,1.0,user_epoch_ara,7,1.0
32,51EBAAC05E372CBCDF0F207517A225AB,19,1,1,1.0,1.0,1.0,ipa,7,1.0
33,51EBAAC05E372CBCDF0F207517A225AB,19,15,15,0.6,0.6,0.533333,ipa,7,1.0
34,51EBAAC05E372CBCDF0F207517A225AB,19,19,20,1.0,1.0,0.947368,cookiemonster,7,1.0
35,51EBAAC05E372CBCDF0F207517A225AB,19,5,5,1.0,1.0,1.0,cookiemonster,7,1.0
36,51EBAAC05E372CBCDF0F207517A225AB,19,1,1,1.0,1.0,1.0,cookiemonster,7,1.0
37,51EBAAC05E372CBCDF0F207517A225AB,19,10,10,0.7,0.7,0.7,ipa,7,1.0
38,51EBAAC05E372CBCDF0F207517A225AB,19,19,20,0.578947,0.578947,0.578947,ipa,7,1.0
39,51EBAAC05E372CBCDF0F207517A225AB,19,1,1,1.0,1.0,1.0,user_epoch_ara,7,1.0
