In [1]:
# Enable IPython's autoreload extension for this kernel session.
%load_ext autoreload
# Mode 2 re-imports modules before each cell runs, so edits to local helper
# modules are picked up without restarting the kernel.
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
import plotly.io as pio
pio.renderers.default = "notebook"
import plotly.offline as pyo
import plotly.express as px
pyo.init_notebook_mode(connected=True)
from utils import analyze_results, plot_null_reports_analysis

In [3]:
# Raw conversion records for the large Criteo query pool.
conversions = pd.read_csv("../data/criteo/criteo_query_pool_large_conversions.csv")

# Each distinct (partner_id, key) pair is counted once per partner.
unique_partner_keys = conversions[["partner_id", "key"]].drop_duplicates()
keys_per_partner = unique_partner_keys.groupby(["partner_id"])["key"].count()

# Expose the partner id as a "destination" column so the counts can be joined
# against frames keyed by "destination"; the count stays in the "key" column.
advertiser_query_counts = keys_per_partner.rename_axis("destination").reset_index()

In [4]:
# Experiment output directory handed to analyze_results.
path = "ray/criteo/large/bias_varying_workload_size"
# Threshold forwarded to analyze_results as `t` and shown in the plot titles
# as the relative-accuracy target.
t = .90

bias_results = analyze_results(path, "bias", parallelize=False, t=t)

# Attach each advertiser's distinct-query count (the "key" column of
# advertiser_query_counts); the inner join drops unmatched destinations.
results_with_counts = pd.merge(
    advertiser_query_counts,
    bias_results,
    how="inner",
    on="destination",
)

# Keep only rows whose requested workload size exceeds the advertiser's query
# count by less than 5 (the distance between workload-size buckets).
within_one_bucket = (
    results_with_counts["requested_workload_size"] - results_with_counts["key"]
) < 5
results = results_with_counts.loc[within_one_bucket]
results

Unnamed: 0,destination,key,workload_size,requested_workload_size,fraction_queries_without_null_reports,null_report_bias_average_relative_accuracy,fraction_queries_relatively_accurate_e2e,e2e_bias_average_relative_accuracy,baseline,num_days_per_epoch,initial_budget
0,319A2412BDB0EF669733053640B80112,19,10,10,1.000000,1.000000,1.000000,0.993110,user_epoch_ara,7,1.0
1,319A2412BDB0EF669733053640B80112,19,19,20,0.526316,0.998798,1.000000,0.990986,user_epoch_ara,7,1.0
2,319A2412BDB0EF669733053640B80112,19,1,1,1.000000,1.000000,1.000000,0.997205,ipa,7,1.0
3,319A2412BDB0EF669733053640B80112,19,15,15,0.733333,0.733333,0.733333,0.729802,ipa,7,1.0
4,319A2412BDB0EF669733053640B80112,19,19,20,0.894737,0.999876,1.000000,0.992018,cookiemonster,7,1.0
...,...,...,...,...,...,...,...,...,...,...,...
175,F122B91F6D102E4630817566839A4F1F,43,43,45,0.302326,0.302326,0.302326,0.300453,ipa,7,1.0
176,F122B91F6D102E4630817566839A4F1F,43,43,45,0.116279,0.998594,1.000000,0.995107,cookiemonster,7,1.0
177,F122B91F6D102E4630817566839A4F1F,43,30,30,0.200000,0.998998,1.000000,0.995768,user_epoch_ara,7,1.0
178,F122B91F6D102E4630817566839A4F1F,43,35,35,0.114286,0.998523,1.000000,0.992870,user_epoch_ara,7,1.0


Can you also plot the following graph? x-axis: advertisers (sorted in descending order
of the number of queries in their workload); y-axis: the fraction of queries that
advertiser x is able to execute by the end of their experiment with a target relative
accuracy t. This t could be, say, 90% if the workload generation had provisioned the
epsilons for an absolute accuracy of 95% with 99% probability. You can take a sample of
advertisers; you don't have to run experiments for absolutely all of them!

In [5]:
# Order rows by advertiser query count ("key"), then by baseline, both descending.
# NOTE(review): this name shadows Python's built-in `sorted` for the rest of the
# session. It is kept because the plotting cell reads `sorted`; rename both
# cells together.
sorted = results.sort_values(["key", "baseline"], ascending=[False, False])

In [14]:
# Requested workload sizes to plot; each size with data gets two figures.
workload_sizes = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45]

# write_image does not create missing directories, so make sure the target
# folder exists before the first figure is saved.
output_dir = Path("./large")
output_dir.mkdir(parents=True, exist_ok=True)


def _grouped_bar(frame, metric, title):
    """Return a bar chart of `metric` per destination, with bars grouped by baseline."""
    return px.bar(
        frame,
        x="destination",
        y=metric,
        color="baseline",
        barmode="group",
        title=title,
    )


for workload_size in workload_sizes:
    # `sorted` is the DataFrame built in an earlier cell, not the Python builtin.
    content = sorted.loc[sorted.requested_workload_size == workload_size]
    if content.empty:
        # No rows for this size: skip rather than render and save blank charts.
        continue

    fig1 = _grouped_bar(
        content,
        "fraction_queries_relatively_accurate_e2e",
        # `:g` keeps float artifacts such as 7.000000000000001 out of the title.
        f"Frac. queries with >= {t * 100:g}% rel. accuracy by destination (workload size {workload_size})",
    )
    fig2 = _grouped_bar(
        content,
        "e2e_bias_average_relative_accuracy",
        f"Avg. rel. accuracy across queries by destination (workload size {workload_size})",
    )

    pyo.iplot(fig1)
    pyo.iplot(fig2)
    fig1.write_image(str(output_dir / f"e2e_bias_fraction_relative_accuracy_ws_{workload_size}.png"))
    fig2.write_image(str(output_dir / f"e2e_bias_average_relative_accuracy_ws_{workload_size}.png"))

In [7]:
# One null-report analysis per advertiser. Iterating the groupby yields each
# destination's rows directly (in sorted destination order, with the original
# index and row order), so there is no need to re-scan `results` with `isin`
# for every advertiser.
for _destination, advertiser in results.groupby("destination"):
    plot_null_reports_analysis(advertiser, save_dir="large")