In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
import plotly.io as pio
pio.renderers.default = "notebook"
import plotly.offline as pyo
import plotly.express as px
pyo.init_notebook_mode(connected=True)
from utils import analyze_results, plot_null_reports_analysis

In [7]:
# conversions = pd.read_csv("../data/criteo/archive/criteo_query_pool_large_conversions.csv")
# advertiser_query_counts = (conversions[["partner_id", "key"]]
#     .drop_duplicates()
#     .groupby(["partner_id"])
#     .key
#     .count()
#     .rename_axis("destination"))
# advertiser_query_counts = advertiser_query_counts.reset_index()

In [5]:
# Load the analysed results for the varying-workload-size bias experiment.
# `t` is forwarded to analyze_results and reused in a later bar-chart title as
# the relative-accuracy threshold.
t = 0.90
path = "ray/criteo/large/bias_varying_workload_size"
results = analyze_results(
    path,
    "bias",
    parallelize=False,
    t=t,
)


In [None]:
# results = pd.merge(advertiser_query_counts, results, how="inner", on="destination")
# results = results.loc[(results.requested_workload_size - results.key < 5)] # distance between buckets
# results

In [6]:
# Flatten `results` into `p`: one record per paired entry of a row's
# `e2e_bias_relative_accuracies` / `null_report_bias_relative_accuracies` lists,
# tagged with the workload size, advertiser and baseline it belongs to.
requested_workload_sizes = results.requested_workload_size.unique()
advertisers = results.destination.unique()
baselines = results.baseline.unique()

# Declared up front so `p` always has every column, even if no record is
# produced (an empty list alone would yield a frame without columns).
record_columns = [
    "e2e_bias_accuracy",
    "null_report_bias_accuracy",
    "requested_workload_size",
    "advertiser",
    "baseline",
]

records = []
for requested_workload_size in requested_workload_sizes:
    # Narrow the frame one key at a time rather than re-scanning all of
    # `results` for every (size, advertiser, baseline) combination. The
    # iteration order, and therefore the row order of `p`, is unchanged.
    size_rows = results[results.requested_workload_size == requested_workload_size]
    for advertiser in advertisers:
        advertiser_rows = size_rows[size_rows.destination == advertiser]
        for baseline in baselines:
            section = advertiser_rows[advertiser_rows.baseline == baseline]
            for _, row in section.iterrows():
                # zip pairs the two lists element-wise and stops at the shorter one.
                paired_accuracies = zip(
                    row.e2e_bias_relative_accuracies,
                    row.null_report_bias_relative_accuracies,
                )
                for e2e_accuracy, null_report_accuracy in paired_accuracies:
                    records.append({
                        "e2e_bias_accuracy": e2e_accuracy,
                        "null_report_bias_accuracy": null_report_accuracy,
                        "requested_workload_size": requested_workload_size,
                        "advertiser": advertiser,
                        "baseline": baseline,
                    })
p = pd.DataFrame(records, columns=record_columns)

In [7]:
# Show the flattened accuracy records for workload size 30.
p.loc[p["requested_workload_size"] == 30]

Unnamed: 0,e2e_bias_accuracy,null_report_bias_accuracy,requested_workload_size,advertiser,baseline
900,0.997402,1.0,30,319A2412BDB0EF669733053640B80112,cookiemonster
901,0.961249,1.0,30,319A2412BDB0EF669733053640B80112,cookiemonster
902,0.987343,1.0,30,319A2412BDB0EF669733053640B80112,cookiemonster
903,0.995870,1.0,30,319A2412BDB0EF669733053640B80112,cookiemonster
904,0.999234,1.0,30,319A2412BDB0EF669733053640B80112,cookiemonster
...,...,...,...,...,...
1435,0.000000,0.0,30,F122B91F6D102E4630817566839A4F1F,ipa
1436,0.000000,0.0,30,F122B91F6D102E4630817566839A4F1F,ipa
1437,0.000000,0.0,30,F122B91F6D102E4630817566839A4F1F,ipa
1438,0.000000,0.0,30,F122B91F6D102E4630817566839A4F1F,ipa


In [8]:
# Empirical CDF of the end-to-end bias relative accuracy at a single workload
# size, drawn once with a log-scaled accuracy axis ("zoomed") and once with a
# linear axis. Each figure is shown inline and saved as a PNG.
cdf_workload_size = 30
cdf_output_dir = Path("./large")
# Make sure the target directory exists before any figure is written.
cdf_output_dir.mkdir(parents=True, exist_ok=True)

# The filtered frame does not depend on the axis scale, so build it once.
cdf_rows = p[p.requested_workload_size == cdf_workload_size]

for log_y in [True, False]:
    title = (
        f"Zoomed in CDF for relative accuracy (workload size {cdf_workload_size})"
        if log_y else
        f"CDF for relative accuracy (workload size {cdf_workload_size})"
    )
    filename = (
        f"cdf_zoomed_relative_accuracy_ws_{cdf_workload_size}"
        if log_y else
        f"cdf_relative_accuracy_ws_{cdf_workload_size}"
    )
    figcdf = px.ecdf(
        cdf_rows,
        y="e2e_bias_accuracy",
        orientation='h',
        color="baseline",
        log_y=log_y,
    )
    figcdf.update_layout(
        title=title,
        xaxis_title="proportion of queries",
        yaxis_title="relative accuracy"
    )
    figcdf.show()
    figcdf.write_image(str(cdf_output_dir / f"{filename}.png"))

In [9]:
# Empirical CDF of the null-report bias relative accuracy at a single workload
# size, drawn once with a log-scaled accuracy axis ("zoomed") and once with a
# linear axis. Each figure is shown inline and saved as a PNG.
cdf_workload_size = 30
cdf_output_dir = Path("./large")
# Make sure the target directory exists before any figure is written.
cdf_output_dir.mkdir(parents=True, exist_ok=True)

# The filtered frame does not depend on the axis scale, so build it once.
cdf_rows = p[p.requested_workload_size == cdf_workload_size]

for log_y in [True, False]:
    # Both titles name the same metric ("null report bias accuracy") so the
    # two figures are recognisably a pair.
    title = (
        f"Zoomed in CDF for null report bias accuracy (workload size {cdf_workload_size})"
        if log_y else
        f"CDF for null report bias accuracy (workload size {cdf_workload_size})"
    )
    filename = (
        f"cdf_zoomed_null_report_bias_relative_accuracy_ws_{cdf_workload_size}"
        if log_y else
        f"cdf_null_report_bias_relative_accuracy_ws_{cdf_workload_size}"
    )
    figcdf = px.ecdf(
        cdf_rows,
        y="null_report_bias_accuracy",
        orientation='h',
        color="baseline",
        log_y=log_y,
    )
    figcdf.update_layout(
        title=title,
        xaxis_title="proportion of queries",
        yaxis_title="relative accuracy"
    )
    figcdf.show()
    figcdf.write_image(str(cdf_output_dir / f"{filename}.png"))

In [49]:
# for requested_workload_size in m.requested_workload_size.unique():
#     chunk = m.loc[(m.requested_workload_size == requested_workload_size)]
#     fig = px.line(
#         chunk,
#         x="proportion_of_queries",
#         y="relative_accuracy",
#         range_y=[0, 1],
#         range_x=[0, 1],
#         color="baseline",
#         markers=True,
#         title=f"average relative accuracy per proportion of queries by baseline for workload size {requested_workload_size}"

#     )
#     pyo.iplot(fig)

Can you also plot the following graph: x axis: advertisers (sorted in descending order
of the number of queries in their workload); y axis: the fraction of queries that
advertiser x is able to execute by the end of their experiment with a target relative
accuracy t. This t could be (say) 90% if the workload generation had provisioned the
epsilons for absolute accuracy 95% with 99% probability. You can take a sample of
advertisers; you don't have to run experiments for absolutely all of them!

In [17]:
# Order rows by `key`, then `baseline`, both descending.
# NOTE(review): the only place this notebook visibly produces a `key` column is
# the commented-out advertiser_query_counts merge (per-advertiser query count).
# Confirm analyze_results also returns `key`; otherwise this raises KeyError on
# a fresh kernel.
sorted_df = results.sort_values(
    by=["key", "baseline"],
    ascending=False,
)

In [15]:
# Per-destination grouped bar charts, one pair per workload size:
#   fig1 - fraction of queries meeting the relative-accuracy threshold `t`
#   fig2 - average relative accuracy across queries
# Each figure is shown inline and saved as a PNG.
workload_sizes = [5, 10, 15, 20, 25, 30]
bar_output_dir = Path("./large")
# Make sure the target directory exists before any figure is written.
bar_output_dir.mkdir(parents=True, exist_ok=True)

for requested_workload_size in workload_sizes:
    content = sorted_df.loc[sorted_df.requested_workload_size == requested_workload_size]
    fig1 = px.bar(
        content,
        x="destination",
        y="fraction_queries_relatively_accurate_e2e",
        color="baseline",
        barmode="group",
        # `:g` prints the threshold without a trailing ".0" or float noise.
        title=f"Frac. queries with >= {t * 100:g}% rel. accuracy by destination (workload size {requested_workload_size})"
    )
    fig2 = px.bar(
        content,
        x="destination",
        y="e2e_bias_average_relative_accuracy",
        color="baseline",
        barmode="group",
        title=f"Avg. rel. accuracy across queries by destination (workload size {requested_workload_size})"
    )
    # Use the same display call as the CDF cells (default renderer from the import cell).
    fig1.show()
    fig2.show()
    fig1.write_image(str(bar_output_dir / f"e2e_bias_fraction_relative_accuracy_ws_{requested_workload_size}.png"))
    fig2.write_image(str(bar_output_dir / f"e2e_bias_average_relative_accuracy_ws_{requested_workload_size}.png"))

In [7]:
# Run the null-report analysis once per destination, passing that destination's
# rows of `results`. Iterating the groups directly yields the same per-destination
# rows, in the same (sorted-key) order, without rebuilding a mask over the whole
# frame for every destination.
for destination, advertiser in results.groupby("destination"):
    plot_null_reports_analysis(advertiser, save_dir="large")